From b2d94630738072cfc991f6ed57993c18e6e29b92 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 20 Dec 2024 11:35:19 +0000 Subject: [PATCH] 2024-12-20 nightly release (74e6e7b6dbe76ac6c8a3515349c1e1b2952a4841) --- recipes/configs/eleuther_evaluation.yaml | 4 +- recipes/configs/gemma/evaluation.yaml | 4 +- recipes/configs/generation.yaml | 4 +- recipes/configs/llama2/7B_lora_dpo.yaml | 2 +- .../llama2/7B_lora_dpo_single_device.yaml | 2 +- recipes/configs/llama2/generation_v2.yaml | 4 +- recipes/configs/llama3/8B_qat_lora.yaml | 9 ++- recipes/configs/llama3_1/8B_lora_dpo.yaml | 2 +- .../llama3_1/8B_lora_dpo_single_device.yaml | 2 +- recipes/configs/llama3_1/8B_qat_lora.yaml | 9 ++- recipes/configs/llama3_2/1B_qat_lora.yaml | 9 ++- recipes/configs/llama3_2/3B_qat_lora.yaml | 9 ++- .../8B_to_1B_KD_lora_distributed.yaml | 2 +- .../8B_to_1B_KD_lora_single_device.yaml | 2 +- .../llama3_2_vision/11B_evaluation.yaml | 4 +- .../llama3_2_vision/11B_generation_v2.yaml | 4 +- recipes/configs/mistral/evaluation.yaml | 4 +- recipes/configs/phi3/evaluation.yaml | 4 +- recipes/configs/quantization.yaml | 4 +- .../1.5_to_0.5B_KD_lora_distributed.yaml | 2 +- .../1.5_to_0.5B_KD_lora_single_device.yaml | 2 +- recipes/configs/qwen2/evaluation.yaml | 4 +- recipes/dev/7B_full_early_exit.yaml | 6 +- recipes/lora_dpo_distributed.py | 12 ++++ recipes/lora_dpo_single_device.py | 13 ++++ recipes/qat_distributed.py | 6 ++ recipes/qat_lora_finetune_distributed.py | 8 ++- .../test_ppo_full_finetune_single_device.py | 4 +- .../checkpointing/test_checkpointer.py | 71 ++++++++++++------- .../checkpointing/test_checkpointer_utils.py | 45 ++++++++++++ .../training/checkpointing/_checkpointer.py | 63 +++++++++------- torchtune/training/checkpointing/_utils.py | 31 +++++++- torchtune/training/quantization.py | 10 ++- 33 files changed, 263 insertions(+), 98 deletions(-) diff --git a/recipes/configs/eleuther_evaluation.yaml b/recipes/configs/eleuther_evaluation.yaml index e62fa0219c..8bdde28ba5 100644 --- a/recipes/configs/eleuther_evaluation.yaml +++ b/recipes/configs/eleuther_evaluation.yaml @@ -3,6 +3,8 @@ # To launch, run the following command from root torchtune directory: # tune run eleuther_eval --config eleuther_evaluation tasks=["truthfulqa_mc2","hellaswag"] +output_dir: ./ # Not needed + # Model Arguments model: _component_: torchtune.models.llama2.llama2_7b @@ -14,7 +16,7 @@ checkpointer: pytorch_model-00001-of-00002.bin, pytorch_model-00002-of-00002.bin, ] - output_dir: /tmp/Llama-2-7b-hf + output_dir: ${output_dir} model_type: LLAMA2 # Tokenizer diff --git a/recipes/configs/gemma/evaluation.yaml b/recipes/configs/gemma/evaluation.yaml index 2ff8f78546..9f4f73fb67 100644 --- a/recipes/configs/gemma/evaluation.yaml +++ b/recipes/configs/gemma/evaluation.yaml @@ -3,6 +3,8 @@ # To launch, run the following command: # tune run eleuther_eval --config gemma/evaluation +output_dir: ./ # Not needed + # Model Arguments model: _component_: torchtune.models.gemma.gemma_2b @@ -15,7 +17,7 @@ checkpointer: model-00001-of-00002.safetensors, model-00002-of-00002.safetensors, ] - output_dir: ./ # Not needed + output_dir: ${output_dir} model_type: GEMMA # Tokenizer diff --git a/recipes/configs/generation.yaml b/recipes/configs/generation.yaml index e9c5d0d4f5..c2081a1ed7 100644 --- a/recipes/configs/generation.yaml +++ b/recipes/configs/generation.yaml @@ -3,6 +3,8 @@ # To launch, run the following command from root torchtune directory: # tune run generate --config generation +output_dir: ./ # Not needed + # Model arguments model: 
_component_: torchtune.models.llama2.llama2_7b @@ -14,7 +16,7 @@ checkpointer: pytorch_model-00001-of-00002.bin, pytorch_model-00002-of-00002.bin, ] - output_dir: /tmp/Llama-2-7b-hf/ + output_dir: ${output_dir} model_type: LLAMA2 device: cuda diff --git a/recipes/configs/llama2/7B_lora_dpo.yaml b/recipes/configs/llama2/7B_lora_dpo.yaml index 250d62db44..887be92925 100644 --- a/recipes/configs/llama2/7B_lora_dpo.yaml +++ b/recipes/configs/llama2/7B_lora_dpo.yaml @@ -32,7 +32,7 @@ model: tokenizer: _component_: torchtune.models.llama2.llama2_tokenizer path: /tmp/Llama-2-7b-hf/tokenizer.model - max_seq_len: 1024 + max_seq_len: 1024 # higher increases memory checkpointer: _component_: torchtune.training.FullModelHFCheckpointer diff --git a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml index 4d154c38ce..6e0049cfd5 100644 --- a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml +++ b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml @@ -31,7 +31,7 @@ model: tokenizer: _component_: torchtune.models.llama2.llama2_tokenizer path: /tmp/Llama-2-7b-hf/tokenizer.model - max_seq_len: 1024 + max_seq_len: 1024 # higher increases memory checkpointer: _component_: torchtune.training.FullModelHFCheckpointer diff --git a/recipes/configs/llama2/generation_v2.yaml b/recipes/configs/llama2/generation_v2.yaml index 7a9222862d..da2c7f622a 100644 --- a/recipes/configs/llama2/generation_v2.yaml +++ b/recipes/configs/llama2/generation_v2.yaml @@ -6,6 +6,8 @@ # To launch, run the following command: # tune run dev/generate_v2 --config llama2/generation_v2 +output_dir: ./ # Not needed + # Model arguments model: _component_: torchtune.models.llama2.llama2_7b @@ -24,7 +26,7 @@ checkpointer: pytorch_model-00001-of-00002.bin, pytorch_model-00002-of-00002.bin ] - output_dir: ./ + output_dir: ${output_dir} model_type: LLAMA2 # Device diff --git a/recipes/configs/llama3/8B_qat_lora.yaml b/recipes/configs/llama3/8B_qat_lora.yaml index 5a889a3d63..5f88f175ec 100644 --- a/recipes/configs/llama3/8B_qat_lora.yaml +++ b/recipes/configs/llama3/8B_qat_lora.yaml @@ -83,6 +83,10 @@ dtype: bf16 enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory +# QAT arguments +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer + groupsize: 256 # Profiler (disabled) profiler: @@ -108,8 +112,3 @@ profiler: warmup_steps: 3 active_steps: 2 num_cycles: 1 - -# QAT arguments -quantizer: - _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer - groupsize: 256 diff --git a/recipes/configs/llama3_1/8B_lora_dpo.yaml b/recipes/configs/llama3_1/8B_lora_dpo.yaml index 7160362b2a..4425e7414b 100644 --- a/recipes/configs/llama3_1/8B_lora_dpo.yaml +++ b/recipes/configs/llama3_1/8B_lora_dpo.yaml @@ -32,7 +32,7 @@ model: tokenizer: _component_: torchtune.models.llama3.llama3_tokenizer path: /tmp/Meta-Llama-3.1-8B-Instruct/original/tokenizer.model - max_seq_len: null + max_seq_len: 1024 # higher increases memory checkpointer: _component_: torchtune.training.FullModelHFCheckpointer diff --git a/recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml b/recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml index 81d6158b28..236b623f7d 100644 --- a/recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml +++ b/recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml @@ -31,7 +31,7 @@ model: tokenizer: _component_: torchtune.models.llama3.llama3_tokenizer path: 
/tmp/Meta-Llama-3.1-8B-Instruct/original/tokenizer.model - max_seq_len: null + max_seq_len: 1024 # higher increases memory checkpointer: _component_: torchtune.training.FullModelHFCheckpointer diff --git a/recipes/configs/llama3_1/8B_qat_lora.yaml b/recipes/configs/llama3_1/8B_qat_lora.yaml index d25351a0e4..3d7c94744e 100644 --- a/recipes/configs/llama3_1/8B_qat_lora.yaml +++ b/recipes/configs/llama3_1/8B_qat_lora.yaml @@ -86,6 +86,10 @@ dtype: bf16 enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory +# QAT arguments +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer + groupsize: 256 # Profiler (disabled) profiler: @@ -111,8 +115,3 @@ profiler: warmup_steps: 3 active_steps: 2 num_cycles: 1 - -# QAT arguments -quantizer: - _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer - groupsize: 256 diff --git a/recipes/configs/llama3_2/1B_qat_lora.yaml b/recipes/configs/llama3_2/1B_qat_lora.yaml index 79f628367f..bffc52a4ac 100644 --- a/recipes/configs/llama3_2/1B_qat_lora.yaml +++ b/recipes/configs/llama3_2/1B_qat_lora.yaml @@ -82,6 +82,10 @@ dtype: bf16 enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory +# QAT arguments +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer + groupsize: 256 # Profiler (disabled) profiler: @@ -107,8 +111,3 @@ profiler: warmup_steps: 3 active_steps: 2 num_cycles: 1 - -# QAT arguments -quantizer: - _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer - groupsize: 256 diff --git a/recipes/configs/llama3_2/3B_qat_lora.yaml b/recipes/configs/llama3_2/3B_qat_lora.yaml index 6b69aebac2..64985de1ea 100644 --- a/recipes/configs/llama3_2/3B_qat_lora.yaml +++ b/recipes/configs/llama3_2/3B_qat_lora.yaml @@ -83,6 +83,10 @@ dtype: bf16 enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory +# QAT arguments +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer + groupsize: 256 # Profiler (disabled) profiler: @@ -108,8 +112,3 @@ profiler: warmup_steps: 3 active_steps: 2 num_cycles: 1 - -# QAT arguments -quantizer: - _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer - groupsize: 256 diff --git a/recipes/configs/llama3_2/8B_to_1B_KD_lora_distributed.yaml b/recipes/configs/llama3_2/8B_to_1B_KD_lora_distributed.yaml index 3b5d34f13f..9ab51fc0e3 100644 --- a/recipes/configs/llama3_2/8B_to_1B_KD_lora_distributed.yaml +++ b/recipes/configs/llama3_2/8B_to_1B_KD_lora_distributed.yaml @@ -59,7 +59,7 @@ teacher_checkpointer: model-00004-of-00004.safetensors ] recipe_checkpoint: null - output_dir: /tmp/Meta-Llama-3.1-8B-Instruct/ + output_dir: ${output_dir} model_type: LLAMA3 # Dataset and Sampler diff --git a/recipes/configs/llama3_2/8B_to_1B_KD_lora_single_device.yaml b/recipes/configs/llama3_2/8B_to_1B_KD_lora_single_device.yaml index 7ab0f23bc2..0a2dfea9f5 100644 --- a/recipes/configs/llama3_2/8B_to_1B_KD_lora_single_device.yaml +++ b/recipes/configs/llama3_2/8B_to_1B_KD_lora_single_device.yaml @@ -59,7 +59,7 @@ teacher_checkpointer: model-00004-of-00004.safetensors ] recipe_checkpoint: null - output_dir: /tmp/Meta-Llama-3.1-8B-Instruct/ + output_dir: ${output_dir} model_type: LLAMA3 # Dataset and Sampler diff --git a/recipes/configs/llama3_2_vision/11B_evaluation.yaml 
b/recipes/configs/llama3_2_vision/11B_evaluation.yaml index 832f5072b5..cb444ab31c 100644 --- a/recipes/configs/llama3_2_vision/11B_evaluation.yaml +++ b/recipes/configs/llama3_2_vision/11B_evaluation.yaml @@ -9,6 +9,8 @@ # To launch, run the following command from root torchtune directory: # tune run eleuther_eval --config llama3_2_vision/11B_evaluation +output_dir: ./ # Not needed + # Model arguments model: _component_: torchtune.models.llama3_2_vision.llama3_2_vision_11b @@ -26,7 +28,7 @@ checkpointer: checkpoint_files: filename_format: model-{}-of-{}.safetensors max_filename: "00005" - output_dir: ./ + output_dir: ${output_dir} model_type: LLAMA3_VISION # Environment diff --git a/recipes/configs/llama3_2_vision/11B_generation_v2.yaml b/recipes/configs/llama3_2_vision/11B_generation_v2.yaml index 7682296849..c78e0e52b6 100644 --- a/recipes/configs/llama3_2_vision/11B_generation_v2.yaml +++ b/recipes/configs/llama3_2_vision/11B_generation_v2.yaml @@ -7,6 +7,8 @@ # To launch, run the following command from root torchtune directory: # tune run dev/generate_v2 --config llama3_2_vision/generation_v2 +output_dir: ./ # Not needed + # Model arguments model: _component_: torchtune.models.llama3_2_vision.llama3_2_vision_11b @@ -25,7 +27,7 @@ checkpointer: checkpoint_files: filename_format: model-{}-of-{}.safetensors max_filename: "00005" - output_dir: ./ + output_dir: ${output_dir} model_type: LLAMA3_VISION # Device diff --git a/recipes/configs/mistral/evaluation.yaml b/recipes/configs/mistral/evaluation.yaml index 61d69dcb40..1db2334b11 100644 --- a/recipes/configs/mistral/evaluation.yaml +++ b/recipes/configs/mistral/evaluation.yaml @@ -3,6 +3,8 @@ # To launch, run the following command: # tune run eleuther_eval --config mistral/evaluation +output_dir: ./ # Not needed + # Model Arguments model: _component_: torchtune.models.mistral.mistral_7b @@ -15,7 +17,7 @@ checkpointer: pytorch_model-00001-of-00002.bin, pytorch_model-00002-of-00002.bin ] - output_dir: /tmp/Mistral-7B-v0.1/ + output_dir: ${output_dir} model_type: MISTRAL resume_from_checkpoint: False diff --git a/recipes/configs/phi3/evaluation.yaml b/recipes/configs/phi3/evaluation.yaml index ca2f1c9759..4a1d5a02a7 100644 --- a/recipes/configs/phi3/evaluation.yaml +++ b/recipes/configs/phi3/evaluation.yaml @@ -3,6 +3,8 @@ # To launch, run the following command: # tune run eleuther_eval --config phi3/evaluation +output_dir: ./ # Not needed + # Model Arguments model: _component_: torchtune.models.phi3.phi3_mini @@ -16,7 +18,7 @@ checkpointer: model-00002-of-00002.safetensors ] recipe_checkpoint: null - output_dir: /tmp/Phi-3-mini-4k-instruct + output_dir: ${output_dir} model_type: PHI3_MINI resume_from_checkpoint: False diff --git a/recipes/configs/quantization.yaml b/recipes/configs/quantization.yaml index 89df543f7d..ffb66eac83 100644 --- a/recipes/configs/quantization.yaml +++ b/recipes/configs/quantization.yaml @@ -3,6 +3,8 @@ # To launch, run the following command from root torchtune directory: # tune run quantize --config quantization +output_dir: /tmp/torchtune/llama2_7B/quantized # /tmp may be deleted by your system. Change it to your preference. 
+ # # Model arguments model: @@ -16,7 +18,7 @@ checkpointer: pytorch_model-00002-of-00002.bin, ] recipe_checkpoint: null - output_dir: /tmp/Llama-2-7b-hf + output_dir: ${output_dir} model_type: LLAMA2 device: cuda diff --git a/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_distributed.yaml b/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_distributed.yaml index faae6cd3b8..b68704cdc2 100644 --- a/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_distributed.yaml +++ b/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_distributed.yaml @@ -51,7 +51,7 @@ teacher_checkpointer: hf_model_0001_0.pt ] recipe_checkpoint: null - output_dir: /tmp/Qwen2-1.5B-Instruct-lora-finetune + output_dir: ${output_dir} model_type: QWEN2 resume_from_checkpoint: False diff --git a/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_single_device.yaml b/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_single_device.yaml index 48dbe4b117..385c1d453a 100644 --- a/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_single_device.yaml +++ b/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_single_device.yaml @@ -51,7 +51,7 @@ teacher_checkpointer: model.safetensors ] recipe_checkpoint: null - output_dir: /tmp/Qwen2-1.5B-Instruct + output_dir: ${output_dir} model_type: QWEN2 resume_from_checkpoint: False diff --git a/recipes/configs/qwen2/evaluation.yaml b/recipes/configs/qwen2/evaluation.yaml index 708d63d600..b909c30fe4 100644 --- a/recipes/configs/qwen2/evaluation.yaml +++ b/recipes/configs/qwen2/evaluation.yaml @@ -3,6 +3,8 @@ # To launch, run the following command: # tune run eleuther_eval --config qwen2/evaluation +output_dir: ./ # Not needed + # Model Arguments model: _component_: torchtune.models.qwen2.qwen2_7b @@ -17,7 +19,7 @@ checkpointer: model-00003-of-00004.safetensors, model-00004-of-00004.safetensors ] - output_dir: ./ # Not needed + output_dir: ${output_dir} model_type: QWEN2 # Tokenizer diff --git a/recipes/dev/7B_full_early_exit.yaml b/recipes/dev/7B_full_early_exit.yaml index 7d02a34f0e..0253bf82e2 100644 --- a/recipes/dev/7B_full_early_exit.yaml +++ b/recipes/dev/7B_full_early_exit.yaml @@ -30,6 +30,7 @@ # This config works best for distributed training, hence when the model is being fine-tuned on 2+ GPUs. # +output_dir: /tmp/torchtune/llama2_7b/full_early_exit # /tmp may be deleted by your system. Change it to your preference. 
# Tokenizer tokenizer: @@ -61,7 +62,7 @@ checkpointer: pytorch_model-00002-of-00002.bin ] recipe_checkpoint: null - output_dir: /tmp/Llama-2-7b-hf + output_dir: ${output_dir} model_type: LLAMA2 resume_from_checkpoint: False @@ -92,8 +93,7 @@ dtype: bf16 # Logging metric_logger: _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/topv2-llama2-finetune + log_dir: ${output_dir}/logs log_every_n_steps: 1 log_peak_memory_stats: True diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index c86d5720b2..ae0eb2f0ab 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -26,6 +26,7 @@ DoRALinear, get_adapter_params, get_adapter_state_dict, + get_lora_module_names, get_merged_lora_ckpt, LoRALinear, set_trainable_params, @@ -595,6 +596,17 @@ def save_checkpoint( } ) + adapter_config = { + "r": self._lora_rank, + "lora_alpha": self._lora_alpha, + "target_modules": get_lora_module_names( + self._lora_attn_modules, + self._apply_lora_to_mlp, + self._apply_lora_to_output, + ), + "peft_type": "LORA", + } + checkpoint_dict.update({training.ADAPTER_CONFIG: adapter_config}) self._checkpointer.save_checkpoint( checkpoint_dict, epoch=epoch, diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index bee78ad0d3..9b5dc6fb1a 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -24,6 +24,7 @@ disable_adapter, get_adapter_params, get_adapter_state_dict, + get_lora_module_names, get_merged_lora_ckpt, set_trainable_params, validate_missing_and_unexpected_for_lora, @@ -448,6 +449,18 @@ def save_checkpoint(self, epoch: int) -> None: ckpt_dict.update({training.MODEL_KEY: merged_state_dict}) + adapter_config = { + "r": self._lora_rank, + "lora_alpha": self._lora_alpha, + "target_modules": get_lora_module_names( + self._lora_attn_modules, + self._apply_lora_to_mlp, + self._apply_lora_to_output, + ), + "peft_type": "LORA", + } + ckpt_dict.update({training.ADAPTER_CONFIG: adapter_config}) + self._checkpointer.save_checkpoint( ckpt_dict, epoch=epoch, diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index 6c79a6cefa..efb9c4c2b5 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -118,6 +118,7 @@ class QATRecipeDistributed(FTRecipeInterface): Raises: ValueError: If ``dtype`` is set to fp16. + ValueError: If ``compile`` is set to True. RuntimeError: If ``dtype`` is set to bf16 and the hardware does not support bf16. RuntimeError: If ``left_pad_sequence`` is set as the data collator. RuntimeError: If ``enable_activation_offloading`` is True and device is not CUDA. @@ -133,6 +134,11 @@ def __init__(self, cfg: DictConfig) -> None: "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead." ) + if cfg.get("compile", False): + raise ValueError( + "Compile is not yet supported for QAT. Please set compile=False." + ) + # logging attributes self._output_dir = cfg.output_dir self._log_every_n_steps = cfg.get("log_every_n_steps", 1) diff --git a/recipes/qat_lora_finetune_distributed.py b/recipes/qat_lora_finetune_distributed.py index d047d77d41..57e4a09108 100644 --- a/recipes/qat_lora_finetune_distributed.py +++ b/recipes/qat_lora_finetune_distributed.py @@ -126,7 +126,8 @@ class QATLoRAFinetuneRecipeDistributed(FTRecipeInterface): Raises: ValueError: If ``dtype`` is set to fp16. - ValueError: If world_size is 1 + ValueError: If world_size is 1. + ValueError: If ``compile`` is set to True. 
RuntimeError: If ``dtype`` is set to bf16 and the hardware does not support bf16. RuntimeError: If ``left_pad_sequence`` is set as the data collator. RuntimeError: If ``enable_activation_offloading`` is True and device is not CUDA. @@ -149,6 +150,11 @@ def __init__(self, cfg: DictConfig) -> None: "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead." ) + if cfg.get("compile", False): + raise ValueError( + "Compile is not yet supported for QAT. Please set compile=False." + ) + _, rank = utils.get_world_size_and_rank() # _is_rank_zero is used primarily for logging. In the future, the logger diff --git a/tests/recipes/test_ppo_full_finetune_single_device.py b/tests/recipes/test_ppo_full_finetune_single_device.py index 36352cb0f1..412c4c06dd 100644 --- a/tests/recipes/test_ppo_full_finetune_single_device.py +++ b/tests/recipes/test_ppo_full_finetune_single_device.py @@ -358,7 +358,7 @@ def test_training_state_on_resume_with_optimizer_in_bwd(self, tmpdir, monkeypatc --config mistral/7B_full_ppo_low_memory \ output_dir={tmpdir} \ checkpointer._component_=torchtune.training.FullModelHFCheckpointer \ - checkpointer.checkpoint_dir='{policy_tmpdir}' \ + checkpointer.checkpoint_dir='{ckpt_dir}' \ checkpointer.checkpoint_files=[{os.path.join(epoch_folder_minus_one, model_ckpt_fname)}]\ checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")}\ checkpointer.output_dir={policy_tmpdir} \ @@ -367,7 +367,7 @@ def test_training_state_on_resume_with_optimizer_in_bwd(self, tmpdir, monkeypatc ref_policy_checkpointer.checkpoint_dir='{ckpt_dir}' \ ref_policy_checkpointer.checkpoint_files=[{policy_ckpt_path}]\ - value_checkpointer.checkpoint_dir='{value_tmpdir}' \ + value_checkpointer.checkpoint_dir='{ckpt_dir}' \ value_checkpointer.checkpoint_files=[{os.path.join(value_tmpdir, epoch_folder_minus_one, model_ckpt_fname)}]\ value_checkpointer.output_dir={value_tmpdir} \ diff --git a/tests/torchtune/training/checkpointing/test_checkpointer.py b/tests/torchtune/training/checkpointing/test_checkpointer.py index abaa1d6ea7..5bf885512e 100644 --- a/tests/torchtune/training/checkpointing/test_checkpointer.py +++ b/tests/torchtune/training/checkpointing/test_checkpointer.py @@ -152,8 +152,11 @@ def llama2_hf_checkpoints(self, tmp_path, state_dict_1, state_dict_2): * embed_dim: 64 * max_seq_len: 128 """ - checkpoint_file_1 = tmp_path / "llama2_hf_checkpoint_01.pt" - checkpoint_file_2 = tmp_path / "llama2_hf_checkpoint_02.pt" + checkpoint_dir = Path.joinpath(tmp_path, "checkpoint_dir") + checkpoint_dir.mkdir(parents=True, exist_ok=True) + + checkpoint_file_1 = checkpoint_dir / "llama2_hf_checkpoint_01.pt" + checkpoint_file_2 = checkpoint_dir / "llama2_hf_checkpoint_02.pt" torch.save(state_dict_1, checkpoint_file_1) torch.save(state_dict_2, checkpoint_file_2) @@ -163,7 +166,7 @@ def llama2_hf_checkpoints(self, tmp_path, state_dict_1, state_dict_2): "num_attention_heads": 4, "num_key_value_heads": 4, } - config_file = Path.joinpath(tmp_path, "config.json") + config_file = Path.joinpath(checkpoint_dir, "config.json") with config_file.open("w") as f: json.dump(config, f) @@ -174,11 +177,13 @@ def single_file_checkpointer( self, llama2_hf_checkpoints, tmp_path ) -> FullModelHFCheckpointer: checkpoint_file, _ = llama2_hf_checkpoints + checkpoint_dir = str(Path.joinpath(tmp_path, "checkpoint_dir")) + output_dir = str(Path.joinpath(tmp_path, "output_dir")) return FullModelHFCheckpointer( - checkpoint_dir=tmp_path, + checkpoint_dir=checkpoint_dir, 
checkpoint_files=[checkpoint_file], model_type="LLAMA2", - output_dir=tmp_path, + output_dir=output_dir, ) @pytest.fixture @@ -186,11 +191,13 @@ def multi_file_checkpointer( self, llama2_hf_checkpoints, tmp_path ) -> FullModelHFCheckpointer: checkpoint_file_1, checkpoint_file_2 = llama2_hf_checkpoints + checkpoint_dir = str(Path.joinpath(tmp_path, "checkpoint_dir")) + output_dir = str(Path.joinpath(tmp_path, "output_dir")) return FullModelHFCheckpointer( - checkpoint_dir=tmp_path, + checkpoint_dir=checkpoint_dir, checkpoint_files=[checkpoint_file_1, checkpoint_file_2], model_type="LLAMA2", - output_dir=tmp_path, + output_dir=output_dir, ) def test_load_save_checkpoint_single_file( @@ -242,7 +249,7 @@ def test_load_save_checkpoint_single_file( # assumes we know what the name of the file is. This is fine, breaking this logic # should be something we capture through this test output_file = Path.joinpath( - checkpoint_file.parent, + checkpoint_file.parent.parent / "output_dir", "epoch_1", SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)), ).with_suffix(".safetensors") @@ -306,12 +313,12 @@ def test_save_load_checkpoint_multiple_file( # assumes we know what the name of the file is. This is fine, breaking this logic # should be something we capture through this test output_file_1 = Path.joinpath( - checkpoint_file_1.parent, + checkpoint_file_1.parent.parent / "output_dir", "epoch_1", SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="2".zfill(5)), ).with_suffix(".safetensors") output_file_2 = Path.joinpath( - checkpoint_file_2.parent, + checkpoint_file_2.parent.parent / "output_dir", "epoch_1", SHARD_FNAME.format(cpt_idx="2".zfill(5), num_shards="2".zfill(5)), ).with_suffix(".safetensors") @@ -338,12 +345,14 @@ def test_load_save_adapter_only( single_file_checkpointer.save_checkpoint(state_dict, epoch=2, adapter_only=True) output_file_1 = Path.joinpath( - tmp_path, + tmp_path / "output_dir", "epoch_2", SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)), ) output_file_2 = Path.joinpath( - tmp_path, "epoch_2", f"{ADAPTER_MODEL_FNAME}.safetensors" + tmp_path / "output_dir", + "epoch_2", + f"{ADAPTER_MODEL_FNAME}.safetensors", ) with pytest.raises(ValueError, match="Unable to load checkpoint from"): @@ -437,12 +446,16 @@ def test_save_checkpoint_in_peft_format( # Load saved adapter weights and config from file for comparison adapter_weights_file = Path.joinpath( - checkpoint_file.parent, "epoch_1", f"{ADAPTER_MODEL_FNAME}.safetensors" + checkpoint_file.parent.parent / "output_dir", + "epoch_1", + f"{ADAPTER_MODEL_FNAME}.safetensors", ) actual_adapter_state_dict = safe_torch_load(adapter_weights_file) adapter_config_file = Path.joinpath( - checkpoint_file.parent, "epoch_1", f"{ADAPTER_CONFIG_FNAME}.json" + checkpoint_file.parent.parent / "output_dir", + "epoch_1", + f"{ADAPTER_CONFIG_FNAME}.json", ) with open(adapter_config_file, "r") as f: adapter_config = json.load(f) @@ -558,7 +571,10 @@ def mistral_reward_model_hf_checkpoint(self, tmp_path, state_dict): * intermediate_dim: 256 """ - checkpoint_file = tmp_path / "mistral_reward_model_hf_checkpoint.pt" + checkpoint_dir = Path.joinpath(tmp_path, "checkpoint_dir") + checkpoint_dir.mkdir(parents=True, exist_ok=True) + + checkpoint_file = checkpoint_dir / "mistral_reward_model_hf_checkpoint.pt" torch.save(state_dict, checkpoint_file) @@ -568,7 +584,7 @@ def mistral_reward_model_hf_checkpoint(self, tmp_path, state_dict): "num_key_value_heads": 4, "num_classes": 1, } - config_file = Path.joinpath(tmp_path, "config.json") 
+ config_file = Path.joinpath(checkpoint_dir, "config.json") with config_file.open("w") as f: json.dump(config, f) @@ -579,11 +595,13 @@ def single_file_checkpointer( self, mistral_reward_model_hf_checkpoint, tmp_path ) -> FullModelHFCheckpointer: checkpoint_file = mistral_reward_model_hf_checkpoint + checkpoint_dir = str(Path.joinpath(tmp_path, "checkpoint_dir")) + output_dir = str(Path.joinpath(tmp_path, "output_dir")) return FullModelHFCheckpointer( - checkpoint_dir=tmp_path, + checkpoint_dir=checkpoint_dir, checkpoint_files=[checkpoint_file], model_type="REWARD", - output_dir=tmp_path, + output_dir=output_dir, ) def test_load_save_checkpoint_single_file( @@ -636,7 +654,7 @@ def test_load_save_checkpoint_single_file( # assumes we know what the name of the file is. This is fine, breaking this logic # should be something we capture through this test output_file = Path.joinpath( - checkpoint_file.parent, + checkpoint_file.parent.parent / "output_dir", "epoch_1", SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)), ).with_suffix(".safetensors") @@ -708,7 +726,10 @@ def gemma_hf_checkpoint(self, tmp_path, state_dict): * head_dim : 16 """ - checkpoint_file = tmp_path / "gemma_hf_checkpoint.pt" + checkpoint_dir = Path.joinpath(tmp_path, "checkpoint_dir") + checkpoint_dir.mkdir(parents=True, exist_ok=True) + + checkpoint_file = checkpoint_dir / "gemma_hf_checkpoint.pt" torch.save(state_dict, checkpoint_file) @@ -719,7 +740,7 @@ def gemma_hf_checkpoint(self, tmp_path, state_dict): "head_dim": _HEAD_DIM, "intermediate_size": _HIDDEN_DIM, } - config_file = Path.joinpath(tmp_path, "config.json") + config_file = Path.joinpath(checkpoint_dir, "config.json") with config_file.open("w") as f: json.dump(config, f) @@ -730,11 +751,13 @@ def single_file_checkpointer( self, gemma_hf_checkpoint, tmp_path ) -> FullModelHFCheckpointer: checkpoint_file = gemma_hf_checkpoint + checkpoint_dir = str(Path.joinpath(tmp_path, "checkpoint_dir")) + output_dir = str(Path.joinpath(tmp_path, "output_dir")) return FullModelHFCheckpointer( - checkpoint_dir=tmp_path, + checkpoint_dir=checkpoint_dir, checkpoint_files=[checkpoint_file], model_type="GEMMA", - output_dir=tmp_path, + output_dir=output_dir, ) def test_load_save_checkpoint_single_file( @@ -788,7 +811,7 @@ def test_load_save_checkpoint_single_file( # assumes we know what the name of the file is. 
This is fine, breaking this logic # should be something we capture through this test output_file = Path.joinpath( - checkpoint_file.parent, + checkpoint_file.parent.parent / "output_dir", "epoch_1", SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)), ).with_suffix(".safetensors") diff --git a/tests/torchtune/training/checkpointing/test_checkpointer_utils.py b/tests/torchtune/training/checkpointing/test_checkpointer_utils.py index d73bb6fc03..86f84d9a43 100644 --- a/tests/torchtune/training/checkpointing/test_checkpointer_utils.py +++ b/tests/torchtune/training/checkpointing/test_checkpointer_utils.py @@ -11,6 +11,7 @@ import torch from torchtune.models.llama2 import llama2, llama2_classifier from torchtune.training.checkpointing._utils import ( + check_outdir_not_in_ckptdir, FormattedCheckpointFiles, safe_torch_load, update_state_dict_for_classifier, @@ -226,3 +227,47 @@ def test_build_checkpoint_filenames(self, expected_filenames): formatted_files = FormattedCheckpointFiles.from_dict(formatted_file_dict) actual_filenames = formatted_files.build_checkpoint_filenames() assert actual_filenames == expected_filenames + + +class TestCheckOutdirNotInCkptdir: + def test_sibling_directories(self): + # Sibling directories should pass without raising an error + ckpt_dir = Path("/path/to/ckpt") + out_dir = Path("/path/to/output") + check_outdir_not_in_ckptdir(ckpt_dir, out_dir) + + def test_ckpt_dir_in_output_dir(self): + # out_dir is a parent of ckpt_dir, should pass without raising an error + ckpt_dir = Path("/path/to/output/ckpt_dir") + out_dir = Path("/path/to/output") + check_outdir_not_in_ckptdir(ckpt_dir, out_dir) + + def test_equal_directories(self): + # Equal directories should raise a ValueError + ckpt_dir = Path("/path/to/ckpt") + out_dir = Path("/path/to/ckpt") + with pytest.raises( + ValueError, + match="The output directory cannot be the same as or a subdirectory of the checkpoint directory.", + ): + check_outdir_not_in_ckptdir(ckpt_dir, out_dir) + + def test_output_dir_in_ckpt_dir(self): + # out_dir is a subdirectory of ckpt_dir, should raise a ValueError + ckpt_dir = Path("/path/to/ckpt") + out_dir = Path("/path/to/ckpt/subdir") + with pytest.raises( + ValueError, + match="The output directory cannot be the same as or a subdirectory of the checkpoint directory.", + ): + check_outdir_not_in_ckptdir(ckpt_dir, out_dir) + + def test_output_dir_ckpt_dir_few_levels_down(self): + # out_dir is a few levels down in ckpt_dir, should raise a ValueError + ckpt_dir = Path("/path/to/ckpt") + out_dir = Path("/path/to/ckpt/subdir/another_subdir") + with pytest.raises( + ValueError, + match="The output directory cannot be the same as or a subdirectory of the checkpoint directory.", + ): + check_outdir_not_in_ckptdir(ckpt_dir, out_dir) diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py index a5d72af320..21a7cfb471 100644 --- a/torchtune/training/checkpointing/_checkpointer.py +++ b/torchtune/training/checkpointing/_checkpointer.py @@ -30,7 +30,7 @@ from torchtune.training.checkpointing._utils import ( ADAPTER_CONFIG_FNAME, ADAPTER_MODEL_FNAME, - BASE_MODEL_DIRNAME, + check_outdir_not_in_ckptdir, copy_files, get_adapter_checkpoint_path, get_model_checkpoint_path, @@ -163,7 +163,7 @@ def __init__( # TODO: support loading more than one file if len(checkpoint_files) != 1: raise ValueError( - "Currently we only support reading from a single torchtune checkpoint file. 
" + "Currently we only support reading from a single checkpoint file. " f"Got {len(checkpoint_files)} files instead." ) @@ -178,15 +178,10 @@ def __init__( self._model_type = ModelType[model_type] self._output_dir = Path(output_dir) - self._output_dir.mkdir(parents=True, exist_ok=True) - - # save all files in input_dir, except model weights and mapping, to output_dir - # this is useful to preserve the tokenizer, configs, license, etc. - copy_files( - self._checkpoint_dir, - Path.joinpath(self._output_dir, BASE_MODEL_DIRNAME), - ignore_suffixes=SUFFIXES_TO_NOT_COPY, + check_outdir_not_in_ckptdir( + ckpt_dir=self._checkpoint_dir, out_dir=self._output_dir ) + self._output_dir.mkdir(parents=True, exist_ok=True) # resume from adapter_model ckpt self._adapter_checkpoint = get_adapter_checkpoint_path( @@ -331,6 +326,14 @@ def save_checkpoint( "Adapter checkpoint not found in state_dict. Please ensure that the state_dict contains adapter weights." ) + # Save all files in ckpt_dir, except model weights and mapping, to output_dir/epoch_{epoch} + # So its easy to run inference with the model using this epoch's checkpoint + copy_files( + self._checkpoint_dir, + Path.joinpath(self._output_dir, f"epoch_{epoch}"), + ignore_suffixes=SUFFIXES_TO_NOT_COPY, + ) + # If the recipe state needs to be output, first remove the model state dict if intermediate_checkpoint: _ = state_dict.pop(training.MODEL_KEY, None) @@ -423,6 +426,9 @@ def __init__( self._checkpoint_dir = Path(checkpoint_dir) self._model_type = ModelType[model_type] self._output_dir = Path(output_dir) + check_outdir_not_in_ckptdir( + ckpt_dir=self._checkpoint_dir, out_dir=self._output_dir + ) self._output_dir.mkdir(parents=True, exist_ok=True) # weight_map contains the state_dict key -> checkpoint file mapping so we can correctly @@ -435,14 +441,6 @@ def __init__( Path.joinpath(self._checkpoint_dir, "config.json").read_text() ) - # save all files in input_dir, except model weights and mapping, to output_dir - # this is useful to preserve the tokenizer, configs, license, etc. - copy_files( - self._checkpoint_dir, - Path.joinpath(self._output_dir, BASE_MODEL_DIRNAME), - ignore_suffixes=SUFFIXES_TO_NOT_COPY, - ) - # repo_id is necessary for when saving an adapter config, so its compatible with HF. # This json file is produced and saved in the download step. # contents are {"repo_id": "some_model/some_model_version"} @@ -873,6 +871,14 @@ def save_checkpoint( f"saved to {output_path}" ) + # Save all files in ckpt_dir, except model weights and mapping, to output_dir/epoch_{epoch} + # So its easy to run inference with the model using this epoch's checkpoint + copy_files( + self._checkpoint_dir, + Path.joinpath(self._output_dir, f"epoch_{epoch}"), + ignore_suffixes=SUFFIXES_TO_NOT_COPY, + ) + # If the recipe state needs to be output, first remove the model state dict # and if it exists, remove the adapter state dict as well if intermediate_checkpoint: @@ -951,7 +957,7 @@ def __init__( # TODO: support loading more than one file if len(checkpoint_files) != 1: raise ValueError( - "Currently we only support reading from a single torchtune checkpoint file. " + "Currently we only support reading from a single checkpoint file. " f"Got {len(checkpoint_files)} files instead." 
) @@ -964,15 +970,10 @@ def __init__( ) self._model_type = ModelType[model_type] self._output_dir = Path(output_dir) - self._output_dir.mkdir(parents=True, exist_ok=True) - - # save all files in input_dir, except model weights and mapping, to output_dir - # this is useful to preserve the tokenizer, configs, license, etc. - copy_files( - self._checkpoint_dir, - Path.joinpath(self._output_dir, BASE_MODEL_DIRNAME), - ignore_suffixes=SUFFIXES_TO_NOT_COPY, + check_outdir_not_in_ckptdir( + ckpt_dir=self._checkpoint_dir, out_dir=self._output_dir ) + self._output_dir.mkdir(parents=True, exist_ok=True) # resume from adapter_model ckpt self._adapter_checkpoint = get_adapter_checkpoint_path( @@ -1126,6 +1127,14 @@ def save_checkpoint( "Adapter checkpoint not found in state_dict. Please ensure that the state_dict contains adapter weights." ) + # Save all files in ckpt_dir, except model weights and mapping, to output_dir/epoch_{epoch} + # So its easy to run inference with the model using this epoch's checkpoint + copy_files( + self._checkpoint_dir, + Path.joinpath(self._output_dir, f"epoch_{epoch}"), + ignore_suffixes=SUFFIXES_TO_NOT_COPY, + ) + # If the recipe state needs to be output, first remove the model state dict # and if it exists, remove the adapter state dict as well if intermediate_checkpoint: diff --git a/torchtune/training/checkpointing/_utils.py b/torchtune/training/checkpointing/_utils.py index f8dc55452b..963f1f96f3 100644 --- a/torchtune/training/checkpointing/_utils.py +++ b/torchtune/training/checkpointing/_utils.py @@ -38,7 +38,6 @@ # standardize checkpointing SHARD_FNAME = "ft-model-{cpt_idx}-of-{num_shards}" RECIPE_STATE_DIRNAME = "recipe_state" -BASE_MODEL_DIRNAME = "base_model" # Needed when setting up output dir in checkpointing REPO_ID_FNAME = "original_repo_id" @@ -334,6 +333,7 @@ def copy_files( output_dir: Union[str, Path], *, ignore_suffixes: Optional[List[str]] = None, + max_file_size_mb: int = 100, ) -> None: """ Copies files from the input directory to the output directory, preserving the directory structure. @@ -346,6 +346,7 @@ def copy_files( output_dir (Union[str, Path]): The path to the output directory where files should be copied. ignore_suffixes (Optional[List[str]]): A list of file suffixes to exclude from copying. Defaults to ['.pt', '.bin', '.safetensors'] if not provided. + max_file_size_mb (int): The maximum file size in megabytes to copy. Defaults to 100 MB. Returns: None Example: @@ -355,6 +356,7 @@ def copy_files( already exist in the destination or have the specified suffixes. """ + max_file_size = max_file_size_mb * 1024 * 1024 for root, dirs, files in os.walk(input_dir): # Filter out directories that start with '.'. E.g. ".cache/" @@ -381,6 +383,13 @@ def copy_files( src_file = os.path.join(root, file) dest_file = os.path.join(dest_dir, file) + # Check the file size + if os.path.getsize(src_file) > max_file_size: + print( + f"Skipping copying {src_file} to {output_dir} as it exceeds the size limit of {max_file_size_mb} MiB." + ) + continue + # Copy the file if it doesn't already exist in the destination if not os.path.exists(dest_file): shutil.copy2(src_file, dest_file) @@ -563,3 +572,23 @@ def validate_checkpoint_files( ) return checkpoint_paths + + +def check_outdir_not_in_ckptdir(ckpt_dir: Path, out_dir: Path) -> bool: + """ + Checks that the output directory is not equal to or a subdirectory of the checkpoint directory. + This is necessary to avoid making copies of copies when geting config files from ckpt_dir. 
+ """ + + # Resolve the absolute paths to avoid issues with relative paths + _ckpt_dir = ckpt_dir.resolve() + _out_dir = out_dir.resolve() + + # Check if out_dir is the same as ckpt_dir or a subdirectory of it + if _out_dir == _ckpt_dir or _ckpt_dir in _out_dir.parents: + raise ValueError( + "The output directory cannot be the same as or a subdirectory of the checkpoint directory. " + f"Found {ckpt_dir=} and {out_dir=}." + ) + + return True diff --git a/torchtune/training/quantization.py b/torchtune/training/quantization.py index 4e21cb4936..b158d4b9a3 100644 --- a/torchtune/training/quantization.py +++ b/torchtune/training/quantization.py @@ -130,7 +130,10 @@ def quantize(self, model): # int4 weight-only -Int4WeightOnlyQATQuantizerModuleSwap = Int4WeightOnlyQATQuantizer +class Int4WeightOnlyQATQuantizerModuleSwap(Int4WeightOnlyQATQuantizer): + pass + + disable_4w_fake_quant_module_swap = disable_4w_fake_quant enable_4w_fake_quant_module_swap = enable_4w_fake_quant _quantizer_to_mode[Int4WeightOnlyQATQuantizerModuleSwap] = "4w-qat-module-swap" @@ -142,7 +145,10 @@ def quantize(self, model): ] = enable_4w_fake_quant_module_swap # int8 dynamic activations + int4 weight -Int8DynActInt4WeightQATQuantizerModuleSwap = Int8DynActInt4WeightQATQuantizer +class Int8DynActInt4WeightQATQuantizerModuleSwap(Int8DynActInt4WeightQATQuantizer): + pass + + disable_8da4w_fake_quant_module_swap = disable_8da4w_fake_quant enable_8da4w_fake_quant_module_swap = enable_8da4w_fake_quant _quantizer_to_mode[Int8DynActInt4WeightQATQuantizerModuleSwap] = "8da4w-qat-module-swap"
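
Editor's note on the final hunk: torchtune/training/quantization.py replaces the plain aliases (e.g. Int8DynActInt4WeightQATQuantizerModuleSwap = Int8DynActInt4WeightQATQuantizer) with empty subclasses. The patch itself does not state the motivation, but a plausible reading is that _quantizer_to_mode is a dict keyed by the quantizer class: an alias is the very same class object, so registering the "-module-swap" mode under it would reuse (and overwrite) the base quantizer's key, whereas a distinct subclass gets its own entry. The sketch below demonstrates only that dictionary behaviour, using hypothetical BaseQuantizer/SubclassQuantizer stand-ins rather than the torchao classes from the patch.

# Minimal, standalone sketch (not torchtune code) of why an empty subclass,
# unlike a plain alias, gets its own entry in a class-keyed registry.
# BaseQuantizer, AliasQuantizer, SubclassQuantizer and the mode strings are
# hypothetical stand-ins.

class BaseQuantizer:
    pass

AliasQuantizer = BaseQuantizer            # same class object -> same dict key

class SubclassQuantizer(BaseQuantizer):   # distinct class object -> distinct dict key
    pass

registry = {BaseQuantizer: "base-mode"}
registry[AliasQuantizer] = "module-swap-mode"
print(registry[BaseQuantizer])            # module-swap-mode: the base entry was overwritten

registry = {BaseQuantizer: "base-mode"}
registry[SubclassQuantizer] = "module-swap-mode"
print(registry[BaseQuantizer], registry[SubclassQuantizer])  # base-mode module-swap-mode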
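
Editor's note on the config hunks: many of the YAML changes above hoist output_dir to the top of each file and point nested fields such as checkpointer.output_dir at ${output_dir}. The recipes in this patch consume these files as omegaconf DictConfig objects, and ${output_dir} is OmegaConf's interpolation syntax, so the nested field resolves to whatever the top-level value is, including command-line overrides like output_dir=... . The snippet below is a standalone OmegaConf sketch over a trimmed-down, made-up config, not torchtune's own config loading.

from omegaconf import OmegaConf

# Trimmed-down stand-in for one of the patched configs; only the keys needed
# to show the ${output_dir} interpolation are included.
cfg = OmegaConf.create(
    """
    output_dir: /tmp/torchtune/llama2_7B/quantized
    checkpointer:
      output_dir: ${output_dir}
    """
)

print(cfg.checkpointer.output_dir)  # /tmp/torchtune/llama2_7B/quantized

# Overriding the top-level value (as a "tune run ... output_dir=..." override does)
# is reflected wherever it is interpolated.
cfg.output_dir = "/data/my_runs/quantized"
print(cfg.checkpointer.output_dir)  # /data/my_runs/quantized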