From b2d94630738072cfc991f6ed57993c18e6e29b92 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Fri, 20 Dec 2024 11:35:19 +0000 Subject: [PATCH] 2024-12-20 nightly release (74e6e7b6dbe76ac6c8a3515349c1e1b2952a4841) --- recipes/configs/eleuther_evaluation.yaml | 4 +- recipes/configs/gemma/evaluation.yaml | 4 +- recipes/configs/generation.yaml | 4 +- recipes/configs/llama2/7B_lora_dpo.yaml | 2 +- .../llama2/7B_lora_dpo_single_device.yaml | 2 +- recipes/configs/llama2/generation_v2.yaml | 4 +- recipes/configs/llama3/8B_qat_lora.yaml | 9 ++- recipes/configs/llama3_1/8B_lora_dpo.yaml | 2 +- .../llama3_1/8B_lora_dpo_single_device.yaml | 2 +- recipes/configs/llama3_1/8B_qat_lora.yaml | 9 ++- recipes/configs/llama3_2/1B_qat_lora.yaml | 9 ++- recipes/configs/llama3_2/3B_qat_lora.yaml | 9 ++- .../8B_to_1B_KD_lora_distributed.yaml | 2 +- .../8B_to_1B_KD_lora_single_device.yaml | 2 +- .../llama3_2_vision/11B_evaluation.yaml | 4 +- .../llama3_2_vision/11B_generation_v2.yaml | 4 +- recipes/configs/mistral/evaluation.yaml | 4 +- recipes/configs/phi3/evaluation.yaml | 4 +- recipes/configs/quantization.yaml | 4 +- .../1.5_to_0.5B_KD_lora_distributed.yaml | 2 +- .../1.5_to_0.5B_KD_lora_single_device.yaml | 2 +- recipes/configs/qwen2/evaluation.yaml | 4 +- recipes/dev/7B_full_early_exit.yaml | 6 +- recipes/lora_dpo_distributed.py | 12 ++++ recipes/lora_dpo_single_device.py | 13 ++++ recipes/qat_distributed.py | 6 ++ recipes/qat_lora_finetune_distributed.py | 8 ++- .../test_ppo_full_finetune_single_device.py | 4 +- .../checkpointing/test_checkpointer.py | 71 ++++++++++++------- .../checkpointing/test_checkpointer_utils.py | 45 ++++++++++++ .../training/checkpointing/_checkpointer.py | 63 +++++++++------- torchtune/training/checkpointing/_utils.py | 31 +++++++- torchtune/training/quantization.py | 10 ++- 33 files changed, 263 insertions(+), 98 deletions(-) diff --git a/recipes/configs/eleuther_evaluation.yaml b/recipes/configs/eleuther_evaluation.yaml index e62fa0219c..8bdde28ba5 100644 --- a/recipes/configs/eleuther_evaluation.yaml +++ b/recipes/configs/eleuther_evaluation.yaml @@ -3,6 +3,8 @@ # To launch, run the following command from root torchtune directory: # tune run eleuther_eval --config eleuther_evaluation tasks=["truthfulqa_mc2","hellaswag"] +output_dir: ./ # Not needed + # Model Arguments model: _component_: torchtune.models.llama2.llama2_7b @@ -14,7 +16,7 @@ checkpointer: pytorch_model-00001-of-00002.bin, pytorch_model-00002-of-00002.bin, ] - output_dir: /tmp/Llama-2-7b-hf + output_dir: ${output_dir} model_type: LLAMA2 # Tokenizer diff --git a/recipes/configs/gemma/evaluation.yaml b/recipes/configs/gemma/evaluation.yaml index 2ff8f78546..9f4f73fb67 100644 --- a/recipes/configs/gemma/evaluation.yaml +++ b/recipes/configs/gemma/evaluation.yaml @@ -3,6 +3,8 @@ # To launch, run the following command: # tune run eleuther_eval --config gemma/evaluation +output_dir: ./ # Not needed + # Model Arguments model: _component_: torchtune.models.gemma.gemma_2b @@ -15,7 +17,7 @@ checkpointer: model-00001-of-00002.safetensors, model-00002-of-00002.safetensors, ] - output_dir: ./ # Not needed + output_dir: ${output_dir} model_type: GEMMA # Tokenizer diff --git a/recipes/configs/generation.yaml b/recipes/configs/generation.yaml index e9c5d0d4f5..c2081a1ed7 100644 --- a/recipes/configs/generation.yaml +++ b/recipes/configs/generation.yaml @@ -3,6 +3,8 @@ # To launch, run the following command from root torchtune directory: # tune run generate --config generation +output_dir: ./ # Not needed + # Model arguments model: 
_component_: torchtune.models.llama2.llama2_7b @@ -14,7 +16,7 @@ checkpointer: pytorch_model-00001-of-00002.bin, pytorch_model-00002-of-00002.bin, ] - output_dir: /tmp/Llama-2-7b-hf/ + output_dir: ${output_dir} model_type: LLAMA2 device: cuda diff --git a/recipes/configs/llama2/7B_lora_dpo.yaml b/recipes/configs/llama2/7B_lora_dpo.yaml index 250d62db44..887be92925 100644 --- a/recipes/configs/llama2/7B_lora_dpo.yaml +++ b/recipes/configs/llama2/7B_lora_dpo.yaml @@ -32,7 +32,7 @@ model: tokenizer: _component_: torchtune.models.llama2.llama2_tokenizer path: /tmp/Llama-2-7b-hf/tokenizer.model - max_seq_len: 1024 + max_seq_len: 1024 # higher increases memory checkpointer: _component_: torchtune.training.FullModelHFCheckpointer diff --git a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml index 4d154c38ce..6e0049cfd5 100644 --- a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml +++ b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml @@ -31,7 +31,7 @@ model: tokenizer: _component_: torchtune.models.llama2.llama2_tokenizer path: /tmp/Llama-2-7b-hf/tokenizer.model - max_seq_len: 1024 + max_seq_len: 1024 # higher increases memory checkpointer: _component_: torchtune.training.FullModelHFCheckpointer diff --git a/recipes/configs/llama2/generation_v2.yaml b/recipes/configs/llama2/generation_v2.yaml index 7a9222862d..da2c7f622a 100644 --- a/recipes/configs/llama2/generation_v2.yaml +++ b/recipes/configs/llama2/generation_v2.yaml @@ -6,6 +6,8 @@ # To launch, run the following command: # tune run dev/generate_v2 --config llama2/generation_v2 +output_dir: ./ # Not needed + # Model arguments model: _component_: torchtune.models.llama2.llama2_7b @@ -24,7 +26,7 @@ checkpointer: pytorch_model-00001-of-00002.bin, pytorch_model-00002-of-00002.bin ] - output_dir: ./ + output_dir: ${output_dir} model_type: LLAMA2 # Device diff --git a/recipes/configs/llama3/8B_qat_lora.yaml b/recipes/configs/llama3/8B_qat_lora.yaml index 5a889a3d63..5f88f175ec 100644 --- a/recipes/configs/llama3/8B_qat_lora.yaml +++ b/recipes/configs/llama3/8B_qat_lora.yaml @@ -83,6 +83,10 @@ dtype: bf16 enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory +# QAT arguments +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer + groupsize: 256 # Profiler (disabled) profiler: @@ -108,8 +112,3 @@ profiler: warmup_steps: 3 active_steps: 2 num_cycles: 1 - -# QAT arguments -quantizer: - _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer - groupsize: 256 diff --git a/recipes/configs/llama3_1/8B_lora_dpo.yaml b/recipes/configs/llama3_1/8B_lora_dpo.yaml index 7160362b2a..4425e7414b 100644 --- a/recipes/configs/llama3_1/8B_lora_dpo.yaml +++ b/recipes/configs/llama3_1/8B_lora_dpo.yaml @@ -32,7 +32,7 @@ model: tokenizer: _component_: torchtune.models.llama3.llama3_tokenizer path: /tmp/Meta-Llama-3.1-8B-Instruct/original/tokenizer.model - max_seq_len: null + max_seq_len: 1024 # higher increases memory checkpointer: _component_: torchtune.training.FullModelHFCheckpointer diff --git a/recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml b/recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml index 81d6158b28..236b623f7d 100644 --- a/recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml +++ b/recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml @@ -31,7 +31,7 @@ model: tokenizer: _component_: torchtune.models.llama3.llama3_tokenizer path: 
/tmp/Meta-Llama-3.1-8B-Instruct/original/tokenizer.model - max_seq_len: null + max_seq_len: 1024 # higher increases memory checkpointer: _component_: torchtune.training.FullModelHFCheckpointer diff --git a/recipes/configs/llama3_1/8B_qat_lora.yaml b/recipes/configs/llama3_1/8B_qat_lora.yaml index d25351a0e4..3d7c94744e 100644 --- a/recipes/configs/llama3_1/8B_qat_lora.yaml +++ b/recipes/configs/llama3_1/8B_qat_lora.yaml @@ -86,6 +86,10 @@ dtype: bf16 enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory +# QAT arguments +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer + groupsize: 256 # Profiler (disabled) profiler: @@ -111,8 +115,3 @@ profiler: warmup_steps: 3 active_steps: 2 num_cycles: 1 - -# QAT arguments -quantizer: - _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer - groupsize: 256 diff --git a/recipes/configs/llama3_2/1B_qat_lora.yaml b/recipes/configs/llama3_2/1B_qat_lora.yaml index 79f628367f..bffc52a4ac 100644 --- a/recipes/configs/llama3_2/1B_qat_lora.yaml +++ b/recipes/configs/llama3_2/1B_qat_lora.yaml @@ -82,6 +82,10 @@ dtype: bf16 enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory +# QAT arguments +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer + groupsize: 256 # Profiler (disabled) profiler: @@ -107,8 +111,3 @@ profiler: warmup_steps: 3 active_steps: 2 num_cycles: 1 - -# QAT arguments -quantizer: - _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer - groupsize: 256 diff --git a/recipes/configs/llama3_2/3B_qat_lora.yaml b/recipes/configs/llama3_2/3B_qat_lora.yaml index 6b69aebac2..64985de1ea 100644 --- a/recipes/configs/llama3_2/3B_qat_lora.yaml +++ b/recipes/configs/llama3_2/3B_qat_lora.yaml @@ -83,6 +83,10 @@ dtype: bf16 enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory +# QAT arguments +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer + groupsize: 256 # Profiler (disabled) profiler: @@ -108,8 +112,3 @@ profiler: warmup_steps: 3 active_steps: 2 num_cycles: 1 - -# QAT arguments -quantizer: - _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer - groupsize: 256 diff --git a/recipes/configs/llama3_2/8B_to_1B_KD_lora_distributed.yaml b/recipes/configs/llama3_2/8B_to_1B_KD_lora_distributed.yaml index 3b5d34f13f..9ab51fc0e3 100644 --- a/recipes/configs/llama3_2/8B_to_1B_KD_lora_distributed.yaml +++ b/recipes/configs/llama3_2/8B_to_1B_KD_lora_distributed.yaml @@ -59,7 +59,7 @@ teacher_checkpointer: model-00004-of-00004.safetensors ] recipe_checkpoint: null - output_dir: /tmp/Meta-Llama-3.1-8B-Instruct/ + output_dir: ${output_dir} model_type: LLAMA3 # Dataset and Sampler diff --git a/recipes/configs/llama3_2/8B_to_1B_KD_lora_single_device.yaml b/recipes/configs/llama3_2/8B_to_1B_KD_lora_single_device.yaml index 7ab0f23bc2..0a2dfea9f5 100644 --- a/recipes/configs/llama3_2/8B_to_1B_KD_lora_single_device.yaml +++ b/recipes/configs/llama3_2/8B_to_1B_KD_lora_single_device.yaml @@ -59,7 +59,7 @@ teacher_checkpointer: model-00004-of-00004.safetensors ] recipe_checkpoint: null - output_dir: /tmp/Meta-Llama-3.1-8B-Instruct/ + output_dir: ${output_dir} model_type: LLAMA3 # Dataset and Sampler diff --git a/recipes/configs/llama3_2_vision/11B_evaluation.yaml 
b/recipes/configs/llama3_2_vision/11B_evaluation.yaml index 832f5072b5..cb444ab31c 100644 --- a/recipes/configs/llama3_2_vision/11B_evaluation.yaml +++ b/recipes/configs/llama3_2_vision/11B_evaluation.yaml @@ -9,6 +9,8 @@ # To launch, run the following command from root torchtune directory: # tune run eleuther_eval --config llama3_2_vision/11B_evaluation +output_dir: ./ # Not needed + # Model arguments model: _component_: torchtune.models.llama3_2_vision.llama3_2_vision_11b @@ -26,7 +28,7 @@ checkpointer: checkpoint_files: filename_format: model-{}-of-{}.safetensors max_filename: "00005" - output_dir: ./ + output_dir: ${output_dir} model_type: LLAMA3_VISION # Environment diff --git a/recipes/configs/llama3_2_vision/11B_generation_v2.yaml b/recipes/configs/llama3_2_vision/11B_generation_v2.yaml index 7682296849..c78e0e52b6 100644 --- a/recipes/configs/llama3_2_vision/11B_generation_v2.yaml +++ b/recipes/configs/llama3_2_vision/11B_generation_v2.yaml @@ -7,6 +7,8 @@ # To launch, run the following command from root torchtune directory: # tune run dev/generate_v2 --config llama3_2_vision/generation_v2 +output_dir: ./ # Not needed + # Model arguments model: _component_: torchtune.models.llama3_2_vision.llama3_2_vision_11b @@ -25,7 +27,7 @@ checkpointer: checkpoint_files: filename_format: model-{}-of-{}.safetensors max_filename: "00005" - output_dir: ./ + output_dir: ${output_dir} model_type: LLAMA3_VISION # Device diff --git a/recipes/configs/mistral/evaluation.yaml b/recipes/configs/mistral/evaluation.yaml index 61d69dcb40..1db2334b11 100644 --- a/recipes/configs/mistral/evaluation.yaml +++ b/recipes/configs/mistral/evaluation.yaml @@ -3,6 +3,8 @@ # To launch, run the following command: # tune run eleuther_eval --config mistral/evaluation +output_dir: ./ # Not needed + # Model Arguments model: _component_: torchtune.models.mistral.mistral_7b @@ -15,7 +17,7 @@ checkpointer: pytorch_model-00001-of-00002.bin, pytorch_model-00002-of-00002.bin ] - output_dir: /tmp/Mistral-7B-v0.1/ + output_dir: ${output_dir} model_type: MISTRAL resume_from_checkpoint: False diff --git a/recipes/configs/phi3/evaluation.yaml b/recipes/configs/phi3/evaluation.yaml index ca2f1c9759..4a1d5a02a7 100644 --- a/recipes/configs/phi3/evaluation.yaml +++ b/recipes/configs/phi3/evaluation.yaml @@ -3,6 +3,8 @@ # To launch, run the following command: # tune run eleuther_eval --config phi3/evaluation +output_dir: ./ # Not needed + # Model Arguments model: _component_: torchtune.models.phi3.phi3_mini @@ -16,7 +18,7 @@ checkpointer: model-00002-of-00002.safetensors ] recipe_checkpoint: null - output_dir: /tmp/Phi-3-mini-4k-instruct + output_dir: ${output_dir} model_type: PHI3_MINI resume_from_checkpoint: False diff --git a/recipes/configs/quantization.yaml b/recipes/configs/quantization.yaml index 89df543f7d..ffb66eac83 100644 --- a/recipes/configs/quantization.yaml +++ b/recipes/configs/quantization.yaml @@ -3,6 +3,8 @@ # To launch, run the following command from root torchtune directory: # tune run quantize --config quantization +output_dir: /tmp/torchtune/llama2_7B/quantized # /tmp may be deleted by your system. Change it to your preference. 
+ # # Model arguments model: @@ -16,7 +18,7 @@ checkpointer: pytorch_model-00002-of-00002.bin, ] recipe_checkpoint: null - output_dir: /tmp/Llama-2-7b-hf + output_dir: ${output_dir} model_type: LLAMA2 device: cuda diff --git a/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_distributed.yaml b/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_distributed.yaml index faae6cd3b8..b68704cdc2 100644 --- a/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_distributed.yaml +++ b/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_distributed.yaml @@ -51,7 +51,7 @@ teacher_checkpointer: hf_model_0001_0.pt ] recipe_checkpoint: null - output_dir: /tmp/Qwen2-1.5B-Instruct-lora-finetune + output_dir: ${output_dir} model_type: QWEN2 resume_from_checkpoint: False diff --git a/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_single_device.yaml b/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_single_device.yaml index 48dbe4b117..385c1d453a 100644 --- a/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_single_device.yaml +++ b/recipes/configs/qwen2/1.5_to_0.5B_KD_lora_single_device.yaml @@ -51,7 +51,7 @@ teacher_checkpointer: model.safetensors ] recipe_checkpoint: null - output_dir: /tmp/Qwen2-1.5B-Instruct + output_dir: ${output_dir} model_type: QWEN2 resume_from_checkpoint: False diff --git a/recipes/configs/qwen2/evaluation.yaml b/recipes/configs/qwen2/evaluation.yaml index 708d63d600..b909c30fe4 100644 --- a/recipes/configs/qwen2/evaluation.yaml +++ b/recipes/configs/qwen2/evaluation.yaml @@ -3,6 +3,8 @@ # To launch, run the following command: # tune run eleuther_eval --config qwen2/evaluation +output_dir: ./ # Not needed + # Model Arguments model: _component_: torchtune.models.qwen2.qwen2_7b @@ -17,7 +19,7 @@ checkpointer: model-00003-of-00004.safetensors, model-00004-of-00004.safetensors ] - output_dir: ./ # Not needed + output_dir: ${output_dir} model_type: QWEN2 # Tokenizer diff --git a/recipes/dev/7B_full_early_exit.yaml b/recipes/dev/7B_full_early_exit.yaml index 7d02a34f0e..0253bf82e2 100644 --- a/recipes/dev/7B_full_early_exit.yaml +++ b/recipes/dev/7B_full_early_exit.yaml @@ -30,6 +30,7 @@ # This config works best for distributed training, hence when the model is being fine-tuned on 2+ GPUs. # +output_dir: /tmp/torchtune/llama2_7b/full_early_exit # /tmp may be deleted by your system. Change it to your preference. 
# Tokenizer tokenizer: @@ -61,7 +62,7 @@ checkpointer: pytorch_model-00002-of-00002.bin ] recipe_checkpoint: null - output_dir: /tmp/Llama-2-7b-hf + output_dir: ${output_dir} model_type: LLAMA2 resume_from_checkpoint: False @@ -92,8 +93,7 @@ dtype: bf16 # Logging metric_logger: _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/topv2-llama2-finetune + log_dir: ${output_dir}/logs log_every_n_steps: 1 log_peak_memory_stats: True diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index c86d5720b2..ae0eb2f0ab 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -26,6 +26,7 @@ DoRALinear, get_adapter_params, get_adapter_state_dict, + get_lora_module_names, get_merged_lora_ckpt, LoRALinear, set_trainable_params, @@ -595,6 +596,17 @@ def save_checkpoint( } ) + adapter_config = { + "r": self._lora_rank, + "lora_alpha": self._lora_alpha, + "target_modules": get_lora_module_names( + self._lora_attn_modules, + self._apply_lora_to_mlp, + self._apply_lora_to_output, + ), + "peft_type": "LORA", + } + checkpoint_dict.update({training.ADAPTER_CONFIG: adapter_config}) self._checkpointer.save_checkpoint( checkpoint_dict, epoch=epoch, diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index bee78ad0d3..9b5dc6fb1a 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -24,6 +24,7 @@ disable_adapter, get_adapter_params, get_adapter_state_dict, + get_lora_module_names, get_merged_lora_ckpt, set_trainable_params, validate_missing_and_unexpected_for_lora, @@ -448,6 +449,18 @@ def save_checkpoint(self, epoch: int) -> None: ckpt_dict.update({training.MODEL_KEY: merged_state_dict}) + adapter_config = { + "r": self._lora_rank, + "lora_alpha": self._lora_alpha, + "target_modules": get_lora_module_names( + self._lora_attn_modules, + self._apply_lora_to_mlp, + self._apply_lora_to_output, + ), + "peft_type": "LORA", + } + ckpt_dict.update({training.ADAPTER_CONFIG: adapter_config}) + self._checkpointer.save_checkpoint( ckpt_dict, epoch=epoch, diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index 6c79a6cefa..efb9c4c2b5 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -118,6 +118,7 @@ class QATRecipeDistributed(FTRecipeInterface): Raises: ValueError: If ``dtype`` is set to fp16. + ValueError: If ``compile`` is set to True. RuntimeError: If ``dtype`` is set to bf16 and the hardware does not support bf16. RuntimeError: If ``left_pad_sequence`` is set as the data collator. RuntimeError: If ``enable_activation_offloading`` is True and device is not CUDA. @@ -133,6 +134,11 @@ def __init__(self, cfg: DictConfig) -> None: "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead." ) + if cfg.get("compile", False): + raise ValueError( + "Compile is not yet supported for QAT. Please set compile=False." + ) + # logging attributes self._output_dir = cfg.output_dir self._log_every_n_steps = cfg.get("log_every_n_steps", 1) diff --git a/recipes/qat_lora_finetune_distributed.py b/recipes/qat_lora_finetune_distributed.py index d047d77d41..57e4a09108 100644 --- a/recipes/qat_lora_finetune_distributed.py +++ b/recipes/qat_lora_finetune_distributed.py @@ -126,7 +126,8 @@ class QATLoRAFinetuneRecipeDistributed(FTRecipeInterface): Raises: ValueError: If ``dtype`` is set to fp16. - ValueError: If world_size is 1 + ValueError: If world_size is 1. + ValueError: If ``compile`` is set to True. 
RuntimeError: If ``dtype`` is set to bf16 and the hardware does not support bf16. RuntimeError: If ``left_pad_sequence`` is set as the data collator. RuntimeError: If ``enable_activation_offloading`` is True and device is not CUDA. @@ -149,6 +150,11 @@ def __init__(self, cfg: DictConfig) -> None: "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead." ) + if cfg.get("compile", False): + raise ValueError( + "Compile is not yet supported for QAT. Please set compile=False." + ) + _, rank = utils.get_world_size_and_rank() # _is_rank_zero is used primarily for logging. In the future, the logger diff --git a/tests/recipes/test_ppo_full_finetune_single_device.py b/tests/recipes/test_ppo_full_finetune_single_device.py index 36352cb0f1..412c4c06dd 100644 --- a/tests/recipes/test_ppo_full_finetune_single_device.py +++ b/tests/recipes/test_ppo_full_finetune_single_device.py @@ -358,7 +358,7 @@ def test_training_state_on_resume_with_optimizer_in_bwd(self, tmpdir, monkeypatc --config mistral/7B_full_ppo_low_memory \ output_dir={tmpdir} \ checkpointer._component_=torchtune.training.FullModelHFCheckpointer \ - checkpointer.checkpoint_dir='{policy_tmpdir}' \ + checkpointer.checkpoint_dir='{ckpt_dir}' \ checkpointer.checkpoint_files=[{os.path.join(epoch_folder_minus_one, model_ckpt_fname)}]\ checkpointer.recipe_checkpoint={os.path.join(RECIPE_STATE_DIRNAME, "recipe_state.pt")}\ checkpointer.output_dir={policy_tmpdir} \ @@ -367,7 +367,7 @@ def test_training_state_on_resume_with_optimizer_in_bwd(self, tmpdir, monkeypatc ref_policy_checkpointer.checkpoint_dir='{ckpt_dir}' \ ref_policy_checkpointer.checkpoint_files=[{policy_ckpt_path}]\ - value_checkpointer.checkpoint_dir='{value_tmpdir}' \ + value_checkpointer.checkpoint_dir='{ckpt_dir}' \ value_checkpointer.checkpoint_files=[{os.path.join(value_tmpdir, epoch_folder_minus_one, model_ckpt_fname)}]\ value_checkpointer.output_dir={value_tmpdir} \ diff --git a/tests/torchtune/training/checkpointing/test_checkpointer.py b/tests/torchtune/training/checkpointing/test_checkpointer.py index abaa1d6ea7..5bf885512e 100644 --- a/tests/torchtune/training/checkpointing/test_checkpointer.py +++ b/tests/torchtune/training/checkpointing/test_checkpointer.py @@ -152,8 +152,11 @@ def llama2_hf_checkpoints(self, tmp_path, state_dict_1, state_dict_2): * embed_dim: 64 * max_seq_len: 128 """ - checkpoint_file_1 = tmp_path / "llama2_hf_checkpoint_01.pt" - checkpoint_file_2 = tmp_path / "llama2_hf_checkpoint_02.pt" + checkpoint_dir = Path.joinpath(tmp_path, "checkpoint_dir") + checkpoint_dir.mkdir(parents=True, exist_ok=True) + + checkpoint_file_1 = checkpoint_dir / "llama2_hf_checkpoint_01.pt" + checkpoint_file_2 = checkpoint_dir / "llama2_hf_checkpoint_02.pt" torch.save(state_dict_1, checkpoint_file_1) torch.save(state_dict_2, checkpoint_file_2) @@ -163,7 +166,7 @@ def llama2_hf_checkpoints(self, tmp_path, state_dict_1, state_dict_2): "num_attention_heads": 4, "num_key_value_heads": 4, } - config_file = Path.joinpath(tmp_path, "config.json") + config_file = Path.joinpath(checkpoint_dir, "config.json") with config_file.open("w") as f: json.dump(config, f) @@ -174,11 +177,13 @@ def single_file_checkpointer( self, llama2_hf_checkpoints, tmp_path ) -> FullModelHFCheckpointer: checkpoint_file, _ = llama2_hf_checkpoints + checkpoint_dir = str(Path.joinpath(tmp_path, "checkpoint_dir")) + output_dir = str(Path.joinpath(tmp_path, "output_dir")) return FullModelHFCheckpointer( - checkpoint_dir=tmp_path, + checkpoint_dir=checkpoint_dir, 
checkpoint_files=[checkpoint_file], model_type="LLAMA2", - output_dir=tmp_path, + output_dir=output_dir, ) @pytest.fixture @@ -186,11 +191,13 @@ def multi_file_checkpointer( self, llama2_hf_checkpoints, tmp_path ) -> FullModelHFCheckpointer: checkpoint_file_1, checkpoint_file_2 = llama2_hf_checkpoints + checkpoint_dir = str(Path.joinpath(tmp_path, "checkpoint_dir")) + output_dir = str(Path.joinpath(tmp_path, "output_dir")) return FullModelHFCheckpointer( - checkpoint_dir=tmp_path, + checkpoint_dir=checkpoint_dir, checkpoint_files=[checkpoint_file_1, checkpoint_file_2], model_type="LLAMA2", - output_dir=tmp_path, + output_dir=output_dir, ) def test_load_save_checkpoint_single_file( @@ -242,7 +249,7 @@ def test_load_save_checkpoint_single_file( # assumes we know what the name of the file is. This is fine, breaking this logic # should be something we capture through this test output_file = Path.joinpath( - checkpoint_file.parent, + checkpoint_file.parent.parent / "output_dir", "epoch_1", SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)), ).with_suffix(".safetensors") @@ -306,12 +313,12 @@ def test_save_load_checkpoint_multiple_file( # assumes we know what the name of the file is. This is fine, breaking this logic # should be something we capture through this test output_file_1 = Path.joinpath( - checkpoint_file_1.parent, + checkpoint_file_1.parent.parent / "output_dir", "epoch_1", SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="2".zfill(5)), ).with_suffix(".safetensors") output_file_2 = Path.joinpath( - checkpoint_file_2.parent, + checkpoint_file_2.parent.parent / "output_dir", "epoch_1", SHARD_FNAME.format(cpt_idx="2".zfill(5), num_shards="2".zfill(5)), ).with_suffix(".safetensors") @@ -338,12 +345,14 @@ def test_load_save_adapter_only( single_file_checkpointer.save_checkpoint(state_dict, epoch=2, adapter_only=True) output_file_1 = Path.joinpath( - tmp_path, + tmp_path / "output_dir", "epoch_2", SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)), ) output_file_2 = Path.joinpath( - tmp_path, "epoch_2", f"{ADAPTER_MODEL_FNAME}.safetensors" + tmp_path / "output_dir", + "epoch_2", + f"{ADAPTER_MODEL_FNAME}.safetensors", ) with pytest.raises(ValueError, match="Unable to load checkpoint from"): @@ -437,12 +446,16 @@ def test_save_checkpoint_in_peft_format( # Load saved adapter weights and config from file for comparison adapter_weights_file = Path.joinpath( - checkpoint_file.parent, "epoch_1", f"{ADAPTER_MODEL_FNAME}.safetensors" + checkpoint_file.parent.parent / "output_dir", + "epoch_1", + f"{ADAPTER_MODEL_FNAME}.safetensors", ) actual_adapter_state_dict = safe_torch_load(adapter_weights_file) adapter_config_file = Path.joinpath( - checkpoint_file.parent, "epoch_1", f"{ADAPTER_CONFIG_FNAME}.json" + checkpoint_file.parent.parent / "output_dir", + "epoch_1", + f"{ADAPTER_CONFIG_FNAME}.json", ) with open(adapter_config_file, "r") as f: adapter_config = json.load(f) @@ -558,7 +571,10 @@ def mistral_reward_model_hf_checkpoint(self, tmp_path, state_dict): * intermediate_dim: 256 """ - checkpoint_file = tmp_path / "mistral_reward_model_hf_checkpoint.pt" + checkpoint_dir = Path.joinpath(tmp_path, "checkpoint_dir") + checkpoint_dir.mkdir(parents=True, exist_ok=True) + + checkpoint_file = checkpoint_dir / "mistral_reward_model_hf_checkpoint.pt" torch.save(state_dict, checkpoint_file) @@ -568,7 +584,7 @@ def mistral_reward_model_hf_checkpoint(self, tmp_path, state_dict): "num_key_value_heads": 4, "num_classes": 1, } - config_file = Path.joinpath(tmp_path, "config.json") 
+ config_file = Path.joinpath(checkpoint_dir, "config.json") with config_file.open("w") as f: json.dump(config, f) @@ -579,11 +595,13 @@ def single_file_checkpointer( self, mistral_reward_model_hf_checkpoint, tmp_path ) -> FullModelHFCheckpointer: checkpoint_file = mistral_reward_model_hf_checkpoint + checkpoint_dir = str(Path.joinpath(tmp_path, "checkpoint_dir")) + output_dir = str(Path.joinpath(tmp_path, "output_dir")) return FullModelHFCheckpointer( - checkpoint_dir=tmp_path, + checkpoint_dir=checkpoint_dir, checkpoint_files=[checkpoint_file], model_type="REWARD", - output_dir=tmp_path, + output_dir=output_dir, ) def test_load_save_checkpoint_single_file( @@ -636,7 +654,7 @@ def test_load_save_checkpoint_single_file( # assumes we know what the name of the file is. This is fine, breaking this logic # should be something we capture through this test output_file = Path.joinpath( - checkpoint_file.parent, + checkpoint_file.parent.parent / "output_dir", "epoch_1", SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)), ).with_suffix(".safetensors") @@ -708,7 +726,10 @@ def gemma_hf_checkpoint(self, tmp_path, state_dict): * head_dim : 16 """ - checkpoint_file = tmp_path / "gemma_hf_checkpoint.pt" + checkpoint_dir = Path.joinpath(tmp_path, "checkpoint_dir") + checkpoint_dir.mkdir(parents=True, exist_ok=True) + + checkpoint_file = checkpoint_dir / "gemma_hf_checkpoint.pt" torch.save(state_dict, checkpoint_file) @@ -719,7 +740,7 @@ def gemma_hf_checkpoint(self, tmp_path, state_dict): "head_dim": _HEAD_DIM, "intermediate_size": _HIDDEN_DIM, } - config_file = Path.joinpath(tmp_path, "config.json") + config_file = Path.joinpath(checkpoint_dir, "config.json") with config_file.open("w") as f: json.dump(config, f) @@ -730,11 +751,13 @@ def single_file_checkpointer( self, gemma_hf_checkpoint, tmp_path ) -> FullModelHFCheckpointer: checkpoint_file = gemma_hf_checkpoint + checkpoint_dir = str(Path.joinpath(tmp_path, "checkpoint_dir")) + output_dir = str(Path.joinpath(tmp_path, "output_dir")) return FullModelHFCheckpointer( - checkpoint_dir=tmp_path, + checkpoint_dir=checkpoint_dir, checkpoint_files=[checkpoint_file], model_type="GEMMA", - output_dir=tmp_path, + output_dir=output_dir, ) def test_load_save_checkpoint_single_file( @@ -788,7 +811,7 @@ def test_load_save_checkpoint_single_file( # assumes we know what the name of the file is. 
This is fine, breaking this logic # should be something we capture through this test output_file = Path.joinpath( - checkpoint_file.parent, + checkpoint_file.parent.parent / "output_dir", "epoch_1", SHARD_FNAME.format(cpt_idx="1".zfill(5), num_shards="1".zfill(5)), ).with_suffix(".safetensors") diff --git a/tests/torchtune/training/checkpointing/test_checkpointer_utils.py b/tests/torchtune/training/checkpointing/test_checkpointer_utils.py index d73bb6fc03..86f84d9a43 100644 --- a/tests/torchtune/training/checkpointing/test_checkpointer_utils.py +++ b/tests/torchtune/training/checkpointing/test_checkpointer_utils.py @@ -11,6 +11,7 @@ import torch from torchtune.models.llama2 import llama2, llama2_classifier from torchtune.training.checkpointing._utils import ( + check_outdir_not_in_ckptdir, FormattedCheckpointFiles, safe_torch_load, update_state_dict_for_classifier, @@ -226,3 +227,47 @@ def test_build_checkpoint_filenames(self, expected_filenames): formatted_files = FormattedCheckpointFiles.from_dict(formatted_file_dict) actual_filenames = formatted_files.build_checkpoint_filenames() assert actual_filenames == expected_filenames + + +class TestCheckOutdirNotInCkptdir: + def test_sibling_directories(self): + # Sibling directories should pass without raising an error + ckpt_dir = Path("/path/to/ckpt") + out_dir = Path("/path/to/output") + check_outdir_not_in_ckptdir(ckpt_dir, out_dir) + + def test_ckpt_dir_in_output_dir(self): + # out_dir is a parent of ckpt_dir, should pass without raising an error + ckpt_dir = Path("/path/to/output/ckpt_dir") + out_dir = Path("/path/to/output") + check_outdir_not_in_ckptdir(ckpt_dir, out_dir) + + def test_equal_directories(self): + # Equal directories should raise a ValueError + ckpt_dir = Path("/path/to/ckpt") + out_dir = Path("/path/to/ckpt") + with pytest.raises( + ValueError, + match="The output directory cannot be the same as or a subdirectory of the checkpoint directory.", + ): + check_outdir_not_in_ckptdir(ckpt_dir, out_dir) + + def test_output_dir_in_ckpt_dir(self): + # out_dir is a subdirectory of ckpt_dir, should raise a ValueError + ckpt_dir = Path("/path/to/ckpt") + out_dir = Path("/path/to/ckpt/subdir") + with pytest.raises( + ValueError, + match="The output directory cannot be the same as or a subdirectory of the checkpoint directory.", + ): + check_outdir_not_in_ckptdir(ckpt_dir, out_dir) + + def test_output_dir_ckpt_dir_few_levels_down(self): + # out_dir is a few levels down in ckpt_dir, should raise a ValueError + ckpt_dir = Path("/path/to/ckpt") + out_dir = Path("/path/to/ckpt/subdir/another_subdir") + with pytest.raises( + ValueError, + match="The output directory cannot be the same as or a subdirectory of the checkpoint directory.", + ): + check_outdir_not_in_ckptdir(ckpt_dir, out_dir) diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py index a5d72af320..21a7cfb471 100644 --- a/torchtune/training/checkpointing/_checkpointer.py +++ b/torchtune/training/checkpointing/_checkpointer.py @@ -30,7 +30,7 @@ from torchtune.training.checkpointing._utils import ( ADAPTER_CONFIG_FNAME, ADAPTER_MODEL_FNAME, - BASE_MODEL_DIRNAME, + check_outdir_not_in_ckptdir, copy_files, get_adapter_checkpoint_path, get_model_checkpoint_path, @@ -163,7 +163,7 @@ def __init__( # TODO: support loading more than one file if len(checkpoint_files) != 1: raise ValueError( - "Currently we only support reading from a single torchtune checkpoint file. 
" + "Currently we only support reading from a single checkpoint file. " f"Got {len(checkpoint_files)} files instead." ) @@ -178,15 +178,10 @@ def __init__( self._model_type = ModelType[model_type] self._output_dir = Path(output_dir) - self._output_dir.mkdir(parents=True, exist_ok=True) - - # save all files in input_dir, except model weights and mapping, to output_dir - # this is useful to preserve the tokenizer, configs, license, etc. - copy_files( - self._checkpoint_dir, - Path.joinpath(self._output_dir, BASE_MODEL_DIRNAME), - ignore_suffixes=SUFFIXES_TO_NOT_COPY, + check_outdir_not_in_ckptdir( + ckpt_dir=self._checkpoint_dir, out_dir=self._output_dir ) + self._output_dir.mkdir(parents=True, exist_ok=True) # resume from adapter_model ckpt self._adapter_checkpoint = get_adapter_checkpoint_path( @@ -331,6 +326,14 @@ def save_checkpoint( "Adapter checkpoint not found in state_dict. Please ensure that the state_dict contains adapter weights." ) + # Save all files in ckpt_dir, except model weights and mapping, to output_dir/epoch_{epoch} + # So its easy to run inference with the model using this epoch's checkpoint + copy_files( + self._checkpoint_dir, + Path.joinpath(self._output_dir, f"epoch_{epoch}"), + ignore_suffixes=SUFFIXES_TO_NOT_COPY, + ) + # If the recipe state needs to be output, first remove the model state dict if intermediate_checkpoint: _ = state_dict.pop(training.MODEL_KEY, None) @@ -423,6 +426,9 @@ def __init__( self._checkpoint_dir = Path(checkpoint_dir) self._model_type = ModelType[model_type] self._output_dir = Path(output_dir) + check_outdir_not_in_ckptdir( + ckpt_dir=self._checkpoint_dir, out_dir=self._output_dir + ) self._output_dir.mkdir(parents=True, exist_ok=True) # weight_map contains the state_dict key -> checkpoint file mapping so we can correctly @@ -435,14 +441,6 @@ def __init__( Path.joinpath(self._checkpoint_dir, "config.json").read_text() ) - # save all files in input_dir, except model weights and mapping, to output_dir - # this is useful to preserve the tokenizer, configs, license, etc. - copy_files( - self._checkpoint_dir, - Path.joinpath(self._output_dir, BASE_MODEL_DIRNAME), - ignore_suffixes=SUFFIXES_TO_NOT_COPY, - ) - # repo_id is necessary for when saving an adapter config, so its compatible with HF. # This json file is produced and saved in the download step. # contents are {"repo_id": "some_model/some_model_version"} @@ -873,6 +871,14 @@ def save_checkpoint( f"saved to {output_path}" ) + # Save all files in ckpt_dir, except model weights and mapping, to output_dir/epoch_{epoch} + # So its easy to run inference with the model using this epoch's checkpoint + copy_files( + self._checkpoint_dir, + Path.joinpath(self._output_dir, f"epoch_{epoch}"), + ignore_suffixes=SUFFIXES_TO_NOT_COPY, + ) + # If the recipe state needs to be output, first remove the model state dict # and if it exists, remove the adapter state dict as well if intermediate_checkpoint: @@ -951,7 +957,7 @@ def __init__( # TODO: support loading more than one file if len(checkpoint_files) != 1: raise ValueError( - "Currently we only support reading from a single torchtune checkpoint file. " + "Currently we only support reading from a single checkpoint file. " f"Got {len(checkpoint_files)} files instead." 
) @@ -964,15 +970,10 @@ def __init__( ) self._model_type = ModelType[model_type] self._output_dir = Path(output_dir) - self._output_dir.mkdir(parents=True, exist_ok=True) - - # save all files in input_dir, except model weights and mapping, to output_dir - # this is useful to preserve the tokenizer, configs, license, etc. - copy_files( - self._checkpoint_dir, - Path.joinpath(self._output_dir, BASE_MODEL_DIRNAME), - ignore_suffixes=SUFFIXES_TO_NOT_COPY, + check_outdir_not_in_ckptdir( + ckpt_dir=self._checkpoint_dir, out_dir=self._output_dir ) + self._output_dir.mkdir(parents=True, exist_ok=True) # resume from adapter_model ckpt self._adapter_checkpoint = get_adapter_checkpoint_path( @@ -1126,6 +1127,14 @@ def save_checkpoint( "Adapter checkpoint not found in state_dict. Please ensure that the state_dict contains adapter weights." ) + # Save all files in ckpt_dir, except model weights and mapping, to output_dir/epoch_{epoch} + # So its easy to run inference with the model using this epoch's checkpoint + copy_files( + self._checkpoint_dir, + Path.joinpath(self._output_dir, f"epoch_{epoch}"), + ignore_suffixes=SUFFIXES_TO_NOT_COPY, + ) + # If the recipe state needs to be output, first remove the model state dict # and if it exists, remove the adapter state dict as well if intermediate_checkpoint: diff --git a/torchtune/training/checkpointing/_utils.py b/torchtune/training/checkpointing/_utils.py index f8dc55452b..963f1f96f3 100644 --- a/torchtune/training/checkpointing/_utils.py +++ b/torchtune/training/checkpointing/_utils.py @@ -38,7 +38,6 @@ # standardize checkpointing SHARD_FNAME = "ft-model-{cpt_idx}-of-{num_shards}" RECIPE_STATE_DIRNAME = "recipe_state" -BASE_MODEL_DIRNAME = "base_model" # Needed when setting up output dir in checkpointing REPO_ID_FNAME = "original_repo_id" @@ -334,6 +333,7 @@ def copy_files( output_dir: Union[str, Path], *, ignore_suffixes: Optional[List[str]] = None, + max_file_size_mb: int = 100, ) -> None: """ Copies files from the input directory to the output directory, preserving the directory structure. @@ -346,6 +346,7 @@ def copy_files( output_dir (Union[str, Path]): The path to the output directory where files should be copied. ignore_suffixes (Optional[List[str]]): A list of file suffixes to exclude from copying. Defaults to ['.pt', '.bin', '.safetensors'] if not provided. + max_file_size_mb (int): The maximum file size in megabytes to copy. Defaults to 100 MB. Returns: None Example: @@ -355,6 +356,7 @@ def copy_files( already exist in the destination or have the specified suffixes. """ + max_file_size = max_file_size_mb * 1024 * 1024 for root, dirs, files in os.walk(input_dir): # Filter out directories that start with '.'. E.g. ".cache/" @@ -381,6 +383,13 @@ def copy_files( src_file = os.path.join(root, file) dest_file = os.path.join(dest_dir, file) + # Check the file size + if os.path.getsize(src_file) > max_file_size: + print( + f"Skipping copying {src_file} to {output_dir} as it exceeds the size limit of {max_file_size_mb} MiB." + ) + continue + # Copy the file if it doesn't already exist in the destination if not os.path.exists(dest_file): shutil.copy2(src_file, dest_file) @@ -563,3 +572,23 @@ def validate_checkpoint_files( ) return checkpoint_paths + + +def check_outdir_not_in_ckptdir(ckpt_dir: Path, out_dir: Path) -> bool: + """ + Checks that the output directory is not equal to or a subdirectory of the checkpoint directory. + This is necessary to avoid making copies of copies when geting config files from ckpt_dir. 
+ """ + + # Resolve the absolute paths to avoid issues with relative paths + _ckpt_dir = ckpt_dir.resolve() + _out_dir = out_dir.resolve() + + # Check if out_dir is the same as ckpt_dir or a subdirectory of it + if _out_dir == _ckpt_dir or _ckpt_dir in _out_dir.parents: + raise ValueError( + "The output directory cannot be the same as or a subdirectory of the checkpoint directory. " + f"Found {ckpt_dir=} and {out_dir=}." + ) + + return True diff --git a/torchtune/training/quantization.py b/torchtune/training/quantization.py index 4e21cb4936..b158d4b9a3 100644 --- a/torchtune/training/quantization.py +++ b/torchtune/training/quantization.py @@ -130,7 +130,10 @@ def quantize(self, model): # int4 weight-only -Int4WeightOnlyQATQuantizerModuleSwap = Int4WeightOnlyQATQuantizer +class Int4WeightOnlyQATQuantizerModuleSwap(Int4WeightOnlyQATQuantizer): + pass + + disable_4w_fake_quant_module_swap = disable_4w_fake_quant enable_4w_fake_quant_module_swap = enable_4w_fake_quant _quantizer_to_mode[Int4WeightOnlyQATQuantizerModuleSwap] = "4w-qat-module-swap" @@ -142,7 +145,10 @@ def quantize(self, model): ] = enable_4w_fake_quant_module_swap # int8 dynamic activations + int4 weight -Int8DynActInt4WeightQATQuantizerModuleSwap = Int8DynActInt4WeightQATQuantizer +class Int8DynActInt4WeightQATQuantizerModuleSwap(Int8DynActInt4WeightQATQuantizer): + pass + + disable_8da4w_fake_quant_module_swap = disable_8da4w_fake_quant enable_8da4w_fake_quant_module_swap = enable_8da4w_fake_quant _quantizer_to_mode[Int8DynActInt4WeightQATQuantizerModuleSwap] = "8da4w-qat-module-swap"
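
Editor's note on the final hunk: torchtune/training/quantization.py replaces the plain aliases (e.g. Int8DynActInt4WeightQATQuantizerModuleSwap = Int8DynActInt4WeightQATQuantizer) with empty subclasses. The patch itself does not state the motivation, but a plausible reading is that _quantizer_to_mode is a dict keyed by the quantizer class: an alias is the very same class object, so registering the "-module-swap" mode under it would reuse (and overwrite) the base quantizer's key, whereas a distinct subclass gets its own entry. The sketch below demonstrates only that dictionary behaviour, using hypothetical BaseQuantizer/SubclassQuantizer stand-ins rather than the torchao classes from the patch.

# Minimal, standalone sketch (not torchtune code) of why an empty subclass,
# unlike a plain alias, gets its own entry in a class-keyed registry.
# BaseQuantizer, AliasQuantizer, SubclassQuantizer and the mode strings are
# hypothetical stand-ins.

class BaseQuantizer:
    pass

AliasQuantizer = BaseQuantizer            # same class object -> same dict key

class SubclassQuantizer(BaseQuantizer):   # distinct class object -> distinct dict key
    pass

registry = {BaseQuantizer: "base-mode"}
registry[AliasQuantizer] = "module-swap-mode"
print(registry[BaseQuantizer])            # module-swap-mode: the base entry was overwritten

registry = {BaseQuantizer: "base-mode"}
registry[SubclassQuantizer] = "module-swap-mode"
print(registry[BaseQuantizer], registry[SubclassQuantizer])  # base-mode module-swap-mode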
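
Editor's note on the config hunks: many of the YAML changes above hoist output_dir to the top of each file and point nested fields such as checkpointer.output_dir at ${output_dir}. The recipes in this patch consume these files as omegaconf DictConfig objects, and ${output_dir} is OmegaConf's interpolation syntax, so the nested field resolves to whatever the top-level value is, including command-line overrides like output_dir=... . The snippet below is a standalone OmegaConf sketch over a trimmed-down, made-up config, not torchtune's own config loading.

from omegaconf import OmegaConf

# Trimmed-down stand-in for one of the patched configs; only the keys needed
# to show the ${output_dir} interpolation are included.
cfg = OmegaConf.create(
    """
    output_dir: /tmp/torchtune/llama2_7B/quantized
    checkpointer:
      output_dir: ${output_dir}
    """
)

print(cfg.checkpointer.output_dir)  # /tmp/torchtune/llama2_7B/quantized

# Overriding the top-level value (as a "tune run ... output_dir=..." override does)
# is reflected wherever it is interpolated.
cfg.output_dir = "/data/my_runs/quantized"
print(cfg.checkpointer.output_dir)  # /data/my_runs/quantized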