2024-12-20 nightly release (74e6e7b)
pytorchbot committed Dec 20, 2024
1 parent cb4baba commit b2d9463
Showing 33 changed files with 263 additions and 98 deletions.
4 changes: 3 additions & 1 deletion recipes/configs/eleuther_evaluation.yaml
@@ -3,6 +3,8 @@
# To launch, run the following command from root torchtune directory:
# tune run eleuther_eval --config eleuther_evaluation tasks=["truthfulqa_mc2","hellaswag"]

output_dir: ./ # Not needed

# Model Arguments
model:
_component_: torchtune.models.llama2.llama2_7b
@@ -14,7 +16,7 @@ checkpointer:
pytorch_model-00001-of-00002.bin,
pytorch_model-00002-of-00002.bin,
]
output_dir: /tmp/Llama-2-7b-hf
output_dir: ${output_dir}
model_type: LLAMA2

# Tokenizer
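Across the evaluation and generation configs in this commit, a top-level output_dir key is added and the checkpointer now refers to it via ${output_dir} interpolation instead of a hard-coded path. A minimal sketch of how that interpolation resolves, assuming torchtune's OmegaConf-based config loading (illustrative only, not part of the commit):

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    """
output_dir: ./  # not needed for eval, but referenced below
checkpointer:
  checkpoint_dir: /tmp/Llama-2-7b-hf
  output_dir: ${output_dir}
"""
)
print(cfg.checkpointer.output_dir)  # interpolation resolves to "./" on access

The same key can typically be overridden on the command line, e.g. tune run eleuther_eval --config eleuther_evaluation output_dir=/my/eval/dir (the path here is just an example), and every ${output_dir} reference follows it.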
4 changes: 3 additions & 1 deletion recipes/configs/gemma/evaluation.yaml
@@ -3,6 +3,8 @@
# To launch, run the following command:
# tune run eleuther_eval --config gemma/evaluation

output_dir: ./ # Not needed

# Model Arguments
model:
_component_: torchtune.models.gemma.gemma_2b
@@ -15,7 +17,7 @@ checkpointer:
model-00001-of-00002.safetensors,
model-00002-of-00002.safetensors,
]
output_dir: ./ # Not needed
output_dir: ${output_dir}
model_type: GEMMA

# Tokenizer
4 changes: 3 additions & 1 deletion recipes/configs/generation.yaml
@@ -3,6 +3,8 @@
# To launch, run the following command from root torchtune directory:
# tune run generate --config generation

output_dir: ./ # Not needed

# Model arguments
model:
_component_: torchtune.models.llama2.llama2_7b
@@ -14,7 +16,7 @@ checkpointer:
pytorch_model-00001-of-00002.bin,
pytorch_model-00002-of-00002.bin,
]
output_dir: /tmp/Llama-2-7b-hf/
output_dir: ${output_dir}
model_type: LLAMA2

device: cuda
2 changes: 1 addition & 1 deletion recipes/configs/llama2/7B_lora_dpo.yaml
@@ -32,7 +32,7 @@ model:
tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer
path: /tmp/Llama-2-7b-hf/tokenizer.model
max_seq_len: 1024
max_seq_len: 1024 # higher increases memory

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
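The LoRA DPO configs in this commit pin max_seq_len to 1024, with a comment flagging the memory cost of raising it. A short sketch of how the tokenizer consumes this setting (the path is the placeholder from the config, and the truncation behavior is the usual torchtune tokenizer behavior, stated here as an assumption):

from torchtune.models.llama2 import llama2_tokenizer

tokenizer = llama2_tokenizer(
    path="/tmp/Llama-2-7b-hf/tokenizer.model",
    max_seq_len=1024,  # samples longer than this are truncated; higher values increase activation memory
)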
2 changes: 1 addition & 1 deletion recipes/configs/llama2/7B_lora_dpo_single_device.yaml
@@ -31,7 +31,7 @@ model:
tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer
path: /tmp/Llama-2-7b-hf/tokenizer.model
max_seq_len: 1024
max_seq_len: 1024 # higher increases memory

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
4 changes: 3 additions & 1 deletion recipes/configs/llama2/generation_v2.yaml
@@ -6,6 +6,8 @@
# To launch, run the following command:
# tune run dev/generate_v2 --config llama2/generation_v2

output_dir: ./ # Not needed

# Model arguments
model:
_component_: torchtune.models.llama2.llama2_7b
@@ -24,7 +26,7 @@ checkpointer:
pytorch_model-00001-of-00002.bin,
pytorch_model-00002-of-00002.bin
]
output_dir: ./
output_dir: ${output_dir}
model_type: LLAMA2

# Device
9 changes: 4 additions & 5 deletions recipes/configs/llama3/8B_qat_lora.yaml
@@ -83,6 +83,10 @@ dtype: bf16
enable_activation_checkpointing: False # True reduces memory
enable_activation_offloading: False # True reduces memory

# QAT arguments
quantizer:
_component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
groupsize: 256

# Profiler (disabled)
profiler:
@@ -108,8 +112,3 @@ profiler:
warmup_steps: 3
active_steps: 2
num_cycles: 1

# QAT arguments
quantizer:
_component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
groupsize: 256
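The QAT + LoRA configs touched by this commit (llama3 8B, llama3_1 8B, llama3_2 1B and 3B) move the quantizer block up next to the other training arguments instead of leaving it after the profiler section; the block's contents are unchanged. A hedged sketch of how such a block is typically turned into an object with torchtune's config utilities (illustrative only, not recipe code):

from omegaconf import OmegaConf
from torchtune import config

cfg = OmegaConf.create(
    """
quantizer:
  _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
  groupsize: 256
"""
)
# Builds the component named in _component_ with the remaining keys as kwargs.
quantizer = config.instantiate(cfg.quantizer)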
2 changes: 1 addition & 1 deletion recipes/configs/llama3_1/8B_lora_dpo.yaml
@@ -32,7 +32,7 @@ model:
tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer
path: /tmp/Meta-Llama-3.1-8B-Instruct/original/tokenizer.model
max_seq_len: null
max_seq_len: 1024 # higher increases memory

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
2 changes: 1 addition & 1 deletion recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml
@@ -31,7 +31,7 @@ model:
tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer
path: /tmp/Meta-Llama-3.1-8B-Instruct/original/tokenizer.model
max_seq_len: null
max_seq_len: 1024 # higher increases memory

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
9 changes: 4 additions & 5 deletions recipes/configs/llama3_1/8B_qat_lora.yaml
@@ -86,6 +86,10 @@ dtype: bf16
enable_activation_checkpointing: False # True reduces memory
enable_activation_offloading: False # True reduces memory

# QAT arguments
quantizer:
_component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
groupsize: 256

# Profiler (disabled)
profiler:
@@ -111,8 +115,3 @@ profiler:
warmup_steps: 3
active_steps: 2
num_cycles: 1

# QAT arguments
quantizer:
_component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
groupsize: 256
9 changes: 4 additions & 5 deletions recipes/configs/llama3_2/1B_qat_lora.yaml
@@ -82,6 +82,10 @@ dtype: bf16
enable_activation_checkpointing: False # True reduces memory
enable_activation_offloading: False # True reduces memory

# QAT arguments
quantizer:
_component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
groupsize: 256

# Profiler (disabled)
profiler:
@@ -107,8 +111,3 @@ profiler:
warmup_steps: 3
active_steps: 2
num_cycles: 1

# QAT arguments
quantizer:
_component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
groupsize: 256
9 changes: 4 additions & 5 deletions recipes/configs/llama3_2/3B_qat_lora.yaml
@@ -83,6 +83,10 @@ dtype: bf16
enable_activation_checkpointing: False # True reduces memory
enable_activation_offloading: False # True reduces memory

# QAT arguments
quantizer:
_component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
groupsize: 256

# Profiler (disabled)
profiler:
@@ -108,8 +112,3 @@ profiler:
warmup_steps: 3
active_steps: 2
num_cycles: 1

# QAT arguments
quantizer:
_component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
groupsize: 256
2 changes: 1 addition & 1 deletion recipes/configs/llama3_2/8B_to_1B_KD_lora_distributed.yaml
@@ -59,7 +59,7 @@ teacher_checkpointer:
model-00004-of-00004.safetensors
]
recipe_checkpoint: null
output_dir: /tmp/Meta-Llama-3.1-8B-Instruct/
output_dir: ${output_dir}
model_type: LLAMA3

# Dataset and Sampler
@@ -59,7 +59,7 @@ teacher_checkpointer:
model-00004-of-00004.safetensors
]
recipe_checkpoint: null
output_dir: /tmp/Meta-Llama-3.1-8B-Instruct/
output_dir: ${output_dir}
model_type: LLAMA3

# Dataset and Sampler
4 changes: 3 additions & 1 deletion recipes/configs/llama3_2_vision/11B_evaluation.yaml
@@ -9,6 +9,8 @@
# To launch, run the following command from root torchtune directory:
# tune run eleuther_eval --config llama3_2_vision/11B_evaluation

output_dir: ./ # Not needed

# Model arguments
model:
_component_: torchtune.models.llama3_2_vision.llama3_2_vision_11b
@@ -26,7 +28,7 @@ checkpointer:
checkpoint_files:
filename_format: model-{}-of-{}.safetensors
max_filename: "00005"
output_dir: ./
output_dir: ${output_dir}
model_type: LLAMA3_VISION

# Environment
4 changes: 3 additions & 1 deletion recipes/configs/llama3_2_vision/11B_generation_v2.yaml
@@ -7,6 +7,8 @@
# To launch, run the following command from root torchtune directory:
# tune run dev/generate_v2 --config llama3_2_vision/generation_v2

output_dir: ./ # Not needed

# Model arguments
model:
_component_: torchtune.models.llama3_2_vision.llama3_2_vision_11b
@@ -25,7 +27,7 @@ checkpointer:
checkpoint_files:
filename_format: model-{}-of-{}.safetensors
max_filename: "00005"
output_dir: ./
output_dir: ${output_dir}
model_type: LLAMA3_VISION

# Device
4 changes: 3 additions & 1 deletion recipes/configs/mistral/evaluation.yaml
@@ -3,6 +3,8 @@
# To launch, run the following command:
# tune run eleuther_eval --config mistral/evaluation

output_dir: ./ # Not needed

# Model Arguments
model:
_component_: torchtune.models.mistral.mistral_7b
@@ -15,7 +17,7 @@ checkpointer:
pytorch_model-00001-of-00002.bin,
pytorch_model-00002-of-00002.bin
]
output_dir: /tmp/Mistral-7B-v0.1/
output_dir: ${output_dir}
model_type: MISTRAL
resume_from_checkpoint: False

4 changes: 3 additions & 1 deletion recipes/configs/phi3/evaluation.yaml
@@ -3,6 +3,8 @@
# To launch, run the following command:
# tune run eleuther_eval --config phi3/evaluation

output_dir: ./ # Not needed

# Model Arguments
model:
_component_: torchtune.models.phi3.phi3_mini
@@ -16,7 +18,7 @@ checkpointer:
model-00002-of-00002.safetensors
]
recipe_checkpoint: null
output_dir: /tmp/Phi-3-mini-4k-instruct
output_dir: ${output_dir}
model_type: PHI3_MINI
resume_from_checkpoint: False

4 changes: 3 additions & 1 deletion recipes/configs/quantization.yaml
@@ -3,6 +3,8 @@
# To launch, run the following command from root torchtune directory:
# tune run quantize --config quantization

output_dir: /tmp/torchtune/llama2_7B/quantized # /tmp may be deleted by your system. Change it to your preference.

#
# Model arguments
model:
@@ -16,7 +18,7 @@ checkpointer:
pytorch_model-00002-of-00002.bin,
]
recipe_checkpoint: null
output_dir: /tmp/Llama-2-7b-hf
output_dir: ${output_dir}
model_type: LLAMA2

device: cuda
2 changes: 1 addition & 1 deletion recipes/configs/qwen2/1.5_to_0.5B_KD_lora_distributed.yaml
@@ -51,7 +51,7 @@ teacher_checkpointer:
hf_model_0001_0.pt
]
recipe_checkpoint: null
output_dir: /tmp/Qwen2-1.5B-Instruct-lora-finetune
output_dir: ${output_dir}
model_type: QWEN2

resume_from_checkpoint: False
@@ -51,7 +51,7 @@ teacher_checkpointer:
model.safetensors
]
recipe_checkpoint: null
output_dir: /tmp/Qwen2-1.5B-Instruct
output_dir: ${output_dir}
model_type: QWEN2

resume_from_checkpoint: False
4 changes: 3 additions & 1 deletion recipes/configs/qwen2/evaluation.yaml
@@ -3,6 +3,8 @@
# To launch, run the following command:
# tune run eleuther_eval --config qwen2/evaluation

output_dir: ./ # Not needed

# Model Arguments
model:
_component_: torchtune.models.qwen2.qwen2_7b
@@ -17,7 +19,7 @@ checkpointer:
model-00003-of-00004.safetensors,
model-00004-of-00004.safetensors
]
output_dir: ./ # Not needed
output_dir: ${output_dir}
model_type: QWEN2

# Tokenizer
6 changes: 3 additions & 3 deletions recipes/dev/7B_full_early_exit.yaml
@@ -30,6 +30,7 @@
# This config works best for distributed training, i.e., when the model is being fine-tuned on 2+ GPUs.
#

output_dir: /tmp/torchtune/llama2_7b/full_early_exit # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
@@ -61,7 +62,7 @@ checkpointer:
pytorch_model-00002-of-00002.bin
]
recipe_checkpoint: null
output_dir: /tmp/Llama-2-7b-hf
output_dir: ${output_dir}
model_type: LLAMA2
resume_from_checkpoint: False

@@ -92,8 +93,7 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/topv2-llama2-finetune
log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

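Besides gaining a top-level output_dir, the early-exit config drops the duplicated log_dir/output_dir keys under metric_logger and points the logger at ${output_dir}/logs. A brief sketch of what the resolved logger amounts to (the directory value is taken from the output_dir set earlier in this config; the call itself is illustrative):

from torchtune.training.metric_logging import DiskLogger

logger = DiskLogger(log_dir="/tmp/torchtune/llama2_7b/full_early_exit/logs")
logger.log("loss", 1.23, step=0)  # appends scalar metrics to a text log under log_dir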
12 changes: 12 additions & 0 deletions recipes/lora_dpo_distributed.py
@@ -26,6 +26,7 @@
    DoRALinear,
    get_adapter_params,
    get_adapter_state_dict,
    get_lora_module_names,
    get_merged_lora_ckpt,
    LoRALinear,
    set_trainable_params,
@@ -595,6 +596,17 @@ def save_checkpoint(
}
)

adapter_config = {
    "r": self._lora_rank,
    "lora_alpha": self._lora_alpha,
    "target_modules": get_lora_module_names(
        self._lora_attn_modules,
        self._apply_lora_to_mlp,
        self._apply_lora_to_output,
    ),
    "peft_type": "LORA",
}
checkpoint_dict.update({training.ADAPTER_CONFIG: adapter_config})
self._checkpointer.save_checkpoint(
    checkpoint_dict,
    epoch=epoch,
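The only recipe code change shown here teaches lora_dpo_distributed.py to store a PEFT-style adapter config alongside the adapter weights at checkpoint time. A hedged sketch of what that dict looks like for an illustrative run (rank, alpha, and attention-module choices below are made-up example values, not taken from the diff):

from torchtune.modules.peft import get_lora_module_names

adapter_config = {
    "r": 8,            # LoRA rank (self._lora_rank in the recipe)
    "lora_alpha": 16,  # LoRA scaling factor (self._lora_alpha in the recipe)
    "target_modules": get_lora_module_names(
        ["q_proj", "v_proj"],  # lora_attn_modules
        False,                 # apply_lora_to_mlp
        False,                 # apply_lora_to_output
    ),
    "peft_type": "LORA",
}

With these keys present, the saved adapter can typically be consumed by tooling that expects a Hugging Face PEFT-style adapter config.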