diff --git a/recipes/configs/code_llama2/7B_full_low_memory.yaml b/recipes/configs/code_llama2/7B_full_low_memory.yaml index 6bca6c378f..bae760c67e 100644 --- a/recipes/configs/code_llama2/7B_full_low_memory.yaml +++ b/recipes/configs/code_llama2/7B_full_low_memory.yaml @@ -45,7 +45,9 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset + seed: null shuffle: True @@ -75,4 +77,4 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/CodeLlama-7b-hf/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/code_llama2/7B_lora_single_device.yaml b/recipes/configs/code_llama2/7B_lora_single_device.yaml index 263e3c12e1..1ada63446b 100644 --- a/recipes/configs/code_llama2/7B_lora_single_device.yaml +++ b/recipes/configs/code_llama2/7B_lora_single_device.yaml @@ -49,7 +49,9 @@ save_adapter_weights_only: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset + seed: null shuffle: True @@ -84,7 +86,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/CodeLlama-7b-hf/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Showcase the usage of PyTorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/code_llama2/7B_qlora_single_device.yaml b/recipes/configs/code_llama2/7B_qlora_single_device.yaml index 4f6fd9be61..e7910d73cc 100644 --- a/recipes/configs/code_llama2/7B_qlora_single_device.yaml +++ b/recipes/configs/code_llama2/7B_qlora_single_device.yaml @@ -49,6 +49,7 @@ save_adapter_weights_only: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -84,7 +85,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/CodeLlama-7b-hf/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/dev/8B_full_experimental.yaml b/recipes/configs/dev/8B_full_experimental.yaml index 4ed8a80e09..ee1e0f650c 100644 --- a/recipes/configs/dev/8B_full_experimental.yaml +++ b/recipes/configs/dev/8B_full_experimental.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -57,7 +58,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 - +compile: False # Training env device: cuda @@ -78,3 +79,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-llama3-finetune log_every_n_steps: null +log_peak_memory_stats: True diff --git a/recipes/configs/gemma/2B_full.yaml b/recipes/configs/gemma/2B_full.yaml index e1bd3272d0..a3b8ed59f7 100644 --- a/recipes/configs/gemma/2B_full.yaml +++ b/recipes/configs/gemma/2B_full.yaml @@ -23,6 +23,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -54,6 +55,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # 
Training env device: cuda @@ -70,4 +72,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/gemma/2B_lora.yaml b/recipes/configs/gemma/2B_lora.yaml index b82faa39e2..8ed92dd115 100644 --- a/recipes/configs/gemma/2B_lora.yaml +++ b/recipes/configs/gemma/2B_lora.yaml @@ -22,6 +22,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -66,6 +67,7 @@ batch_size: 4 epochs: 3 max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Training env device: cuda @@ -82,4 +84,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-lora log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/gemma/2B_lora_single_device.yaml b/recipes/configs/gemma/2B_lora_single_device.yaml index d6e1664b71..b661710caf 100644 --- a/recipes/configs/gemma/2B_lora_single_device.yaml +++ b/recipes/configs/gemma/2B_lora_single_device.yaml @@ -22,6 +22,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -83,7 +84,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-lora log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/gemma/2B_qlora_single_device.yaml b/recipes/configs/gemma/2B_qlora_single_device.yaml index 9b24d6c0ee..2b5cbf96bb 100644 --- a/recipes/configs/gemma/2B_qlora_single_device.yaml +++ b/recipes/configs/gemma/2B_qlora_single_device.yaml @@ -22,6 +22,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -83,7 +84,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-lora log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/gemma/7B_full.yaml b/recipes/configs/gemma/7B_full.yaml index a8924836fe..eb6b8c9426 100644 --- a/recipes/configs/gemma/7B_full.yaml +++ b/recipes/configs/gemma/7B_full.yaml @@ -23,6 +23,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -56,6 +57,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Training env device: cuda @@ -72,4 +74,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/gemma/7B_lora.yaml b/recipes/configs/gemma/7B_lora.yaml index 6db9b0ab82..4d74f93671 100644 --- a/recipes/configs/gemma/7B_lora.yaml +++ b/recipes/configs/gemma/7B_lora.yaml @@ -23,6 +23,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -68,6 +69,7 @@ batch_size: 4 epochs: 3 max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Training env device: 
cuda @@ -84,4 +86,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-lora log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/gemma/7B_lora_single_device.yaml b/recipes/configs/gemma/7B_lora_single_device.yaml index c82f0b76ba..369ba715e5 100644 --- a/recipes/configs/gemma/7B_lora_single_device.yaml +++ b/recipes/configs/gemma/7B_lora_single_device.yaml @@ -22,6 +22,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -85,7 +86,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-lora log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/gemma/7B_qlora_single_device.yaml b/recipes/configs/gemma/7B_qlora_single_device.yaml index fcbccb786b..301a7b4a5d 100644 --- a/recipes/configs/gemma/7B_qlora_single_device.yaml +++ b/recipes/configs/gemma/7B_qlora_single_device.yaml @@ -22,6 +22,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -85,7 +86,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-lora log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/llama2/13B_full.yaml b/recipes/configs/llama2/13B_full.yaml index f5ecffc2ab..be5a4e8b1d 100644 --- a/recipes/configs/llama2/13B_full.yaml +++ b/recipes/configs/llama2/13B_full.yaml @@ -43,6 +43,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -58,6 +59,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Training env device: cuda @@ -74,4 +76,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-llama2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml index d657754139..797abc2a63 100644 --- a/recipes/configs/llama2/13B_lora.yaml +++ b/recipes/configs/llama2/13B_lora.yaml @@ -52,6 +52,7 @@ tokenizer: # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -74,6 +75,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 16 +compile: False # Logging output_dir: /tmp/lora_finetune_output @@ -81,7 +83,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/13B_qlora_single_device.yaml b/recipes/configs/llama2/13B_qlora_single_device.yaml index 56431fdff5..9e8faaa800 100644 --- a/recipes/configs/llama2/13B_qlora_single_device.yaml +++ b/recipes/configs/llama2/13B_qlora_single_device.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: 
torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -77,7 +78,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/70B_lora.yaml b/recipes/configs/llama2/70B_lora.yaml index b4d0d9c9a9..9502690be2 100644 --- a/recipes/configs/llama2/70B_lora.yaml +++ b/recipes/configs/llama2/70B_lora.yaml @@ -52,6 +52,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -81,7 +82,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/70B_qlora.yaml b/recipes/configs/llama2/70B_qlora.yaml index c1de2c2358..c0e2e320f3 100644 --- a/recipes/configs/llama2/70B_qlora.yaml +++ b/recipes/configs/llama2/70B_qlora.yaml @@ -57,6 +57,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset train_on_input: True seed: null @@ -91,7 +92,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml index 2e80276c84..3a6e3c35f2 100644 --- a/recipes/configs/llama2/7B_full.yaml +++ b/recipes/configs/llama2/7B_full.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -57,7 +58,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 - +compile: False # Training env device: cuda @@ -74,4 +75,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-llama2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama2/7B_full_low_memory.yaml b/recipes/configs/llama2/7B_full_low_memory.yaml index 06558009ed..b9b933c2df 100644 --- a/recipes/configs/llama2/7B_full_low_memory.yaml +++ b/recipes/configs/llama2/7B_full_low_memory.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -79,4 +80,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-llama2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml index 2c9a694d7b..82276fa317 100644 --- a/recipes/configs/llama2/7B_lora.yaml +++ b/recipes/configs/llama2/7B_lora.yaml @@ -49,6 +49,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -78,7 +79,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda @@ 
-92,14 +93,14 @@ profiler: enabled: False - #Output directory of trace artifacts + # Output directory of trace artifacts output_dir: ${output_dir}/profiling_outputs #`torch.profiler.ProfilerActivity` types to trace cpu: True cuda: True - #trace options passed to `torch.profiler.profile` + # trace options passed to `torch.profiler.profile` profile_memory: False with_stack: False record_shapes: True diff --git a/recipes/configs/llama2/7B_lora_dpo.yaml b/recipes/configs/llama2/7B_lora_dpo.yaml index 26f824814f..1a0b4bc390 100644 --- a/recipes/configs/llama2/7B_lora_dpo.yaml +++ b/recipes/configs/llama2/7B_lora_dpo.yaml @@ -70,6 +70,7 @@ loss: epochs: 1 max_steps_per_epoch: 1000 gradient_accumulation_steps: 8 +compile: False # Logging output_dir: /tmp/lora_dpo_output/ @@ -77,7 +78,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml index 2ad3988867..bfe8185f06 100644 --- a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml +++ b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml @@ -75,7 +75,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/7B_lora_single_device.yaml b/recipes/configs/llama2/7B_lora_single_device.yaml index ebaee584c2..a1c001b868 100644 --- a/recipes/configs/llama2/7B_lora_single_device.yaml +++ b/recipes/configs/llama2/7B_lora_single_device.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -77,7 +78,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/7B_qat_full.yaml b/recipes/configs/llama2/7B_qat_full.yaml index 6fca6c4d4a..d1a408aca5 100644 --- a/recipes/configs/llama2/7B_qat_full.yaml +++ b/recipes/configs/llama2/7B_qat_full.yaml @@ -22,6 +22,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -53,6 +54,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # QAT arguments quantizer: @@ -75,4 +77,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-llama2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml index 052cdb9296..26fc4faf11 100644 --- a/recipes/configs/llama2/7B_qlora.yaml +++ b/recipes/configs/llama2/7B_qlora.yaml @@ -48,6 +48,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset train_on_input: True seed: null @@ -82,7 +83,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False 
+log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/7B_qlora_single_device.yaml b/recipes/configs/llama2/7B_qlora_single_device.yaml index 0893f48579..611c5b155b 100644 --- a/recipes/configs/llama2/7B_qlora_single_device.yaml +++ b/recipes/configs/llama2/7B_qlora_single_device.yaml @@ -46,6 +46,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -76,7 +77,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3/70B_full.yaml b/recipes/configs/llama3/70B_full.yaml index cd345822b6..608992f737 100644 --- a/recipes/configs/llama3/70B_full.yaml +++ b/recipes/configs/llama3/70B_full.yaml @@ -25,6 +25,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -94,7 +95,7 @@ device: cuda enable_activation_checkpointing: True custom_sharded_layers: ['tok_embeddings', 'output'] fsdp_cpu_offload: True -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Reduced precision dtype: bf16 @@ -105,4 +106,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3/70B_lora.yaml b/recipes/configs/llama3/70B_lora.yaml index f3a921f289..247daba5cc 100644 --- a/recipes/configs/llama3/70B_lora.yaml +++ b/recipes/configs/llama3/70B_lora.yaml @@ -67,6 +67,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -89,7 +90,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 1 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output @@ -97,7 +98,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3/8B_dora.yaml b/recipes/configs/llama3/8B_dora.yaml index 1265c82c72..a9ea97986e 100644 --- a/recipes/configs/llama3/8B_dora.yaml +++ b/recipes/configs/llama3/8B_dora.yaml @@ -42,6 +42,7 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -64,6 +65,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Logging output_dir: /tmp/dora_finetune_output @@ -71,7 +73,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3/8B_dora_single_device.yaml b/recipes/configs/llama3/8B_dora_single_device.yaml index 0fc0a484dc..188b54f757 100644 --- a/recipes/configs/llama3/8B_dora_single_device.yaml +++ 
b/recipes/configs/llama3/8B_dora_single_device.yaml @@ -44,6 +44,7 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -74,7 +75,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3/8B_full.yaml b/recipes/configs/llama3/8B_full.yaml index 7f24376db7..baa4a79417 100644 --- a/recipes/configs/llama3/8B_full.yaml +++ b/recipes/configs/llama3/8B_full.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -57,7 +58,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 - +compile: False # Training env device: cuda @@ -75,4 +76,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3/8B_full_single_device.yaml b/recipes/configs/llama3/8B_full_single_device.yaml index cd3e3586ce..6b8e1ad4b8 100644 --- a/recipes/configs/llama3/8B_full_single_device.yaml +++ b/recipes/configs/llama3/8B_full_single_device.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -78,4 +79,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3/8B_lora.yaml b/recipes/configs/llama3/8B_lora.yaml index d65138f348..69a2349035 100644 --- a/recipes/configs/llama3/8B_lora.yaml +++ b/recipes/configs/llama3/8B_lora.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -69,6 +70,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 32 +compile: False # Logging output_dir: /tmp/lora_finetune_output @@ -76,7 +78,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3/8B_lora_single_device.yaml b/recipes/configs/llama3/8B_lora_single_device.yaml index e49afacbb1..661bbe86db 100644 --- a/recipes/configs/llama3/8B_lora_single_device.yaml +++ b/recipes/configs/llama3/8B_lora_single_device.yaml @@ -46,6 +46,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -76,7 +77,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda @@ -91,14 +92,14 @@ profiler: _component_: torchtune.training.setup_torch_profiler enabled: False - #Output directory of trace artifacts + # Output directory of trace artifacts output_dir: ${output_dir}/profiling_outputs 
#`torch.profiler.ProfilerActivity` types to trace cpu: True cuda: True - #trace options passed to `torch.profiler.profile` + # trace options passed to `torch.profiler.profile` profile_memory: False with_stack: False record_shapes: True diff --git a/recipes/configs/llama3/8B_qat_full.yaml b/recipes/configs/llama3/8B_qat_full.yaml index ff4d9c3195..07461e8243 100644 --- a/recipes/configs/llama3/8B_qat_full.yaml +++ b/recipes/configs/llama3/8B_qat_full.yaml @@ -21,6 +21,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -43,6 +44,7 @@ resume_from_checkpoint: False # Fine-tuning arguments batch_size: 2 epochs: 3 +compile: False # QAT arguments quantizer: @@ -74,4 +76,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-llama3-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3/8B_qdora_single_device.yaml b/recipes/configs/llama3/8B_qdora_single_device.yaml index 7180c5a72c..fafda9a123 100644 --- a/recipes/configs/llama3/8B_qdora_single_device.yaml +++ b/recipes/configs/llama3/8B_qdora_single_device.yaml @@ -45,6 +45,7 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -75,7 +76,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3/8B_qlora_single_device.yaml b/recipes/configs/llama3/8B_qlora_single_device.yaml index 1eef476d17..83c0dcb9d1 100644 --- a/recipes/configs/llama3/8B_qlora_single_device.yaml +++ b/recipes/configs/llama3/8B_qlora_single_device.yaml @@ -45,6 +45,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -75,7 +76,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_1/405B_qlora.yaml b/recipes/configs/llama3_1/405B_qlora.yaml index 58f5eb9b1f..f640581ba1 100644 --- a/recipes/configs/llama3_1/405B_qlora.yaml +++ b/recipes/configs/llama3_1/405B_qlora.yaml @@ -41,6 +41,7 @@ save_adapter_weights_only: True # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset train_on_input: True seed: null @@ -67,7 +68,7 @@ fsdp: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 16 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/qlora_finetune_output diff --git a/recipes/configs/llama3_1/70B_full.yaml b/recipes/configs/llama3_1/70B_full.yaml index 2a7d19399c..97ca0a7052 100644 --- a/recipes/configs/llama3_1/70B_full.yaml +++ b/recipes/configs/llama3_1/70B_full.yaml @@ -24,6 +24,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -96,7 +97,7 @@ device: cuda enable_activation_checkpointing: True custom_sharded_layers: 
['tok_embeddings', 'output'] fsdp_cpu_offload: True -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Reduced precision dtype: bf16 @@ -107,4 +108,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3_1-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3_1/70B_lora.yaml b/recipes/configs/llama3_1/70B_lora.yaml index 861279127a..ad1bc64110 100644 --- a/recipes/configs/llama3_1/70B_lora.yaml +++ b/recipes/configs/llama3_1/70B_lora.yaml @@ -66,6 +66,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -88,7 +89,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 1 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora-llama3_1-finetune-output @@ -96,7 +97,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_1/8B_full.yaml b/recipes/configs/llama3_1/8B_full.yaml index 4420b0cae5..da27c91852 100644 --- a/recipes/configs/llama3_1/8B_full.yaml +++ b/recipes/configs/llama3_1/8B_full.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -60,7 +61,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 - +compile: False # Training env device: cuda @@ -68,7 +69,7 @@ device: cuda # Memory management enable_activation_checkpointing: True custom_sharded_layers: ['tok_embeddings', 'output'] -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Reduced precision dtype: bf16 @@ -79,4 +80,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3.1-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3_1/8B_full_single_device.yaml b/recipes/configs/llama3_1/8B_full_single_device.yaml index 9f7d9472ce..04ba339b23 100644 --- a/recipes/configs/llama3_1/8B_full_single_device.yaml +++ b/recipes/configs/llama3_1/8B_full_single_device.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -61,7 +62,7 @@ loss: max_steps_per_epoch: null gradient_accumulation_steps: 1 optimizer_in_bwd: True -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training environment device: cuda @@ -78,7 +79,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3.1-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (disabled) profiler: diff --git a/recipes/configs/llama3_1/8B_lora.yaml b/recipes/configs/llama3_1/8B_lora.yaml index 5f101b170f..d0a5202847 100644 --- a/recipes/configs/llama3_1/8B_lora.yaml +++ b/recipes/configs/llama3_1/8B_lora.yaml @@ 
-50,6 +50,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -72,7 +73,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 32 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output @@ -80,7 +81,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_1/8B_lora_single_device.yaml b/recipes/configs/llama3_1/8B_lora_single_device.yaml index 3991f728ce..bc9a3956f3 100644 --- a/recipes/configs/llama3_1/8B_lora_single_device.yaml +++ b/recipes/configs/llama3_1/8B_lora_single_device.yaml @@ -49,6 +49,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -71,7 +72,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 64 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output @@ -79,7 +80,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_1/8B_qlora_single_device.yaml b/recipes/configs/llama3_1/8B_qlora_single_device.yaml index a9b0662105..b194acb181 100644 --- a/recipes/configs/llama3_1/8B_qlora_single_device.yaml +++ b/recipes/configs/llama3_1/8B_qlora_single_device.yaml @@ -48,6 +48,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -70,7 +71,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 16 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/qlora_finetune_output/ @@ -78,7 +79,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/1B_full.yaml b/recipes/configs/llama3_2/1B_full.yaml index 23b699f754..c90fea966f 100644 --- a/recipes/configs/llama3_2/1B_full.yaml +++ b/recipes/configs/llama3_2/1B_full.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -64,7 +65,7 @@ device: cuda # Memory management enable_activation_checkpointing: False -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Reduced precision dtype: bf16 @@ -75,4 +76,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3.2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git 
a/recipes/configs/llama3_2/1B_full_single_device.yaml b/recipes/configs/llama3_2/1B_full_single_device.yaml index fc4b0a507c..e4d1f87fac 100644 --- a/recipes/configs/llama3_2/1B_full_single_device.yaml +++ b/recipes/configs/llama3_2/1B_full_single_device.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -58,7 +59,7 @@ loss: max_steps_per_epoch: null gradient_accumulation_steps: 1 optimizer_in_bwd: True -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training environment device: cuda @@ -75,7 +76,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3.2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (disabled) profiler: diff --git a/recipes/configs/llama3_2/1B_lora.yaml b/recipes/configs/llama3_2/1B_lora.yaml index 228e4989d5..b5e53900ef 100644 --- a/recipes/configs/llama3_2/1B_lora.yaml +++ b/recipes/configs/llama3_2/1B_lora.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -69,7 +70,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output @@ -77,7 +78,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/1B_lora_single_device.yaml b/recipes/configs/llama3_2/1B_lora_single_device.yaml index c9ebed6dc7..8c94bb0582 100644 --- a/recipes/configs/llama3_2/1B_lora_single_device.yaml +++ b/recipes/configs/llama3_2/1B_lora_single_device.yaml @@ -46,6 +46,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -68,7 +69,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output @@ -76,7 +77,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/1B_qlora_single_device.yaml b/recipes/configs/llama3_2/1B_qlora_single_device.yaml index da552b2a0f..282d0d9e89 100644 --- a/recipes/configs/llama3_2/1B_qlora_single_device.yaml +++ b/recipes/configs/llama3_2/1B_qlora_single_device.yaml @@ -45,6 +45,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -67,7 +68,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement 
# Logging output_dir: /tmp/lora_finetune_output @@ -75,7 +76,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/3B_full.yaml b/recipes/configs/llama3_2/3B_full.yaml index 6d738331ae..bfe9ef6420 100644 --- a/recipes/configs/llama3_2/3B_full.yaml +++ b/recipes/configs/llama3_2/3B_full.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -64,7 +65,7 @@ device: cuda # Memory management enable_activation_checkpointing: True -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Reduced precision dtype: bf16 @@ -75,4 +76,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3.2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3_2/3B_full_single_device.yaml b/recipes/configs/llama3_2/3B_full_single_device.yaml index 9b21f4f865..14a5369e71 100644 --- a/recipes/configs/llama3_2/3B_full_single_device.yaml +++ b/recipes/configs/llama3_2/3B_full_single_device.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -59,7 +60,7 @@ loss: max_steps_per_epoch: null gradient_accumulation_steps: 1 optimizer_in_bwd: True -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training environment device: cuda @@ -76,7 +77,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3.2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (disabled) profiler: diff --git a/recipes/configs/llama3_2/3B_lora.yaml b/recipes/configs/llama3_2/3B_lora.yaml index d13a303814..076f9d9171 100644 --- a/recipes/configs/llama3_2/3B_lora.yaml +++ b/recipes/configs/llama3_2/3B_lora.yaml @@ -48,6 +48,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -70,7 +71,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output @@ -78,7 +79,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/3B_lora_single_device.yaml b/recipes/configs/llama3_2/3B_lora_single_device.yaml index 255c75e227..b36d18f872 100644 --- a/recipes/configs/llama3_2/3B_lora_single_device.yaml +++ b/recipes/configs/llama3_2/3B_lora_single_device.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -69,7 +70,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile: False # set it to 
True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output @@ -77,7 +78,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/3B_qlora_single_device.yaml b/recipes/configs/llama3_2/3B_qlora_single_device.yaml index 360443b9e1..3efbd6c43c 100644 --- a/recipes/configs/llama3_2/3B_qlora_single_device.yaml +++ b/recipes/configs/llama3_2/3B_qlora_single_device.yaml @@ -46,6 +46,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -68,7 +69,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output @@ -76,7 +77,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml b/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml index 9cb029666f..ba39474639 100644 --- a/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml +++ b/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml @@ -62,6 +62,7 @@ teacher_checkpointer: # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -96,7 +97,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2_vision/11B_full.yaml b/recipes/configs/llama3_2_vision/11B_full.yaml index a9f4a41eb1..789ee952cb 100644 --- a/recipes/configs/llama3_2_vision/11B_full.yaml +++ b/recipes/configs/llama3_2_vision/11B_full.yaml @@ -42,6 +42,7 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.multimodal.the_cauldron_dataset subset: ocrvqa seed: null @@ -60,7 +61,7 @@ optimizer: loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss clip_grad_norm: 1.0 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training env device: cuda @@ -76,4 +77,4 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats:
True diff --git a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml index 3372c1a540..93d8c2cd11 100644 --- a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml +++ b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml @@ -44,6 +44,7 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.multimodal.the_cauldron_dataset subset: ocrvqa seed: null @@ -62,7 +63,7 @@ optimizer_in_bwd: False loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss clip_grad_norm: 1.0 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training env device: cuda @@ -77,7 +78,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (default is disabled) profiler: diff --git a/recipes/configs/llama3_2_vision/11B_lora.yaml b/recipes/configs/llama3_2_vision/11B_lora.yaml index 1e5c0323ac..a27f5f3510 100644 --- a/recipes/configs/llama3_2_vision/11B_lora.yaml +++ b/recipes/configs/llama3_2_vision/11B_lora.yaml @@ -48,6 +48,7 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.multimodal.the_cauldron_dataset subset: ocrvqa seed: null @@ -70,7 +71,7 @@ lr_scheduler: loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss clip_grad_norm: 1.0 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training env device: cuda @@ -86,4 +87,4 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml index 88e51aa355..45288521a1 100644 --- a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml +++ b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml @@ -46,6 +46,7 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.multimodal.the_cauldron_dataset subset: ocrvqa seed: null @@ -69,7 +70,7 @@ lr_scheduler: loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss clip_grad_norm: 1.0 -compile: False # set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training env device: cuda @@ -85,7 +86,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (disabled) profiler: diff --git a/recipes/configs/mistral/7B_full.yaml b/recipes/configs/mistral/7B_full.yaml index 602b3fe082..25cf783846 100644 --- a/recipes/configs/mistral/7B_full.yaml +++ b/recipes/configs/mistral/7B_full.yaml @@ -29,6 +29,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -60,6 +61,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss 
max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Training env device: cuda @@ -76,4 +78,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Mistral-7B-v0.1/ log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/mistral/7B_full_low_memory.yaml b/recipes/configs/mistral/7B_full_low_memory.yaml index 7e68ee8066..a6cf37fa8c 100644 --- a/recipes/configs/mistral/7B_full_low_memory.yaml +++ b/recipes/configs/mistral/7B_full_low_memory.yaml @@ -31,6 +31,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -81,4 +82,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Mistral-7B-v0.1/ log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/mistral/7B_full_ppo_low_memory.yaml b/recipes/configs/mistral/7B_full_ppo_low_memory.yaml index bf9aad71c3..db3b3f5e86 100644 --- a/recipes/configs/mistral/7B_full_ppo_low_memory.yaml +++ b/recipes/configs/mistral/7B_full_ppo_low_memory.yaml @@ -135,7 +135,7 @@ optimizer: _component_: bitsandbytes.optim.PagedAdamW lr: 3e-6 optimizer_in_bwd: True -log_peak_memory_stats: False +log_peak_memory_stats: True enable_activation_checkpointing: True # Reduced precision diff --git a/recipes/configs/mistral/7B_lora.yaml b/recipes/configs/mistral/7B_lora.yaml index 08196660fc..a2dc801925 100644 --- a/recipes/configs/mistral/7B_lora.yaml +++ b/recipes/configs/mistral/7B_lora.yaml @@ -30,6 +30,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -74,6 +75,7 @@ batch_size: 4 epochs: 3 max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Training env device: cuda @@ -90,4 +92,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Mistral-7B-v0.1 log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/mistral/7B_lora_single_device.yaml b/recipes/configs/mistral/7B_lora_single_device.yaml index 2ebc9f798e..21212f4983 100644 --- a/recipes/configs/mistral/7B_lora_single_device.yaml +++ b/recipes/configs/mistral/7B_lora_single_device.yaml @@ -27,6 +27,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -89,7 +90,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Mistral-7B-v0.1 log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/mistral/7B_qlora_single_device.yaml b/recipes/configs/mistral/7B_qlora_single_device.yaml index 3bbfebe3ba..e2f6884a9f 100644 --- a/recipes/configs/mistral/7B_qlora_single_device.yaml +++ b/recipes/configs/mistral/7B_qlora_single_device.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -90,7 +91,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Mistral-7B-v0.1 log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/phi3/mini_full.yaml 
b/recipes/configs/phi3/mini_full.yaml index 0ee746ddd4..0be89337a7 100644 --- a/recipes/configs/phi3/mini_full.yaml +++ b/recipes/configs/phi3/mini_full.yaml @@ -42,6 +42,7 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -57,6 +58,7 @@ optimizer: lr: 5e-6 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +compile: False # Training env device: cuda @@ -71,4 +73,4 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Phi-3-mini-4k-instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/phi3/mini_full_low_memory.yaml b/recipes/configs/phi3/mini_full_low_memory.yaml index 182a4f6a98..470f4a1afe 100644 --- a/recipes/configs/phi3/mini_full_low_memory.yaml +++ b/recipes/configs/phi3/mini_full_low_memory.yaml @@ -44,6 +44,7 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -74,4 +75,4 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Phi-3-mini-4k-instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/phi3/mini_lora.yaml b/recipes/configs/phi3/mini_lora.yaml index fff05885ef..1af4929985 100644 --- a/recipes/configs/phi3/mini_lora.yaml +++ b/recipes/configs/phi3/mini_lora.yaml @@ -49,6 +49,7 @@ save_adapter_weights_only: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -68,6 +69,7 @@ lr_scheduler: num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +compile: False # Training env device: cuda @@ -82,4 +84,4 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Phi-3-mini-4k-instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/phi3/mini_lora_single_device.yaml b/recipes/configs/phi3/mini_lora_single_device.yaml index b5c14b19ca..21a12a3cc1 100644 --- a/recipes/configs/phi3/mini_lora_single_device.yaml +++ b/recipes/configs/phi3/mini_lora_single_device.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -84,7 +85,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Phi-3-mini-4k-instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Showcase the usage of PyTorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/phi3/mini_qlora_single_device.yaml b/recipes/configs/phi3/mini_qlora_single_device.yaml index 10114bc67a..21c9403bef 100644 --- a/recipes/configs/phi3/mini_qlora_single_device.yaml +++ b/recipes/configs/phi3/mini_qlora_single_device.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -84,7 +85,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Phi-3-mini-4k-instruct/logs 
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Showcase the usage of PyTorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/qwen2/0.5B_full.yaml b/recipes/configs/qwen2/0.5B_full.yaml
index 5bf14591f9..39748ee052 100644
--- a/recipes/configs/qwen2/0.5B_full.yaml
+++ b/recipes/configs/qwen2/0.5B_full.yaml
@@ -26,6 +26,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -56,7 +57,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 16
-
+compile: False
 
 # Training env
 device: cuda
@@ -73,4 +74,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Qwen2-0.5B-Instruct-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/qwen2/0.5B_full_single_device.yaml b/recipes/configs/qwen2/0.5B_full_single_device.yaml
index 67091a4e8a..2d2afe883e 100644
--- a/recipes/configs/qwen2/0.5B_full_single_device.yaml
+++ b/recipes/configs/qwen2/0.5B_full_single_device.yaml
@@ -24,6 +24,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -74,4 +75,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Qwen2-0.5B-Instruct-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/qwen2/0.5B_lora.yaml b/recipes/configs/qwen2/0.5B_lora.yaml
index e0608eba5c..33b5e968d0 100644
--- a/recipes/configs/qwen2/0.5B_lora.yaml
+++ b/recipes/configs/qwen2/0.5B_lora.yaml
@@ -46,6 +46,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
@@ -70,6 +71,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
+compile: False
 
 # Logging
 output_dir: /tmp/Qwen2-0.5B-Instruct-lora-finetune
@@ -78,7 +80,7 @@ metric_logger:
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/qwen2/0.5B_lora_single_device.yaml b/recipes/configs/qwen2/0.5B_lora_single_device.yaml
index 602c63853a..beeb21b072 100644
--- a/recipes/configs/qwen2/0.5B_lora_single_device.yaml
+++ b/recipes/configs/qwen2/0.5B_lora_single_device.yaml
@@ -45,6 +45,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -76,7 +77,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/qwen2/1.5B_full.yaml b/recipes/configs/qwen2/1.5B_full.yaml
index cb7b5e2318..8e850bae50 100644
--- a/recipes/configs/qwen2/1.5B_full.yaml
+++ b/recipes/configs/qwen2/1.5B_full.yaml
@@ -26,6 +26,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -56,7 +57,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
-
+compile: False
 
 # Training env
 device: cuda
@@ -73,4 +74,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Qwen2-1.5B-Instruct-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/qwen2/1.5B_full_single_device.yaml b/recipes/configs/qwen2/1.5B_full_single_device.yaml
index 5da79ceb69..cc7fd5f566 100644
--- a/recipes/configs/qwen2/1.5B_full_single_device.yaml
+++ b/recipes/configs/qwen2/1.5B_full_single_device.yaml
@@ -28,6 +28,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
@@ -79,4 +80,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Qwen2-1.5B-Instruct-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/qwen2/1.5B_lora.yaml b/recipes/configs/qwen2/1.5B_lora.yaml
index a496dade08..845cb71184 100644
--- a/recipes/configs/qwen2/1.5B_lora.yaml
+++ b/recipes/configs/qwen2/1.5B_lora.yaml
@@ -44,6 +44,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -66,6 +67,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8
+compile: False
 
 # Logging
 output_dir: /tmp/Qwen2-1.5B-Instruct-lora-finetune
@@ -73,7 +75,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/qwen2/1.5B_lora_single_device.yaml b/recipes/configs/qwen2/1.5B_lora_single_device.yaml
index b41269de1a..f2e8d2beb4 100644
--- a/recipes/configs/qwen2/1.5B_lora_single_device.yaml
+++ b/recipes/configs/qwen2/1.5B_lora_single_device.yaml
@@ -44,6 +44,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -74,7 +75,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/qwen2/7B_full.yaml b/recipes/configs/qwen2/7B_full.yaml
index 7ffc07e457..06083d908f 100644
--- a/recipes/configs/qwen2/7B_full.yaml
+++ b/recipes/configs/qwen2/7B_full.yaml
@@ -26,6 +26,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -59,7 +60,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 16
-
+compile: False
 
 # Training env
 device: cuda
@@ -76,4 +77,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Qwen2-7B-Instruct-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/qwen2/7B_full_single_device.yaml b/recipes/configs/qwen2/7B_full_single_device.yaml
index 560dd5fc9f..13290d82a0 100644
--- a/recipes/configs/qwen2/7B_full_single_device.yaml
+++ b/recipes/configs/qwen2/7B_full_single_device.yaml
@@ -28,6 +28,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -78,4 +79,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Qwen2-7B-Instruct-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/qwen2/7B_lora.yaml b/recipes/configs/qwen2/7B_lora.yaml
index d3b63fd1df..6e778ecd7d 100644
--- a/recipes/configs/qwen2/7B_lora.yaml
+++ b/recipes/configs/qwen2/7B_lora.yaml
@@ -50,6 +50,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -72,6 +73,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 32
+compile: False
 
 # Logging
 output_dir: /tmp/Qwen2-7B-Instruct-lora-finetune
@@ -79,7 +81,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/qwen2/7B_lora_single_device.yaml b/recipes/configs/qwen2/7B_lora_single_device.yaml
index 6f9fb35b15..e0b19d03a3 100644
--- a/recipes/configs/qwen2/7B_lora_single_device.yaml
+++ b/recipes/configs/qwen2/7B_lora_single_device.yaml
@@ -48,6 +48,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -78,7 +79,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml
index 9cc894a7e5..f7d1b191cd 100644
--- a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml
+++ b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml
@@ -56,6 +56,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -89,7 +90,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
index 01bc457ee3..4e1e3f24c5 100644
--- a/recipes/full_finetune_distributed.py
+++ b/recipes/full_finetune_distributed.py
@@ -122,6 +122,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type != "cuda":
+            log.info(
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # _is_rank_zero is used primarily for logging. In the future, the logger
         # should directly take care of this
         _, rank = training.get_world_size_and_rank()
diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
index c9bcf23a30..fd01aabf15 100644
--- a/recipes/full_finetune_single_device.py
+++ b/recipes/full_finetune_single_device.py
@@ -117,6 +117,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type != "cuda":
+            log.info(
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # Training cfg
         self._resume_from_checkpoint = cfg.resume_from_checkpoint
         self._gradient_accumulation_steps = cfg.gradient_accumulation_steps
diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py
index a56382f0ae..4c97d6829d 100644
--- a/recipes/knowledge_distillation_single_device.py
+++ b/recipes/knowledge_distillation_single_device.py
@@ -120,6 +120,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type != "cuda":
+            log.info(
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
         self.seed = training.set_seed(seed=cfg.seed)
diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py
index 18801ea76e..1ab88deaf8 100644
--- a/recipes/lora_dpo_distributed.py
+++ b/recipes/lora_dpo_distributed.py
@@ -130,6 +130,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type != "cuda":
+            log.info(
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # training attributes
         self._enable_activation_checkpointing = cfg.enable_activation_checkpointing
diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py
index c158d17875..f34694ccc8 100644
--- a/recipes/lora_dpo_single_device.py
+++ b/recipes/lora_dpo_single_device.py
@@ -95,6 +95,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type != "cuda":
+            log.info(
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
         self.seed = training.set_seed(seed=cfg.seed)
diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
index 1624d6fcbb..86147e08ca 100644
--- a/recipes/lora_finetune_distributed.py
+++ b/recipes/lora_finetune_distributed.py
@@ -151,6 +151,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type != "cuda":
+            log.info(
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # training attributes
         self._enable_activation_checkpointing = cfg.enable_activation_checkpointing
         self._enable_activation_offloading = cfg.get(
@@ -833,6 +839,7 @@ def train(self) -> None:
                             log_dict.update(
                                 training.get_memory_stats(device=self._device)
                             )
+
                         if self._clip_grad_norm is not None:
                             log_dict.update({"grad_norm": grad_norm})
                         self._metric_logger.log_dict(
diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py
index 00c4659f12..cbde0305f0 100644
--- a/recipes/lora_finetune_single_device.py
+++ b/recipes/lora_finetune_single_device.py
@@ -141,6 +141,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type != "cuda":
+            log.info(
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
         self.seed = training.set_seed(seed=cfg.seed)
diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py
index 7679af3fd3..1030217d74 100644
--- a/recipes/ppo_full_finetune_single_device.py
+++ b/recipes/ppo_full_finetune_single_device.py
@@ -119,6 +119,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type != "cuda":
+            log.info(
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
         self.seed = training.set_seed(seed=cfg.seed)
diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py
index 6e676d0ce2..afb8e8d0e8 100644
--- a/recipes/qat_distributed.py
+++ b/recipes/qat_distributed.py
@@ -127,6 +127,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type != "cuda":
+            log.info(
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # _is_rank_zero is used primarily for logging. In the future, the logger
         # should directly take care of this
         _, rank = training.get_world_size_and_rank()
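
Note: every recipe above applies the same non-CUDA guard in its __init__. The sketch below is a minimal, self-contained illustration of that pattern, not part of the diff: the function name resolve_peak_memory_logging, the logging setup, and the SimpleNamespace stand-ins for the recipe's config and device are assumptions for the example only.

# Minimal sketch of the guard the recipes now apply: if peak-memory logging is
# requested but training is not on CUDA, downgrade the flag and log a notice.
import logging
from types import SimpleNamespace

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def resolve_peak_memory_logging(cfg, device) -> bool:
    """Return whether peak-memory stats should be logged for this run."""
    log_peak_memory_stats = getattr(cfg, "log_peak_memory_stats", False)
    if log_peak_memory_stats and device.type != "cuda":
        log.info(
            "log_peak_memory_stats was set to True, however, training does not "
            "use cuda. Setting log_peak_memory_stats=False."
        )
        log_peak_memory_stats = False
    return log_peak_memory_stats


# Example: on a CPU-only run the flag is silently downgraded to False.
cfg = SimpleNamespace(log_peak_memory_stats=True)
device = SimpleNamespace(type="cpu")
print(resolve_peak_memory_logging(cfg, device))  # -> False

With a real torch.device the comparison works the same way, since torch.device("cpu").type == "cpu", which is what the recipes rely on when they check self._device.type != "cuda".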