-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_phi3.sh
167 lines (160 loc) · 5.75 KB
/
train_phi3.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/bin/bash
# Elva
# Copyright (c) 2024-present NAVER Cloud Corp.
# MIT license
# Run configuration. Each value can be overridden from the environment, e.g.
#   RunName=my_run BaseLLM=/path/to/model bash train_phi3.sh
# and falls back to the default below when unset or empty.

# Suffix used for the checkpoint directories and run names of both stages.
RunName="${RunName:-elva_3.8b}"
# Language model passed to the trainer as --model_name_or_path.
BaseLLM="${BaseLLM:-microsoft/Phi-3-mini-4k-instruct}"
# Conversation template passed as --version in stage 2.
PromptFormat="${PromptFormat:-phi3}"
# Vision tower passed to the trainer as --vision_tower.
VisionEncoder="${VisionEncoder:-gwkrsrch2/elva-encoder-base-patch32}"
# Directory holding blip_laion_cc_sbu_558k.json and images/ for stage 1.
AlignmentDatasetDIR="${AlignmentDatasetDIR:-./data/pretrain/LLaVA-Pretrain}"
# Stage1: Alignment
# https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain
# "$AlignmentDatasetDIR"
# ├── blip_laion_cc_sbu_558k.json
# └── images
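# Note:
# This script is configured as a short test run: both stages pass --max_steps 50.
# For an actual training run, remove the --max_steps argument from both stages.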
# Stage 1 (alignment): launch train_xformers.py through DeepSpeed with the
# ZeRO-2 config and --tune_mm_mlp_adapter True on the LLaVA-Pretrain data.
# Output goes to ./checkpoints/pretrain_llava_$RunName; stage 2 later reads
# mm_projector.bin from that directory.
#
# Every variable expansion is quoted so that model names or dataset paths
# containing spaces or glob characters reach the trainer as single arguments.
# If the run fails, the script stops here with the trainer's exit code instead
# of continuing into stage 2 with a missing or stale projector.
deepspeed LLaVA/llava/train/train_xformers.py \
  --deepspeed LLaVA/scripts/zero2.json \
  --version plain \
  --model_name_or_path "$BaseLLM" \
  --vision_tower "$VisionEncoder" \
  --data_path "$AlignmentDatasetDIR/blip_laion_cc_sbu_558k.json" \
  --image_folder "$AlignmentDatasetDIR/images" \
  --mm_projector_type mlp2x_gelu \
  --tune_mm_mlp_adapter True \
  --mm_vision_select_layer -2 \
  --mm_use_im_start_end False \
  --mm_use_im_patch_token False \
  --image_grid_pinpoints "[[224, 224], [224, 448], [448, 224], [448, 448], [448, 672], [672, 448], [672, 672], [672, 896], [896, 672]]" \
  --mm_patch_merge_type spatial \
  --image_aspect_ratio anyres \
  --fp16 True \
  --bf16 False \
  --tf32 False \
  --num_train_epochs 1 \
  --max_steps 50 \
  --per_device_train_batch_size 8 \
  --gradient_accumulation_steps 1 \
  --evaluation_strategy "no" \
  --save_strategy "steps" \
  --save_steps 24000 \
  --save_total_limit 1 \
  --learning_rate 1e-3 \
  --adam_epsilon 1e-6 \
  --max_grad_norm 0.5 \
  --weight_decay 0. \
  --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --model_max_length 2048 \
  --gradient_checkpointing True \
  --dataloader_num_workers 1 \
  --lazy_preprocess True \
  --report_to none \
  --output_dir "./checkpoints/pretrain_llava_$RunName" \
  --run_name "pretrain_llava_$RunName" || {
    Stage1Status=$?
    echo "Stage 1 (alignment) failed with exit code $Stage1Status; aborting before stage 2." >&2
    exit "$Stage1Status"
  }
# Root directory of the stage 2 instruction-tuning data (the annotation JSON
# plus the image folders listed below). Can be overridden from the
# environment; defaults to ./data when unset or empty.
InstructDatasetDIR="${InstructDatasetDIR:-./data}"
# Stage2: Visual Instruct Tuning
# https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_v1_5_mix665k.json
# https://github.com/haotian-liu/LLaVA/tree/c121f0432da27facab705978f83c4ada465e46fd?tab=readme-ov-file#visual-instruction-tuning
# "$InstructDatasetDIR"
# ├── llava_v1_5_mix665k.json
# ├── coco
# │ └── train2017
# ├── gqa
# │ └── images
# ├── ocr_vqa
# │ └── images
# ├── textvqa
# │ └── train_images
# └── vg
#   ├── VG_100K
#   └── VG_100K_2
# Stage 2 (visual instruction tuning): launch train_xformers.py through
# DeepSpeed with the ZeRO-3 config, starting from the projector written by
# stage 1, on llava_v1_5_mix665k.json. Output goes to
# ./checkpoints/finetune_llava_$RunName.
#
# The projector path is checked up front so that a missing stage 1 result is
# reported immediately with a clear message, rather than surfacing as a
# trainer error after the distributed job has been launched.
# Every variable expansion is quoted so that names or paths containing spaces
# or glob characters reach the trainer as single arguments.
PretrainedProjector="./checkpoints/pretrain_llava_$RunName/mm_projector.bin"
if [[ ! -f "$PretrainedProjector" ]]; then
  echo "Stage 2 requires the stage 1 projector, but it was not found: $PretrainedProjector" >&2
  exit 1
fi

deepspeed LLaVA/llava/train/train_xformers.py \
  --deepspeed LLaVA/scripts/zero3.json \
  --version "$PromptFormat" \
  --model_name_or_path "$BaseLLM" \
  --vision_tower "$VisionEncoder" \
  --pretrain_mm_mlp_adapter "$PretrainedProjector" \
  --data_path "$InstructDatasetDIR/llava_v1_5_mix665k.json" \
  --image_folder "$InstructDatasetDIR" \
  --mm_projector_type mlp2x_gelu \
  --mm_vision_select_layer -2 \
  --mm_use_im_start_end False \
  --mm_use_im_patch_token False \
  --image_grid_pinpoints "[[224, 224], [224, 448], [448, 224], [448, 448], [448, 672], [672, 448], [672, 672], [672, 896], [896, 672]]" \
  --mm_patch_merge_type spatial \
  --image_aspect_ratio anyres \
  --group_by_modality_length True \
  --fp16 True \
  --bf16 False \
  --tf32 False \
  --num_train_epochs 1 \
  --max_steps 50 \
  --per_device_train_batch_size 8 \
  --gradient_accumulation_steps 1 \
  --evaluation_strategy "no" \
  --save_strategy "steps" \
  --save_steps 50000 \
  --save_total_limit 1 \
  --learning_rate 3e-4 \
  --adam_epsilon 1e-6 \
  --max_grad_norm 0.5 \
  --weight_decay 0.001 \
  --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --model_max_length 2048 \
  --gradient_checkpointing True \
  --dataloader_num_workers 1 \
  --lazy_preprocess True \
  --report_to none \
  --output_dir "./checkpoints/finetune_llava_$RunName" \
  --run_name "finetune_llava_$RunName"
# Finished!
# checkpoints/
# ├── finetune_llava_elva_3.8b
# │ ├── added_tokens.json
# │ ├── checkpoint-50
# │ │ ├── added_tokens.json
# │ │ ├── config.json
# │ │ ├── generation_config.json
# │ │ ├── global_step50
# │ │ ├── latest
# │ │ ├── model-00001-of-00002.safetensors
# │ │ ├── model-00002-of-00002.safetensors
# │ │ ├── model.safetensors.index.json
# │ │ ├── rng_state_0.pth
# │ │ ├── rng_state_1.pth
# │ │ ├── rng_state_2.pth
# │ │ ├── rng_state_3.pth
# │ │ ├── rng_state_4.pth
# │ │ ├── rng_state_5.pth
# │ │ ├── rng_state_6.pth
# │ │ ├── rng_state_7.pth
# │ │ ├── scheduler.pt
# │ │ ├── special_tokens_map.json
# │ │ ├── tokenizer.model
# │ │ ├── tokenizer_config.json
# │ │ ├── trainer_state.json
# │ │ ├── training_args.bin
# │ │ └── zero_to_fp32.py
# │ ├── config.json
# │ ├── generation_config.json
# │ ├── model-00001-of-00002.safetensors
# │ ├── model-00002-of-00002.safetensors
# │ ├── model.safetensors.index.json
# │ ├── special_tokens_map.json
# │ ├── tokenizer.model
# │ ├── tokenizer_config.json
# │ ├── trainer_state.json
# │ └── training_args.bin
# └── pretrain_llava_elva_3.8b
#   ├── checkpoint-50
#   │ ├── config.json
#   │ └── mm_projector.bin
#   ├── config.json
#   ├── mm_projector.bin
#   └── trainer_state.json