-
Notifications
You must be signed in to change notification settings - Fork 0
/
quantize.py
122 lines (107 loc) · 3.55 KB
/
quantize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.gptq import GPTQQuantizer
import json
import torch
from loguru import logger
import random
import time
def main(args):
    """Run GPTQ quantization on a causal language model and save the result.

    Loads calibration texts from the JSON file(s) named by ``args.data``,
    shuffles them using ``args.seed``, keeps at most ``args.max_samples`` of
    them, quantizes the model with ``GPTQQuantizer`` and writes both the
    quantized model and its tokenizer to ``args.output_folder``.

    Args:
        args: Parsed command-line namespace. The attributes read are
            ``seed``, ``data`` (comma-separated JSON paths; each file holds a
            list of objects with a ``"text"`` key), ``max_samples``,
            ``model_name``, ``trust_remote_code``, ``torch_dtype``, ``bits``,
            ``block_name_to_quantize``, ``group_size`` and ``output_folder``.

    Raises:
        ValueError: If no calibration examples are left to quantize with.
    """
    random.seed(args.seed)

    # --data may list several files separated by commas; tolerate stray
    # whitespace and empty segments such as a trailing comma.
    paths = [p.strip() for p in args.data.split(",") if p.strip()]
    examples = []
    for path in paths:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(path, "r", encoding="utf-8") as f:
            examples.extend(record["text"] for record in json.load(f))

    logger.info(f"loaded {len(examples)} examples")
    logger.info(f"shuffling {len(examples)} examples")
    random.shuffle(examples)
    logger.info(f"will use {args.max_samples} examples")
    examples = examples[: args.max_samples]

    # Fail before the expensive model load rather than deep inside it.
    if not examples:
        raise ValueError(
            f"no calibration examples to use (data={args.data!r}, "
            f"max_samples={args.max_samples})"
        )

    # Preview up to 10 examples; slicing avoids an IndexError when fewer
    # than 10 are available.
    for example in examples[:10]:
        print(example)
        print("=======================")

    print("Loading model")
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name, trust_remote_code=args.trust_remote_code
    )

    # Historical behaviour: anything other than "float16" selects bfloat16.
    # The fallback is kept for compatibility but is no longer silent.
    if args.torch_dtype == "float16":
        dtype = torch.float16
    else:
        if args.torch_dtype != "bfloat16":
            logger.warning(
                f"unrecognized torch_dtype {args.torch_dtype!r}; "
                "falling back to bfloat16"
            )
        dtype = torch.bfloat16

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        torch_dtype=dtype,
        trust_remote_code=args.trust_remote_code,
    )

    print("Quantizing model")
    # Print the device and dtype of every 20th named parameter as a sanity
    # check on where the weights were loaded.
    for i, (name, param) in enumerate(model.named_parameters()):
        if i % 20 == 0:
            print(name, param.device, param.dtype)

    quantizer = GPTQQuantizer(
        bits=args.bits,
        dataset=examples,
        block_name_to_quantize=args.block_name_to_quantize,
        model_seqlen=2048,
        group_size=args.group_size,
        damp_percent=0.1,
        desc_act=False,
        sym=True,
        use_cuda_fp16=True,
        batch_size=1,
    )

    logger.info("Quantizing model...it will take about 6 hours")
    # A monotonic clock is unaffected by wall-clock adjustments that may
    # happen during a multi-hour run.
    start = time.monotonic()
    quantized_model = quantizer.quantize_model(model, tokenizer)
    elapsed_hours = (time.monotonic() - start) / 3600
    logger.info(f"Quantizing model...done in {elapsed_hours} hours")

    logger.info("Saving model")
    quantized_model.save_pretrained(args.output_folder)
    # Save the tokenizer next to the model so the output folder is
    # self-contained.
    tokenizer.save_pretrained(args.output_folder)
    logger.info("done")
def build_arg_parser():
    """Build the command-line parser for the quantization script.

    ``--model_name`` and ``--data`` are required because the script cannot
    proceed without them; every other option keeps its existing default.

    Returns:
        argparse.ArgumentParser: The configured parser.
    """
    parser = argparse.ArgumentParser(description="Quantize a GPT model")
    parser.add_argument(
        "--model_name",
        type=str,
        required=True,
        help="Path to the pre-trained model directory",
    )
    parser.add_argument(
        "--data",
        type=str,
        required=True,
        help="Comma-separated path(s) to the JSON data file(s)",
    )
    parser.add_argument(
        "--bits",
        type=int,
        default=4,
        help="Number of bits for quantization (default: 4)",
    )
    parser.add_argument(
        "--output_folder",
        type=str,
        default="llama2-70B-gptq-4bit",
        help="Path to save the quantized model (default: llama2-70B-gptq-4bit)",
    )
    parser.add_argument(
        "--max_samples",
        type=int,
        default=2048,
        help="Maximum number of samples (default: 2048)",
    )
    parser.add_argument(
        "--trust_remote_code",
        action="store_true",
        help="Trust remote code (default: False)",
    )
    # NOTE(review): a group size of 1 is an unusual default for GPTQ;
    # confirm it is intended before relying on it.
    parser.add_argument("--group_size", type=int, default=1, help="Group size")
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed (default: 42)"
    )
    # NOTE(review): the original author noted "model.layers" as an
    # alternative value; confirm which one matches the model being quantized.
    parser.add_argument(
        "--block_name_to_quantize",
        type=str,
        default="transformer.h",
        help="Block name to quantize (default: transformer.h)",
    )
    parser.add_argument(
        "--torch_dtype",
        type=str,
        default="float16",
        help=(
            "Torch dtype: float16 or bfloat16; any other value is treated "
            "as bfloat16 (default: float16)"
        ),
    )
    return parser


if __name__ == "__main__":
    main(build_arg_parser().parse_args())