Remove fp16 & Add new schedulers (#394)
* Remove fp16 & Add new schedulers

* Rename variables

* Update dataloader.py
Eric8932 authored Oct 12, 2023
1 parent 9e1e635 commit 7079fa5
Showing 21 changed files with 158 additions and 211 deletions.

finetune/run_c3.py (8 changes: 0 additions & 8 deletions)

@@ -160,14 +160,6 @@ def main():
 
     optimizer, scheduler = build_optimizer(args, model)
 
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)

finetune/run_chid.py (8 changes: 0 additions & 8 deletions)

@@ -179,14 +179,6 @@ def main():
 
     optimizer, scheduler = build_optimizer(args, model)
 
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)

finetune/run_classifier.py (14 changes: 1 addition & 13 deletions)

@@ -185,11 +185,7 @@ def train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_bat
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)
 
-    if args.fp16:
-        with args.amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()
 
     if args.use_adv and args.adv_type == "fgm":
         args.adv_method.attack(epsilon=args.fgm_epsilon)

@@ -310,14 +306,6 @@ def main():
     args.logger.info("The number of training instances: {}".format(instances_num))
     optimizer, scheduler = build_optimizer(args, model)
 
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)

finetune/run_classifier_cv.py (8 changes: 1 addition & 7 deletions)

@@ -95,13 +95,7 @@ def main():
         model = model.to(args.device)
         load_or_initialize_parameters(args, model)
         optimizer, scheduler = build_optimizer(args, model)
-        if args.fp16:
-            try:
-                from apex import amp
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-            model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-            args.amp = amp
+
        if torch.cuda.device_count() > 1:
             model = torch.nn.DataParallel(model)
         args.model = model

finetune/run_classifier_grid.py (8 changes: 1 addition & 7 deletions)

@@ -74,13 +74,7 @@ def main():
         model = model.to(args.device)
         load_or_initialize_parameters(args, model)
         optimizer, scheduler = build_optimizer(args, model)
-        if args.fp16:
-            try:
-                from apex import amp
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-            model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-            args.amp = amp
+
        if torch.cuda.device_count() > 1:
             model = torch.nn.DataParallel(model)
         args.model = model

finetune/run_classifier_mt.py (8 changes: 0 additions & 8 deletions)

@@ -158,14 +158,6 @@ def main():
 
     optimizer, scheduler = build_optimizer(args, model)
 
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)

finetune/run_classifier_multi_label.py (17 changes: 2 additions & 15 deletions)

@@ -126,11 +126,7 @@ def train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_bat
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)
 
-    if args.fp16:
-        with args.amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()
 
     if args.use_adv and args.adv_type == "fgm":
         args.adv_method.attack(epsilon=args.fgm_epsilon)

@@ -233,14 +229,6 @@ def main():
     args.logger.info("Batch size: {}".format(batch_size))
     args.logger.info("The number of training instances: {}".format(instances_num))
     optimizer, scheduler = build_optimizer(args, model)
-
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
 
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))

@@ -253,8 +241,7 @@ def main():
     total_loss, result, best_result = 0.0, 0.0, 0.0
 
     args.logger.info("Start training.")
-
-
+
     for epoch in range(1, args.epochs_num + 1):
         random.shuffle(trainset)
         src = torch.LongTensor([example[0] for example in trainset])

finetune/run_classifier_prompt.py (15 changes: 2 additions & 13 deletions)

@@ -119,6 +119,7 @@ def read_dataset(args, path):
     args.logger.info(f"read dataset, count:{count}, ignore_count:{ignore_count}")
     return dataset
 
+
 def train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch):
     model.zero_grad()
 
@@ -130,11 +131,7 @@ def train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_bat
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)
 
-    if args.fp16:
-        with args.amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()
 
     optimizer.step()
     scheduler.step()

@@ -256,14 +253,6 @@ def main():
     args.logger.info("The number of training instances: {}".format(instances_num))
     optimizer, scheduler = build_optimizer(args, model)
 
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)

finetune/run_classifier_siamese.py (14 changes: 1 addition & 13 deletions)

@@ -181,11 +181,7 @@ def train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_bat
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)
 
-    if args.fp16:
-        with args.amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()
 
     optimizer.step()
     scheduler.step()

@@ -288,14 +284,6 @@ def main():
 
     optimizer, scheduler = build_optimizer(args, model)
 
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)

finetune/run_cmrc.py (13 changes: 1 addition & 12 deletions)

@@ -159,11 +159,7 @@ def train(args, model, optimizer, scheduler, src_batch, seg_batch, start_positio
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)
 
-    if args.fp16:
-        with amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()
 
     optimizer.step()
     scheduler.step()

@@ -394,13 +390,6 @@ def main():
 
     optimizer, scheduler = build_optimizer(args, model)
 
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer,opt_level=args.fp16_opt_level)
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)

finetune/run_dbqa.py (8 changes: 0 additions & 8 deletions)

@@ -179,14 +179,6 @@ def main():
 
     optimizer, scheduler = build_optimizer(args, model)
 
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer,opt_level = args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)

finetune/run_ner.py (13 changes: 1 addition & 12 deletions)

@@ -145,11 +145,7 @@ def train(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch):
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)
 
-    if args.fp16:
-        with amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()
 
     optimizer.step()
     scheduler.step()

@@ -288,13 +284,6 @@ def main():
 
     optimizer, scheduler = build_optimizer(args, model)
 
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level = args.fp16_opt_level)
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)

finetune/run_regression.py (8 changes: 0 additions & 8 deletions)

@@ -147,14 +147,6 @@ def main():
     args.logger.info("The number of training instances: {}".format(instances_num))
     optimizer, scheduler = build_optimizer(args, model)
 
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)

finetune/run_simcse.py (14 changes: 1 addition & 13 deletions)

@@ -202,14 +202,6 @@ def main():
 
     optimizer, scheduler = build_optimizer(args, model)
 
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)

@@ -245,11 +237,7 @@ def main():
             tgt_batch = torch.arange(similarity_matrix.size(0), device=similarity_matrix.device, dtype=torch.long)
             loss = nn.CrossEntropyLoss()(similarity_matrix, tgt_batch)
 
-            if args.fp16:
-                with args.amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
+            loss.backward()
 
             optimizer.step()
             scheduler.step()

finetune/run_text2text.py (14 changes: 1 addition & 13 deletions)

@@ -141,11 +141,7 @@ def train_model(args, model, optimizer, scheduler, src_batch, tgt_in_batch, tgt_
     if torch.cuda.device_count() > 1:
         loss = torch.mean(loss)
 
-    if args.fp16:
-        with args.amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-    else:
-        loss.backward()
+    loss.backward()
 
     optimizer.step()
     scheduler.step()

@@ -262,14 +258,6 @@ def main():
 
     optimizer, scheduler = build_optimizer(args, model)
 
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-        args.amp = amp
-
     if torch.cuda.device_count() > 1:
         args.logger.info("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
         model = torch.nn.DataParallel(model)
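
Note on the removed fp16 paths above: the commit deletes the apex-based branches outright and does not add a native replacement. If mixed precision is still wanted with these fine-tuning scripts, PyTorch's built-in AMP could be substituted roughly as sketched below. This is an illustration only, not part of this commit, and it assumes (as in the train_model functions above) that the model call returns the loss first.

# Illustrative sketch only (not part of this commit): native PyTorch AMP in place of
# the removed apex path. GradScaler replaces apex's amp.scale_loss context manager,
# and no amp.initialize call is needed.
import torch

scaler = torch.cuda.amp.GradScaler()

def train_model_amp(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch):
    model.zero_grad()
    with torch.cuda.amp.autocast():
        # Assumption: the model's forward returns the loss first, as in the scripts above.
        loss, _ = model(src_batch, tgt_batch, seg_batch)
    if torch.cuda.device_count() > 1:
        loss = torch.mean(loss)
    scaler.scale(loss).backward()   # scaled backward pass
    scaler.step(optimizer)          # unscales gradients, then calls optimizer.step()
    scaler.update()
    scheduler.step()
    return loss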

pretrain.py (6 changes: 3 additions & 3 deletions)

@@ -91,11 +91,11 @@ def main():
     elif args.world_size == 1 and ranks_num == 1:
         # Single GPU mode.
         assert torch.cuda.is_available(), "No available GPUs."
-        args.gpu_id = args.gpu_ranks[0]
-        assert args.gpu_id < torch.cuda.device_count(), "Invalid specified GPU device."
+        args.local_rank = args.gpu_ranks[0]
+        assert args.local_rank < torch.cuda.device_count(), "Invalid specified GPU device."
         args.dist_train = False
         args.single_gpu = True
-        print("Using GPU %d for training." % args.gpu_id)
+        print("Using GPU %d for training." % args.local_rank)
     else:
         # CPU mode.
         assert ranks_num == 0, "GPUs are specified, please check the arguments."
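
The gpu_id to local_rank rename in pretrain.py matches the name conventionally used for the per-process device index in distributed PyTorch. As a small illustration (a hypothetical helper reflecting typical usage, not code from this repository), such a value is usually consumed like this:

# Hypothetical helper: conventional use of a local_rank value taken from gpu_ranks.
import torch

def setup_device(local_rank):
    torch.cuda.set_device(local_rank)          # bind this process to one GPU
    return torch.device("cuda", local_rank)

# e.g. device = setup_device(args.local_rank); model = model.to(device)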

uer/opts.py (9 changes: 3 additions & 6 deletions)

@@ -59,16 +59,13 @@ def optimization_opts(parser):
                         help="Learning rate.")
     parser.add_argument("--warmup", type=float, default=0.1,
                         help="Warm up value.")
-    parser.add_argument("--fp16", action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit.")
-    parser.add_argument("--fp16_opt_level", choices=["O0", "O1", "O2", "O3" ], default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
     parser.add_argument("--lr_decay", type=float, default=0.5,
                         help="Learning rate decay value.")
     parser.add_argument("--optimizer", choices=["adamw", "adafactor"],
                         default="adamw",
                         help="Optimizer type.")
     parser.add_argument("--scheduler", choices=["linear", "cosine", "cosine_with_restarts", "polynomial",
-                                                "constant", "constant_with_warmup"],
+                                                "constant", "constant_with_warmup", "inverse_sqrt", "tri_stage"],
                         default="linear", help="Scheduler type.")
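
The --scheduler option gains two choices, inverse_sqrt and tri_stage. Their implementations are not visible in the hunks shown here, so the sketch below only illustrates the conventional shape of each schedule (linear warmup followed by inverse square-root decay; warmup, hold, then exponential decay to a floor), built on torch.optim.lr_scheduler.LambdaLR. The function names and parameters are assumptions, not this repository's API.

# Assumed, conventional definitions of the two newly listed schedules; not the commit's code.
import math
import torch
from torch.optim.lr_scheduler import LambdaLR

def inverse_sqrt_schedule(optimizer, num_warmup_steps):
    def lr_lambda(step):
        if step < num_warmup_steps:
            return float(step) / float(max(1, num_warmup_steps))                    # linear warmup
        return (float(max(1, num_warmup_steps)) / float(max(1, step))) ** 0.5       # 1/sqrt(step) decay
    return LambdaLR(optimizer, lr_lambda)

def tri_stage_schedule(optimizer, warmup_steps, hold_steps, decay_steps, final_lr_scale=0.05):
    def lr_lambda(step):
        if step < warmup_steps:                                                     # stage 1: warm up to peak
            return float(step) / float(max(1, warmup_steps))
        if step < warmup_steps + hold_steps:                                        # stage 2: hold at peak
            return 1.0
        decay_step = min(step - warmup_steps - hold_steps, decay_steps)             # stage 3: decay to floor
        return max(final_lr_scale,
                   math.exp(math.log(final_lr_scale) * decay_step / max(1, decay_steps)))
    return LambdaLR(optimizer, lr_lambda)

In practice such schedules are usually selected by the --scheduler flag inside the optimizer-building helper, with warmup steps presumably derived from --warmup and the total number of training steps.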

(The remaining changed files in this commit are not shown here.)
