# trainer_base.py
import numpy as np
import time
import sys
import os
import json

from utils import *

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter


# Reference: https://github.com/nianticlabs/monodepth2
class Trainer:
    """Base trainer: subclasses implement set_init, set_dataset, process_batch and log
    """
    def __init__(self, options):
        self.opt = options
        # Derive the run name from the calling script's file name, dropping its
        # first 8 characters (e.g. a "trainer_" prefix)
        self.opt.name = sys.argv[0].split('/')[-1].split('.')[0][8:]
        self.log_path = os.path.join("./runs", self.opt.name)

        GPU_NUM = self.opt.gpu
        self.device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
        if torch.cuda.is_available():
            torch.cuda.set_device(self.device)  # change allocation of current GPU

        self.epoch = 0
        self.step = 0

        self.set_init()
        print("GPU: {}\n".format(self.device))
        self.set_dataset()

        if not self.opt.is_eval:
            self.writers = {}
            for mode in ["train", "val"]:
                self.writers[mode] = SummaryWriter(os.path.join(self.log_path, mode))

        # print("There are {:d} training items and {:d} validation items\n".format(
        #     len(train_dataset), len(val_dataset)))

        if not self.opt.is_eval:
            self.save_opts()

    def set_init(self):
        """Build the models, optimizer and LR scheduler (expected from subclasses)"""
        raise NotImplementedError

    def process_batch(self, inputs, is_val=False):
        """Run the network on a minibatch and return (outputs, losses) with a "loss" entry"""
        raise NotImplementedError

    def log(self, mode, inputs, outputs, losses):
        """Write TensorBoard logs for the given mode ("train" or "val")"""
        raise NotImplementedError

    def set_dataset(self):
        """Build the data loaders (expected from subclasses)"""
        raise NotImplementedError

    def set_train(self):
        """Convert all models to training mode
        """
        for m in self.models.values():
            m.train()

    def set_eval(self):
        """Convert all models to testing/evaluation mode
        """
        for m in self.models.values():
            m.eval()

    def train(self):
        """Run the entire training pipeline
        """
        # self.epoch = 0
        # self.step = 0
        self.start_time = time.time()
        for self.epoch in range(self.opt.num_epochs):
            self.run_epoch()
            if (self.epoch + 1) % self.opt.save_frequency == 0:
                # save_model is expected to be provided by the subclass
                self.save_model()

    def run_epoch(self):
        """Run a single epoch of training and validation
        """
        # NOTE: recent PyTorch versions expect scheduler.step() to be called
        # after optimizer.step(); here the scheduler is stepped once per epoch
        self.model_lr_scheduler.step()

        print("Training")
        self.set_train()

        for batch_idx, inputs in enumerate(self.train_loader):
            before_op_time = time.time()

            outputs, losses = self.process_batch(inputs)

            self.model_optimizer.zero_grad()
            losses["loss"].backward()
            self.model_optimizer.step()

            duration = time.time() - before_op_time

            # log less frequently after the first 2000 steps to save time & disk space
            early_phase = batch_idx % self.opt.log_frequency == 0 and self.step < 2000
            late_phase = self.step % 2000 == 0

            if early_phase or late_phase:
                self.log_time(batch_idx, duration, losses["loss"].cpu().data)

                # if "depth_gt" in inputs:
                #     self.compute_depth_losses(inputs, outputs, losses)

                self.log("train", inputs, outputs, losses)
                self.val()

            self.step += 1

    def val(self):
        """Validate the model on a single minibatch
        """
        self.set_eval()
        try:
            inputs = next(self.val_iter)
        except StopIteration:
            # restart the validation iterator once it is exhausted
            self.val_iter = iter(self.val_loader)
            inputs = next(self.val_iter)

        with torch.no_grad():
            outputs, losses = self.process_batch(inputs, is_val=True)

            # if "depth_gt" in inputs:
            #     self.compute_depth_losses(inputs, outputs, losses)

            self.log("val", inputs, outputs, losses)
            del inputs, outputs, losses

        self.set_train()

    def log_time(self, batch_idx, duration, loss):
        """Print a logging statement to the terminal
        """
        samples_per_sec = self.opt.batch_size / duration
        time_sofar = time.time() - self.start_time
        training_time_left = (
            self.num_total_steps / self.step - 1.0) * time_sofar if self.step > 0 else 0
        print_string = "epoch {:>3} | batch {:>6} | examples/s: {:5.1f}" + \
            " | loss: {:.5f} | time elapsed: {} | time left: {}"
        print(print_string.format(self.epoch, batch_idx, samples_per_sec, loss,
                                  sec_to_hm_str(time_sofar), sec_to_hm_str(training_time_left)))

    def update_time(self):
        """Return (time elapsed, estimated time left) as formatted strings
        """
        time_sofar = time.time() - self.start_time
        training_time_left = (
            self.num_total_steps / self.step - 1.0) * time_sofar if self.step > 0 else 0
        return sec_to_hm_str(time_sofar), sec_to_hm_str(training_time_left)

    def save_opts(self):
        """Save options to disk so we know what we ran this experiment with
        """
        models_dir = os.path.join(self.log_path, "models")
        if not os.path.exists(models_dir):
            os.makedirs(models_dir)
        to_save = self.opt.__dict__.copy()

        with open(os.path.join(models_dir, 'opt.json'), 'w') as f:
            json.dump(to_save, f, indent=2)


# from options import Options
# options = Options()
# opts = options.parse()
#
# if __name__ == "__main__":
#     trainer = Trainer(opts)
#     trainer.train()
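

# --------------------------------------------------------------------------- #
# Illustrative sketch, not part of the original file: a minimal subclass
# showing how the hooks above are expected to fit together. Everything below
# (ExampleTrainer, the toy linear model and random data, and the option
# fields learning_rate and scheduler_step_size) is an assumption made for
# demonstration only; real subclasses would build their own models, datasets
# and losses.
# --------------------------------------------------------------------------- #
class ExampleTrainer(Trainer):
    def set_init(self):
        # The base class uses self.models, self.model_optimizer and
        # self.model_lr_scheduler, so they are created here.
        self.models = {"net": torch.nn.Linear(10, 1).to(self.device)}
        params = [p for m in self.models.values() for p in m.parameters()]
        self.model_optimizer = optim.Adam(params, lr=self.opt.learning_rate)
        self.model_lr_scheduler = optim.lr_scheduler.StepLR(
            self.model_optimizer, step_size=self.opt.scheduler_step_size)

    def set_dataset(self):
        # The base class also relies on self.train_loader, self.val_loader,
        # self.val_iter and self.num_total_steps; a toy regression set stands
        # in for real data.
        xs, ys = torch.randn(64, 10), torch.randn(64, 1)
        dataset = torch.utils.data.TensorDataset(xs, ys)
        self.train_loader = DataLoader(dataset, self.opt.batch_size, shuffle=True, drop_last=True)
        self.val_loader = DataLoader(dataset, self.opt.batch_size, shuffle=True, drop_last=True)
        self.val_iter = iter(self.val_loader)
        self.num_total_steps = len(self.train_loader) * self.opt.num_epochs

    def process_batch(self, inputs, is_val=False):
        # Move the batch to the device, run the model and build the losses
        # dict; run_epoch() backpropagates through losses["loss"].
        x, y = [t.to(self.device) for t in inputs]
        pred = self.models["net"](x)
        losses = {"loss": F.mse_loss(pred, y)}
        return {"pred": pred}, losses

    def log(self, mode, inputs, outputs, losses):
        # Write each loss to the TensorBoard writer for the current mode.
        for name, value in losses.items():
            self.writers[mode].add_scalar(name, value, self.step)

    def save_model(self):
        # train() calls save_model() every opt.save_frequency epochs.
        save_dir = os.path.join(self.log_path, "models", "weights_{}".format(self.epoch))
        os.makedirs(save_dir, exist_ok=True)
        for name, model in self.models.items():
            torch.save(model.state_dict(), os.path.join(save_dir, "{}.pth".format(name)))


# A subclass such as this would then be driven exactly like the commented-out
# example above, e.g. trainer = ExampleTrainer(opts); trainer.train().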