mksubset.py

# coding: utf-8
"""
Make a subset of a dataset.

usage: mksubset.py [options] <in_dir> <out_dir>

options:
    -h, --help                Show help message.
    --limit=<N>               Limit dataset size to N hours [default: 10000].
    --train-dev-test-split    Split into train/dev/eval sets.
    --dev-size=<N>            Dev set size; count if > 1, else ratio [default: 0.1].
    --test-size=<N>           Test set size; count if > 1, else ratio [default: 0.1].
    --target-sr=<N>           Resample audio to this sampling rate.
    --random-state=<N>        Random seed [default: 1234].
"""
from docopt import docopt
import librosa
from glob import glob
from os.path import join, basename
from tqdm import tqdm
import sys
import os
from shutil import copy2
from scipy.io import wavfile
import numpy as np
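

# I/O helpers: raw files are headerless, so their sample rate and bit depth
# have to be assumed; wav files carry their own sample rate.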
def read_wav_or_raw(src_file, is_raw):
    if is_raw:
        sr = 24000  # hard-coded for now
        x = np.fromfile(src_file, dtype=np.int16)
    else:
        sr, x = wavfile.read(src_file)
    return sr, x


def write_wav_or_raw(dst_path, sr, x, is_raw):
    if is_raw:
        x.tofile(dst_path)
    else:
        wavfile.write(dst_path, sr, x)
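

# Main flow: parse arguments, collect utterances until the hour limit is hit,
# optionally split into train/dev/eval sets, then copy (and optionally
# resample) each file while tracking global waveform statistics.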
if __name__ == "__main__":
    args = docopt(__doc__)
    in_dir = args["<in_dir>"]
    out_dir = args["<out_dir>"]
    limit = float(args["--limit"])
    train_dev_test_split = args["--train-dev-test-split"]
    dev_size = float(args["--dev-size"])
    test_size = float(args["--test-size"])
    target_sr = args["--target-sr"]
    target_sr = int(target_sr) if target_sr is not None else None
    random_state = int(args["--random-state"])

    src_files = sorted(glob(join(in_dir, "*.wav")))
    raw_files = sorted(glob(join(in_dir, "*.raw")))

    # Fall back to headerless raw files when no wav files are found
    is_raw = len(src_files) == 0 and len(raw_files) > 0
    if is_raw:
        print("Assuming 24 kHz/16-bit audio data")
        src_files = raw_files
    if len(src_files) == 0:
        raise RuntimeError("No files found in {}".format(in_dir))

    total_samples = 0
    indices = []
    signed_int16_max = 2**15

    os.makedirs(out_dir, exist_ok=True)
    if train_dev_test_split:
        os.makedirs(join(out_dir, "train_no_dev"), exist_ok=True)
        os.makedirs(join(out_dir, "dev"), exist_ok=True)
        os.makedirs(join(out_dir, "eval"), exist_ok=True)

    print("Total number of utterances: {}".format(len(src_files)))
    for idx, src_file in tqdm(enumerate(src_files)):
        sr, x = read_wav_or_raw(src_file, is_raw)
        if x.dtype == np.int16:
            x = x.astype(np.float32) / signed_int16_max
        total_samples += len(x)
        total_hours = float(total_samples) / sr / 3600.0
        indices.append(idx)
        if total_hours > limit:
            print("Total hours {:.3f} exceeded limit ({} hours).".format(
                total_hours, limit))
            break
    print("Total number of collected utterances: {}".format(len(indices)))
    if train_dev_test_split:
        from sklearn.model_selection import train_test_split as split

        if test_size > 1 and dev_size > 1:
            # Sizes given as absolute counts: take the dev and test sets
            # from the tail of the collected indices
            test_size = int(test_size)
            dev_size = int(dev_size)
            testdev_size = test_size + dev_size
            train_indices = indices[:-testdev_size]
            dev_indices = indices[-testdev_size:-test_size]
            test_indices = indices[-test_size:]
        else:
            # Sizes given as ratios: split randomly
            train_indices, dev_test_indices = split(
                indices, test_size=test_size + dev_size, random_state=random_state)
            dev_indices, test_indices = split(
                dev_test_indices, test_size=test_size / (test_size + dev_size),
                random_state=random_state)
        sets = [
            (sorted(train_indices), join(out_dir, "train_no_dev")),
            (sorted(dev_indices), join(out_dir, "dev")),
            (sorted(test_indices), join(out_dir, "eval")),
        ]
    else:
        sets = [(indices, out_dir)]
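
    # Track global waveform min/max across all sets to detect clipping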
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()

    total_samples = {}
    sr = 0
    for indices, d in sets:
        set_name = basename(d)
        total_samples[set_name] = 0
        for idx in tqdm(indices):
            src_file = src_files[idx]
            dst_path = join(d, basename(src_file))
            if target_sr is not None:
                sr, x = read_wav_or_raw(src_file, is_raw)
                is_int16 = x.dtype == np.int16
                if is_int16:
                    x = x.astype(np.float32) / signed_int16_max
                if target_sr != sr:
                    # Keyword arguments are required by librosa >= 0.10
                    x = librosa.resample(x, orig_sr=sr, target_sr=target_sr)
                    sr = target_sr
                scaler.partial_fit(x.astype(np.float64).reshape(-1, 1))
                if is_int16:
                    x = (x * signed_int16_max).astype(np.int16)
                write_wav_or_raw(dst_path, sr, x, is_raw)
                total_samples[set_name] += len(x)
            else:
                # No resampling requested: just copy the file as-is
                sr, x = read_wav_or_raw(src_file, is_raw)
                is_int16 = x.dtype == np.int16
                if is_int16:
                    x = x.astype(np.float32) / signed_int16_max
                scaler.partial_fit(x.astype(np.float64).reshape(-1, 1))
                total_samples[set_name] += len(x)
                copy2(src_file, dst_path)
print("Waveform min: {}".format(scaler.data_min_))
print("Waveform max: {}".format(scaler.data_max_))
absmax = max(np.abs(scaler.data_min_[0]), np.abs(scaler.data_max_[0]))
print("Waveform absolute max: {}".format(absmax))
if absmax > 1.0:
print("There were clipping(s) in your dataset.")
print("Global scaling factor would be around {}".format(1.0 / absmax))
if train_dev_test_split:
print("Train/dev/test split:")
for n, s in zip(["train_no_dev", "dev", "eval"], sets):
hours = total_samples[n] / sr / 3600.0
print("{}: {:.2f} hours ({} utt)".format(n, hours, len(s[0])))
sys.exit(0)