-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.py
112 lines (101 loc) · 3.78 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import shutil

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

from converters import perform_label_reduction
from extract_feats import preprocess_data, create_hdf5_dataset, preprocess_data_sota
from plot_histogram import plot_histogram
# config
# Pipeline configuration. `stage` selects where the pipeline starts:
#   0 = label cleanup, 1 = feature extraction, 2 = HDF5 packing.
total_len = 30  # length of audio kept per song — presumably seconds; TODO confirm unit
win_len = 10  # analysis window length (same unit as total_len)
step_len = 5  # hop between consecutive windows
delete_npy = True  # remove intermediate .npy folders after packing into .h5
stage = 1  # earliest stage to run (later stages run too: `stage <= N` checks below)
use_sota_data = False  # True: use sota-music-tagging-models splits; False: use local CSV
sample_rate = 16000  # audio resampling rate in Hz
feature_type = ["wav", "mfcc", "log_mel"]  # features to extract per song
# clean the dataset
if stage == 0:
    labels_root = "../data/magnatagatune/"
    # Plot the tag distribution before and after label reduction so the
    # effect of the cleanup is visible side by side.
    plot_histogram(labels_root, "annotations_final.csv")
    perform_label_reduction(labels_root)
    plot_histogram(labels_root, "annotations_final_new.csv")
# Input/output locations. `save_dir` is keyed by window length so runs with
# different `win_len` values do not overwrite each other.
song_dir = "../data/magnatagatune/mp3/"
csv_dir = "../data/magnatagatune/annotations_final_new.csv"
save_dir = f"preprocessed/{win_len}/"
os.makedirs(save_dir, exist_ok=True)

# Split/label files taken from the sota-music-tagging-models repository.
_sota_split = "../sota-music-tagging-models/split/mtat/"
binary_dir = _sota_split + "binary.npy"
tags_dir = _sota_split + "tags.npy"
test_dir = _sota_split + "test.npy"
train_dir = _sota_split + "train.npy"
valid_dir = _sota_split + "valid.npy"
if stage <= 1:
    # Extract features (mfcc/log_mel/wav) and labels, saving them as numpy files.
    # Both extraction paths share the same tuning knobs, so collect them once.
    common_kwargs = dict(
        n_workers=4,
        win_len=win_len,
        step_len=step_len,
        total_len=total_len,
        sample_rate=sample_rate,
        feature_type=feature_type,
    )
    if use_sota_data:
        # Use the precomputed splits/labels from sota-music-tagging-models.
        preprocess_data_sota(
            song_dir,
            binary_dir,
            tags_dir,
            test_dir,
            train_dir,
            valid_dir,
            save_dir,
            **common_kwargs,
        )
    else:
        # Use the locally cleaned annotation CSV.
        preprocess_data(song_dir, csv_dir, save_dir, **common_kwargs)
if stage <= 2:
    # Pack each per-song .npy folder into a single HDF5 file per split and
    # feature. The five hand-copied if-blocks of the original are collapsed
    # into one loop; the skip condition is unchanged: only build <feat>.h5
    # when it is missing AND the source .npy folder exists.
    feature_names = ["mfcc", "mfcc_mean", "log_mel", "wav", "label"]
    for folder in ["training", "validation", "testing"]:
        print("processing " + folder)
        for feat in feature_names:
            src = save_dir + folder + "/" + feat
            if not os.path.exists(src + ".h5") and os.path.exists(src):
                print("creating " + folder + "/" + feat + ".h5")
                create_hdf5_dataset(src, save_dir + folder, feat)
        if delete_npy:
            # Delete the intermediate .npy folders once packed.
            # shutil.rmtree replaces the original `os.system("rm -rf ...")`:
            # portable, no shell string building; ignore_errors=True keeps
            # rm -rf's tolerance of already-missing paths.
            print("deleting npy files")
            for feat in feature_names:
                shutil.rmtree(save_dir + folder + "/" + feat, ignore_errors=True)
print("done")