-
Notifications
You must be signed in to change notification settings - Fork 2
/
SplitDatasets.py
147 lines (102 loc) · 4.63 KB
/
SplitDatasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import json
import shutil
from math import floor
## Main function
def _readNokClassIds(notesPath):
    """Return the set of class IDs that mark an image as NOK.

    Every category in ``notes.json`` whose name does not contain ``_Clean``
    counts as NOK; this includes the ``Parasite`` category.
    """
    # JSON is UTF-8 by specification; the context manager closes the handle.
    with open(notesPath, encoding='utf-8') as notesFile:
        notes = json.load(notesFile)
    return {
        category['id']
        for category in notes['categories']
        if '_Clean' not in category['name']
    }


def _splitImagesByLabel(lbPath, nokClassIds):
    """Return ``(imgsOK, imgsNOK)`` image file names derived from the labels.

    An image is NOK when at least one of its annotations uses a class ID from
    ``nokClassIds``; an image with an empty label file is OK.
    """
    imgsOK = []
    imgsNOK = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # resulting train/valid/test assignment reproducible between runs.
    for labelFile in sorted(os.listdir(lbPath)):
        stem, extension = os.path.splitext(labelFile)
        # Skip anything that is not a label file (e.g. '.gitkeep').
        if extension.lower() != '.txt':
            continue
        with open(os.path.join(lbPath, labelFile), 'r', encoding='utf-8') as labelHandle:
            # The first token on every non-blank line is the class ID.
            classIds = {int(line.split()[0]) for line in labelHandle if line.strip()}
        # Swap only the extension; a plain str.replace('txt', 'png') would
        # also rewrite 'txt' occurring inside the file stem.
        imgFile = stem + '.png'
        if classIds & nokClassIds:
            imgsNOK.append(imgFile)
        else:
            imgsOK.append(imgFile)
    return imgsOK, imgsNOK


def _computeEvalCounts(cntOK, cntNOK, tstRatioNOK):
    """Return ``(testCount, validCount)``: samples per class in each split.

    The test and validation sets are balanced (same number of OK and NOK
    images). Their combined size is limited both by the number of NOK images
    and by 30 % (20 % + 10 %) of the OK images.
    """
    valRatioNOK = 1 - tstRatioNOK
    cntOK_maxTstVal = cntOK * 0.2 + cntOK * 0.1
    cntOK_desTstVal = cntNOK * tstRatioNOK + cntNOK * valRatioNOK
    budget = min(cntOK_maxTstVal, cntOK_desTstVal)
    return int(floor(budget * tstRatioNOK)), int(floor(budget * valRatioNOK))


def _copyImages(imPath, imgFiles, dstDir):
    """Copy every file name in ``imgFiles`` from ``imPath`` into ``dstDir``."""
    for imgFile in imgFiles:
        shutil.copy(os.path.join(imPath, imgFile), dstDir)


def main(datasetNames=None, tstRatioNOK=0.6):
    """Build the one-class-classification (OCC) split for each dataset.

    For every dataset ``<name>`` the function reads
    ``./<name>/<name>_YOLO/{images,labels,notes.json}`` and writes
    ``./<name>/<name>_OCC/{train,valid,test}/{ok,nok}``. Test and validation
    folders receive the same number of OK and NOK images; all remaining OK
    images go to ``train/ok`` (``train/nok`` is created but left empty).
    A dataset whose ``_OCC`` folder already exists is skipped.

    Args:
        datasetNames: Iterable of dataset folder names, resolved relative to
            the current working directory. Defaults to the built-in list.
        tstRatioNOK: Fraction of the evaluation samples assigned to the test
            set; the rest goes to validation. Must be within [0, 1].

    Raises:
        ValueError: If ``tstRatioNOK`` is outside [0, 1].
        FileNotFoundError: If ``notes.json``, the labels folder or a
            referenced image is missing.
    """
    # TODO: argparser
    if datasetNames is None:
        datasetNames = [
            'Aphanizomenon_flosaquae',
            'Centrales_sp',
            'Dolichospermum_an',
            'Chaetoceros_sp',
            'Nodularia_spumigena',
            'Pauliella_taeniata',
            'Peridiniella_catenata_chain',
            'Peridiniella_catenata_single',
            'Skeletonema_marinoi',
        ]
    if not 0 <= tstRatioNOK <= 1:
        raise ValueError('tstRatioNOK must be within [0, 1]')

    for datasetName in datasetNames:
        ## Set the dataset paths
        basePath = os.path.join('.', datasetName)
        yoloPath = os.path.join(basePath, datasetName + '_YOLO')
        imPath = os.path.join(yoloPath, 'images')
        lbPath = os.path.join(yoloPath, 'labels')
        occPath = os.path.join(basePath, datasetName + '_OCC')

        if os.path.exists(occPath):
            print("Folder structure with the OCC dataset already exists...")
            continue

        # Read all inputs before touching the output tree, so that a missing
        # input does not leave an empty OCC folder that later runs would skip.
        nokClassIds = _readNokClassIds(os.path.join(yoloPath, 'notes.json'))
        imgsOK, imgsNOK = _splitImagesByLabel(lbPath, nokClassIds)

        # Get total counts of the OK and NOK images
        cntOK = len(imgsOK)
        cntNOK = len(imgsNOK)

        # Print out the dataset statistics
        print('Dataset name: ', datasetName)
        print('OK samples count: ', cntOK)
        print('NOK samples count: ', cntNOK)
        print('')

        cntOK_setTst, cntOK_setVal = _computeEvalCounts(cntOK, cntNOK, tstRatioNOK)
        cntEval = cntOK_setTst + cntOK_setVal

        try:
            # Create the folder structure
            for split in ('train', 'valid', 'test'):
                for label in ('ok', 'nok'):
                    os.makedirs(os.path.join(occPath, split, label))

            # Test data
            _copyImages(imPath, imgsOK[:cntOK_setTst], os.path.join(occPath, 'test', 'ok'))
            _copyImages(imPath, imgsNOK[:cntOK_setTst], os.path.join(occPath, 'test', 'nok'))
            # Valid data
            _copyImages(imPath, imgsOK[cntOK_setTst:cntEval], os.path.join(occPath, 'valid', 'ok'))
            _copyImages(imPath, imgsNOK[cntOK_setTst:cntEval], os.path.join(occPath, 'valid', 'nok'))
            # Train data (OK images only)
            _copyImages(imPath, imgsOK[cntEval:], os.path.join(occPath, 'train', 'ok'))
        except BaseException:
            # The existence of occPath is the only "already done" marker, so
            # a partially written tree must not survive a failed run.
            shutil.rmtree(occPath, ignore_errors=True)
            raise

        # Save everything to zip folder
        #shutil.make_archive(datasetName, 'zip', basePath)
# Run the dataset split only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()