-
Notifications
You must be signed in to change notification settings - Fork 1
/
build_dataset.py
119 lines (94 loc) · 4.45 KB
/
build_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import config
from pathlib import Path
import numpy as np
import shutil
import argparse
import pathlib
import os
def copy_images(image_paths, folder, dataset_path=None):
    """Copy images into ``<dataset_path>/<folder>/<label>/``.

    The label of each image is the name of the directory that directly
    contains it, so ``images/cat/a.jpg`` is copied to
    ``<dataset_path>/<folder>/cat/a.jpg``.

    Args:
        image_paths: Iterable of image file paths (``str`` or path-like).
        folder: Name of the split sub-folder to create below the dataset
            root.
        dataset_path: Root folder of the dataset. When omitted, the
            module-level ``args['dataset_path']`` set by the script entry
            point is used, which keeps existing two-argument calls working.
    """
    if dataset_path is None:
        # Backward-compatible fallback: the script entry point stores the
        # parsed command line in the module-level ``args`` dict.
        dataset_path = args['dataset_path']

    dataset_folder = os.path.join(dataset_path, folder)
    # Created even when ``image_paths`` is empty, so every split folder exists.
    os.makedirs(dataset_folder, exist_ok=True)

    for path in image_paths:
        path = os.fspath(path)
        image_name = os.path.basename(path)
        # The parent directory name is the class label.
        label = os.path.basename(os.path.dirname(path))
        label_folder = os.path.join(dataset_folder, label)
        os.makedirs(label_folder, exist_ok=True)
        shutil.copy(path, os.path.join(label_folder, image_name))
def tree(dir_path, prefix=''):
    """Yield the lines of a directory-only tree below ``dir_path``.

    Only sub-directories are listed. Each line shows the directory name
    and, when the directory directly contains at least one file, the file
    count as ``(N images)``. Entries are sorted so the output is
    deterministic.

    Args:
        dir_path: ``pathlib.Path`` of the directory to walk.
        prefix: Text prepended to every yielded line; used by the recursion
            to draw the guides of the enclosing levels.

    Yields:
        One formatted ``str`` per sub-directory, depth first.
    """
    # Every prefix piece is four characters wide so that nested levels line
    # up under the four-character pointers.
    space, branch, tee, last = '    ', '│   ', '├── ', '└── '
    contents = sorted(p for p in dir_path.iterdir() if p.is_dir())
    # All entries get a tee except the final one, which closes the branch.
    pointers = [tee] * (len(contents) - 1) + [last]
    for pointer, path in zip(pointers, contents):
        count = sum(1 for p in path.iterdir() if p.is_file())
        yield prefix + pointer + path.name + (f' ({count} images)' if count > 0 else '')
        # Keep drawing the vertical guide only while siblings follow.
        extension = branch if pointer == tee else space
        yield from tree(path, prefix=prefix + extension)
def list_images(image_path):
    """Group the image files found one level below ``image_path`` by label.

    The label of an image is the name of the directory that directly
    contains it. File extensions are matched case-insensitively, so
    ``photo.JPG`` is recognised as well as ``photo.jpg``.

    Args:
        image_path: Directory whose immediate sub-directories hold the
            images of one class each.

    Returns:
        ``dict`` mapping each label to a list of image paths (``str``),
        sorted by path. Labels without any image are absent.
    """
    image_types = {'.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff'}
    label_paths = {}
    # Sorting makes the result independent of the file-system listing order.
    for p in sorted(Path(image_path).glob('*/*.*')):
        # Skip directories whose name merely looks like an image file.
        if p.suffix.lower() in image_types and p.is_file():
            label_paths.setdefault(p.parent.name, []).append(str(p))
    return label_paths
def build_dataset(args):
    """Split the labelled images into train/val/test folders and print the result.

    Args:
        args: ``dict`` with the keys ``show_tree``, ``dataset_path``,
            ``images_path``, ``shuffle``, ``test_split`` and ``val_split``.
            When ``show_tree`` is true, only the existing dataset tree is
            printed and nothing is copied.

    Raises:
        ValueError: If a split fraction is negative or the test and
            validation fractions add up to more than 1.
    """
    dataset_path = Path(args['dataset_path'])

    if args['show_tree']:
        if dataset_path.exists():
            print(dataset_path.name)
            for line in tree(dataset_path):
                print(line)
        else:
            print(f"dataset folder '{str(dataset_path)}' does not exist.")
        return

    test_split = args['test_split']
    val_split = args['val_split']
    # A sum above 1 would make the train length negative and scramble the slices.
    if test_split < 0 or val_split < 0 or test_split + val_split > 1:
        raise ValueError(
            'test_split and val_split must be non-negative and sum to at most 1, '
            f'got test_split={test_split}, val_split={val_split}'
        )

    label_paths = list_images(args['images_path'])
    if not label_paths:
        print('No labels found.')
        return

    for label, image_paths in label_paths.items():
        print(f'[INFO] {len(image_paths)} images with label: {label}')
        if args['shuffle']:
            print('[INFO] shuffling image paths ...')
            np.random.shuffle(image_paths)

        test_paths_len = int(len(image_paths) * test_split)
        val_paths_len = int(len(image_paths) * val_split)
        train_paths_len = len(image_paths) - test_paths_len - val_paths_len
        val_end = train_paths_len + val_paths_len

        # Slice with forward indices only: a negative index such as
        # ``[-test_paths_len:]`` becomes ``[-0:]`` (the whole list) when the
        # test share rounds down to zero.
        train_paths = image_paths[:train_paths_len]
        val_paths = image_paths[train_paths_len:val_end]
        test_paths = image_paths[val_end:]

        copy_images(train_paths, config.TRAIN)
        copy_images(val_paths, config.VAL)
        copy_images(test_paths, config.TEST)

    print(f'\n{dataset_path.name}')
    for line in tree(dataset_path):
        print(line)
if __name__ == '__main__':
    # Combine both formatters: keep the layout of the description text and
    # append the default value to every option's help.
    class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
        pass

    parser = argparse.ArgumentParser(description='''Create Dataset.
Image folder expects the files and directories to be constructed like this:
.
└── images
├── label_1
│ ├── image_0.jpg
│ ├── image_1.jpg
│ └── image_2.jpg
└── label_2
├── image_3.jpg
└── image_4.jpg
Each class has it's own directory for the images. The images are then labeled with the class taken from the directory name.
''', formatter_class=CustomFormatter)
    parser.add_argument('--images-path', type=pathlib.Path, default=config.IMAGES_PATH, metavar='PATH', help='path to image data')
    parser.add_argument('--dataset-path', type=pathlib.Path, default=config.DATASET_PATH, metavar='PATH', help='path to dataset')
    # Shuffling is on by default; ``--shuffle`` is kept for backward
    # compatibility and ``--no-shuffle`` is the only way to turn it off.
    parser.add_argument('--shuffle', dest='shuffle', default=True, action='store_true', help='shuffle images')
    parser.add_argument('--no-shuffle', dest='shuffle', default=True, action='store_false', help='do not shuffle images')
    parser.add_argument('--test-split', default=config.TEST_SPLIT, type=float, metavar='VAL', help='test split')
    parser.add_argument('--val-split', default=config.VAL_SPLIT, type=float, metavar='VAL', help='validation split')
    parser.add_argument('--show-tree', action='store_true', help='show dataset tree with number of images and exit')

    # ``args`` stays a module-level name because copy_images() reads it.
    args = vars(parser.parse_args())
    build_dataset(args)