-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathhdf5datasetwriter.py
101 lines (91 loc) · 4.28 KB
/
hdf5datasetwriter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
" License:
" -----------------------------------------------------------------------------
" Copyright (c) 2018, Ratnajit Mukherjee.
" All rights reserved.
"
" Redistribution and use in source and binary forms, with or without
" modification, are permitted provided that the following conditions are met:
"
" 1. Redistributions of source code must retain the above copyright notice,
" this list of conditions and the following disclaimer.
"
" 2. Redistributions in binary form must reproduce the above copyright notice,
" this list of conditions and the following disclaimer in the documentation
" and/or other materials provided with the distribution.
"
" 3. Neither the name of the copyright holder nor the names of its contributors
" may be used to endorse or promote products derived from this software
" without specific prior written permission.
"
" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
" ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
" POSSIBILITY OF SUCH DAMAGE.
" -----------------------------------------------------------------------------
"
" Description: Originally written to dump converted Kaggle datasets to HDF5 files
" for training, validation and test sets by Adrian Rosebrock
" Creation date: June 2017
" URL: https://www.pyimagesearch.com/
"
" Integrated into an emotion detection application by:
" Author: Ratnajit Mukherjee, ratnajitmukherjee@gmail.com
" Date: July 2018
"""
# import the necessary packages
import h5py
import os
class HDF5DatasetWriter:
    """Buffered writer that stores rows and integer labels in an HDF5 file.

    Rows and labels are accumulated in an in-memory buffer and written to
    the underlying datasets in chunks once at least ``bufSize`` rows are
    pending. Call :meth:`close` (or use the instance as a context manager)
    to write any remaining buffered entries and close the file.
    """

    def __init__(self, dims, outputPath, dataKey="images", bufSize=1000):
        """Create the HDF5 file and the datasets it contains.

        Args:
            dims: Shape passed to ``create_dataset`` for the data dataset;
                ``dims[0]`` is also used as the length of the labels dataset.
            outputPath: Path of the HDF5 file to create. Must not exist yet.
            dataKey: Name of the dataset that stores the rows.
            bufSize: Number of buffered rows that triggers a flush to disk.

        Raises:
            ValueError: If ``outputPath`` already exists.
        """
        # check to see if the output path exists, and if so, raise
        # an exception
        if os.path.exists(outputPath):
            raise ValueError("The supplied `outputPath` already exists and cannot be overwritten. Manually delete "
                             "the file before continuing.", outputPath)

        # open the HDF5 database for writing and create two datasets:
        # one to store the images/features and another to store the
        # class labels
        self.db = h5py.File(outputPath, "w")
        try:
            self.data = self.db.create_dataset(dataKey, dims, dtype="float")
            self.labels = self.db.create_dataset("labels", (dims[0],), dtype="int")
        except Exception:
            # do not leak the open file handle when dataset creation fails
            self.db.close()
            raise

        # store the buffer size, then initialize the buffer itself
        # along with the index into the datasets
        self.bufSize = bufSize
        self.buffer = {"data": [], "labels": []}
        self.idx = 0

    def __enter__(self):
        """Return the writer itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Flush pending entries and close the file when the block exits."""
        self.close()

    def add(self, rows, labels):
        """Append rows and their labels to the buffer, flushing when full.

        Args:
            rows: Iterable of rows to store.
            labels: Iterable of labels, one per row.

        Raises:
            ValueError: If ``rows`` and ``labels`` differ in length.
        """
        # materialize first so lengths can be compared for any iterable
        rows = list(rows)
        labels = list(labels)

        # fail fast: mismatched lengths would misalign the two buffers and
        # only surface later, in the middle of a flush
        if len(rows) != len(labels):
            raise ValueError(
                "`rows` and `labels` must have the same length, "
                "got {} and {}".format(len(rows), len(labels)))

        # add the rows and labels to the buffer
        self.buffer["data"].extend(rows)
        self.buffer["labels"].extend(labels)

        # check to see if the buffer needs to be flushed to disk
        if len(self.buffer["data"]) >= self.bufSize:
            self.flush()

    def flush(self):
        """Write the buffered rows and labels to disk, then reset the buffer."""
        pending = len(self.buffer["data"])
        if pending == 0:
            # nothing buffered, so there is nothing to write
            return

        # write the buffers to disk at the current write position
        end = self.idx + pending
        self.data[self.idx:end] = self.buffer["data"]
        self.labels[self.idx:end] = self.buffer["labels"]

        # advance the write position and start a fresh buffer
        self.idx = end
        self.buffer = {"data": [], "labels": []}

    def storeClassLabels(self, classLabels):
        """Store the class label names in a ``label_names`` dataset.

        Args:
            classLabels: Sequence of label name strings.
        """
        # create a dataset to store the actual class label names,
        # then store the class labels
        dt = h5py.special_dtype(vlen=str)  # `vlen=unicode` for Py2.7
        labelSet = self.db.create_dataset("label_names",
                                          (len(classLabels),), dtype=dt)
        labelSet[:] = classLabels

    def close(self):
        """Flush any remaining buffered entries and close the HDF5 file.

        The file is closed even if the final flush raises.
        """
        try:
            # check to see if there are any other entries in the buffer
            # that need to be flushed to disk
            if len(self.buffer["data"]) > 0:
                self.flush()
        finally:
            # close the dataset
            self.db.close()