用Python將多標籤資料存入caffe使用的HDF5&LMDB格式
阿新 • • 發佈:2019-02-07
最近在進行多標籤的資料轉換,發現直接使用caffe自帶的convert_imageset不是很方便,就收集了一下用python的處理方法,現整理以備後用。
使用時發現,用python寫入lmdb有個問題:如果事先無法知道資料的大小,分配的儲存空間map_size就不好確定(預設是10MB),而且它不會隨實際寫入的資料量自動調整。所以雖然caffe提倡使用lmdb,還是HDF5比較好處理。
在此也望有經驗的前輩能指教一下,用python處理lmdb時,檔案的大小如何預分配,或有什麼辦法能讓lmdb的檔案大小能隨寫入或刪除資料而自動增減?先謝過啦!
import os
import random

import numpy as np

# lmdb and caffe are only needed by convert_data_lmdb; the HDF5 writers must
# stay usable on machines where they are not installed.
try:
    import lmdb
except ImportError:
    lmdb = None

try:
    import caffe
except ImportError:
    caffe = None


# Seed used for the joint shuffle of samples and labels.
_SHUFFLE_SEED = 42
# Lower bound for the LMDB map size (10 MiB), so that very small arrays
# (e.g. a handful of labels) still leave room for LMDB's own bookkeeping.
_MIN_MAP_SIZE = 10 * 1024 * 1024
# Zero-padded decimal keys keep the records in insertion order.
_KEY_FORMAT = '{:0>10d}'


def _as_label_matrix(labels):
    """Return ``labels`` as a 2-D ``(n, n_labels)`` array.

    A 1-D array is treated as one label per sample and becomes a column.

    Raises:
        ValueError: if ``labels`` is neither 1-D nor 2-D.
    """
    if labels.ndim == 1:
        return labels[:, np.newaxis]
    if labels.ndim != 2:
        raise ValueError(
            'labels must be 1-D or 2-D, got shape {}'.format(labels.shape))
    return labels


def _shuffle_together(samples, labels, seed=_SHUFFLE_SEED):
    """Reorder both arrays with the same seeded random permutation.

    NOTE(review): intended to reproduce the ordering of
    ``sklearn.utils.shuffle(X, y, random_state=42)`` that the original code
    called without importing it -- confirm if exact ordering matters.

    Raises:
        ValueError: if the two arrays hold a different number of samples.
    """
    if samples.shape[0] != labels.shape[0]:
        raise ValueError(
            'data has {} samples but labels has {}'.format(
                samples.shape[0], labels.shape[0]))
    order = np.random.RandomState(seed).permutation(samples.shape[0])
    return samples[order], labels[order]


def _to_caffe_layout(images):
    """Convert an ``(n, h, w, c)`` batch to ``(n, c, h, w)``.

    The last (channel) axis is reversed before it is moved to the front,
    which turns RGB into BGR when the input channels are in RGB order.

    Raises:
        ValueError: if ``images`` is not 4-D.
    """
    if images.ndim != 4:
        raise ValueError(
            'data must have shape (n, h, w, c), got {}'.format(images.shape))
    return np.ascontiguousarray(images[:, :, :, ::-1].transpose(0, 3, 1, 2))


def _write_lmdb(path, arrays, payload_bytes):
    """Store every 3-D array in ``arrays`` as a serialized Caffe Datum.

    Args:
        path: directory of the LMDB environment to create or open.
        arrays: iterable of 3-D arrays, written in iteration order.
        payload_bytes: raw size of the data; the map size is ten times this,
            but never below ``_MIN_MAP_SIZE``.

    Raises:
        ImportError: if ``lmdb`` or ``caffe`` could not be imported.
    """
    if lmdb is None or caffe is None:
        raise ImportError(
            'writing LMDB requires both the lmdb and caffe packages')
    env = lmdb.open(path, map_size=max(payload_bytes * 10, _MIN_MAP_SIZE))
    try:
        with env.begin(write=True) as txn:
            for index, array in enumerate(arrays):
                datum = caffe.io.array_to_datum(array)
                # Keys must be bytes on Python 3.
                key = _KEY_FORMAT.format(index).encode('ascii')
                txn.put(key, datum.SerializeToString())
    finally:
        # Close the environment even when a write fails.
        env.close()


def _save_hdf5(stem, data, label):
    """Write ``<stem>.h5`` with 'data' and 'label' datasets plus a list file.

    ``<stem>_h5.txt`` receives the name of the HDF5 file (presumably the
    list file a Caffe HDF5 data layer reads -- confirm against the net
    definition).
    """
    import h5py

    h5_filename = '{}.h5'.format(stem)
    with h5py.File(h5_filename, 'w') as h5_file:
        h5_file.create_dataset('data', data=data)
        h5_file.create_dataset('label', data=label)
    with open('{}_h5.txt'.format(stem), 'w') as listing:
        listing.write(h5_filename)


def convert_data_lmdb(train_data, train_label, output_data_lmdb,
                      output_labels_lmdb):
    """Save samples and their (multi-)labels into two separate LMDBs.

    Samples and labels are shuffled together with a fixed seed, so record
    ``k`` of the data LMDB matches record ``k`` of the label LMDB.

    Example:
        convert_data_lmdb(train_X, train_y,
                          'train_data_lmdb', 'train_labels_lmdb')

    Args:
        train_data: array-like of shape ``(n, h, w, c)``; stored as float64
            ``(c, h, w)`` datums with the channel order reversed.
        train_label: array-like of shape ``(n, n_labels)`` or ``(n,)``;
            each row is stored as a ``(1, 1, n_labels)`` datum.
        output_data_lmdb: path of the LMDB that receives the samples.
        output_labels_lmdb: path of the LMDB that receives the labels.

    Raises:
        ValueError: on mismatched sample counts or unsupported shapes.
        ImportError: if ``lmdb`` or ``caffe`` is not installed.
    """
    # np.float was removed from NumPy; it was an alias of the builtin float.
    images = np.asarray(train_data, dtype=np.float64)
    labels = _as_label_matrix(np.asarray(train_label, dtype=np.float64))
    images, labels = _shuffle_together(images, labels)
    images = _to_caffe_layout(images)

    _write_lmdb(output_data_lmdb, images, images.nbytes)
    label_datums = (row[np.newaxis, np.newaxis, :] for row in labels)
    _write_lmdb(output_labels_lmdb, label_datums, labels.nbytes)


def write_hdf5_from_list(filename):
    """Build an HDF5 file from a text listing of images and labels.

    Each non-empty line of ``filename`` is ``<image_path> <label> ...``
    (whitespace separated, one or more numeric labels). The lines are
    shuffled, the first channel of every image is kept, and the result is
    written to ``<stem>.h5`` / ``<stem>_h5.txt`` where ``<stem>`` is
    ``filename`` without its extension.

    This was originally a second function named ``write_hdf5`` that was
    shadowed by the array-based one and therefore could never be called.

    Raises:
        ValueError: if the listing is empty, a line has no label, lines
            disagree on the number of labels, or image sizes differ.
    """
    # The original referenced ``pyplot`` without importing it.
    from matplotlib import pyplot

    with open(filename, 'r') as listing:
        entries = [line.split() for line in listing if line.strip()]
    if not entries:
        raise ValueError('{} lists no samples'.format(filename))
    width = len(entries[0])
    if width < 2 or any(len(entry) != width for entry in entries):
        raise ValueError(
            'every line must hold an image path and the same number of labels')
    np.random.shuffle(entries)

    scores = np.array(
        [[float(value) for value in entry[1:]] for entry in entries],
        dtype=np.float32)

    imgs = None
    for i, entry in enumerate(entries):
        img = pyplot.imread(entry[0])
        if img.ndim == 3:
            # Keep only the first channel, as the original did.
            img = img[:, :, 0]
        img = img.astype(np.float32)
        if imgs is None:
            # Size the buffer from the first image instead of a fixed 96x96.
            imgs = np.zeros((len(entries), 1) + img.shape, dtype=np.float32)
        if img.shape != imgs.shape[2:]:
            raise ValueError(
                '{} has shape {}, expected {}'.format(
                    entry[0], img.shape, imgs.shape[2:]))
        imgs[i, 0] = img
        if (i + 1) % 1000 == 0:
            print('processed {} images!'.format(i + 1))

    _save_hdf5(os.path.splitext(filename)[0], imgs, scores)


def write_hdf5(data, labels, output_filename):
    """Save image data and its label(s) to an HDF5 file.

    The output is always ``<stem>.h5`` (plus ``<stem>_h5.txt`` holding that
    file name), where ``<stem>`` is ``output_filename`` without extension.
    The file holds two float32 datasets: 'data' of shape ``(n, c, h, w)``
    and 'label' of shape ``(n, n_labels)``.

    Args:
        data: array-like of shape ``(n, h, w, c)``; the channel axis is
            reversed and moved to the front before writing.
        labels: array-like of shape ``(n, n_labels)`` or ``(n,)``.
        output_filename: target name; only the part before the extension is
            used.

    Raises:
        ValueError: on mismatched sample counts or unsupported shapes.
    """
    images = np.asarray(data, dtype=np.float32)
    targets = _as_label_matrix(np.asarray(labels, dtype=np.float32))
    images, targets = _shuffle_together(images, targets)

    # splitext copes with dots in directory names and with missing
    # extensions, where a bare split('.') raised.
    stem = os.path.splitext(output_filename)[0]
    _save_hdf5(stem, _to_caffe_layout(images), targets)
    print('processed {} images!'.format(images.shape[0]))