1. 程式人生 > >用Python將多標籤資料存入caffe使用的HDF5&LMDB格式

用Python將多標籤資料存入caffe使用的HDF5&LMDB格式

最近在進行多標籤的資料轉換,發現直接使用caffe自帶的convert_image不是很方便,就收集了一下用python的處理方法。現整理以備後用。

使用時發現,用python寫入lmdb有個問題:如果事先無法知道資料的大小,那麼分配的儲存空間map_size就不好確定(預設是10MB),且不會自動隨寫入資料的實際大小而調整,所以還是HDF5比較好處理,雖然在caffe中是提倡使用lmdb。

在此也望有經驗的前輩能指教一下,用python處理lmdb時,檔案的大小如何預分配,或有什麼辦法能讓lmdb的檔案大小能隨寫入或刪除資料而自動增減?先謝過啦!

<pre name="code" class="python">import lmdb
import random
import os

import caffe


def _write_datums_to_lmdb(db_path, arrays, map_size):
    """Store each 3-D array from `arrays` in the LMDB at `db_path` as a serialized Caffe Datum."""
    env = lmdb.open(db_path, map_size=map_size)
    try:
        with env.begin(write=True) as txn:
            for index, array in enumerate(arrays):
                datum = caffe.io.array_to_datum(array)
                # Keys must be bytes on Python 3 (py-lmdb rejects str); the
                # zero padding makes byte-wise key order match sample order.
                key = '{:0>10d}'.format(index).encode('ascii')
                txn.put(key, datum.SerializeToString())
    finally:
        # Release the environment even if serialization or the write fails.
        env.close()


def convert_data_lmdb(train_data, train_label, output_data_lmdb, output_labels_lmdb):
    """
    Save images and their (multi-)labels to two separate LMDB databases.

    Both databases use the same zero-padded sample index as key, so record
    i of the data LMDB belongs to record i of the label LMDB. Samples are
    shuffled with a fixed seed before writing.

    call: convert_data_lmdb(train_X, train_y, 'train_data_lmdb', 'train_labels_lmdb')

    Args:
        train_data: array of images indexed as (n, height, width, channels).
            Each image has its channel axis reversed (presumably RGB -> BGR
            for Caffe -- confirm against the data source) and is stored
            channel-first.
        train_label: array with one entry/row of label values per image;
            each row is stored as a 1 x 1 x k Datum.
        output_data_lmdb: path of the LMDB that receives the images.
        output_labels_lmdb: path of the LMDB that receives the labels.

    Raises:
        ValueError: if the number of images and label rows differ.
    """
    import numpy as np

    # `np.float` was removed in NumPy 1.24; float64 is the type it aliased.
    X = np.asarray(train_data, dtype=np.float64)
    y = np.asarray(train_label, dtype=np.float64)
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            'train_data has {} samples but train_label has {}'.format(
                X.shape[0], y.shape[0]))

    # One shared fixed-seed permutation keeps every image paired with its labels.
    order = np.arange(X.shape[0])
    np.random.RandomState(42).shuffle(order)
    X, y = X[order], y[order]

    # LMDB cannot grow beyond map_size. Keep the 10x headroom over the raw
    # array size, but never go below 10 MiB so that very small inputs still
    # leave room for LMDB's own bookkeeping.
    min_map_size = 10 * 1024 * 1024

    # Reverse the channel axis, then HWC -> CHW.
    images = (image[:, :, ::-1].transpose((2, 0, 1)) for image in X)
    _write_datums_to_lmdb(
        output_data_lmdb, images, max(X.nbytes * 10, min_map_size))

    # array_to_datum needs 3-D input, so each label row becomes 1 x 1 x k.
    label_rows = (np.reshape(row, (1, 1, -1)) for row in y)
    _write_datums_to_lmdb(
        output_labels_lmdb, label_rows, max(y.nbytes * 10, min_map_size))

def write_hdf5(filename, image_size=(96, 96), label_size=30):
    """
    Build an HDF5 dataset for Caffe from a text list of images and labels.

    Every non-blank line of `filename` must hold an image path followed by
    exactly `label_size` whitespace-separated numeric label values. The
    lines are shuffled, the first channel of each image is loaded, and two
    files are written next to `filename` (stem = `filename` without its
    extension):

    * `<stem>.h5` with dataset 'data' of shape (n, 1) + image_size and
      dataset 'label' of shape (n, label_size), both float32;
    * `<stem>_h5.txt` containing the name of the .h5 file.

    NOTE(review): `write_hdf5(data, labels, output_filename)` is defined
    again later in this file and rebinds this name, so this variant is not
    reachable as `write_hdf5` once the module has loaded -- consider
    renaming one of the two.
    NOTE(review): `pyplot` is used but not imported in this file; it must be
    available at module scope (presumably `from matplotlib import pyplot`).

    Args:
        filename: path of the text list file.
        image_size: (height, width) every image must have.
        label_size: number of label values per line.

    Raises:
        ValueError: if a line has the wrong number of labels, a label is
            not numeric, or an image does not have shape `image_size`.
    """
    import numpy as np

    with open(filename, 'r') as f:
        # Blank lines carry no sample; skipping them also tolerates a
        # missing or an extra newline at the end of the file.
        lines = [line for line in f if line.strip()]

    np.random.shuffle(lines)

    image_size = tuple(image_size)
    sample_size = len(lines)
    scores = np.zeros((sample_size, label_size), dtype=np.float32)

    # Parse the whole list before any image is read, so a malformed line is
    # reported before the slow image loading starts.
    image_names = []
    for i, line in enumerate(lines):
        fields = line.split()
        image_name, label_fields = fields[0], fields[1:]
        if len(label_fields) != label_size:
            raise ValueError(
                '{}: expected {} label values, got {}'.format(
                    image_name, label_size, len(label_fields)))
        scores[i] = [float(value) for value in label_fields]
        image_names.append(image_name)

    # Imported after validation but before image loading: a missing h5py
    # still fails before the expensive part.
    import h5py

    imgs = np.zeros((sample_size, 1) + image_size, dtype=np.float32)
    for i, image_name in enumerate(image_names):
        img = pyplot.imread(image_name)
        if img.ndim == 3:
            # Colour image: keep only the first channel.
            img = img[:, :, 0]
        if img.shape != image_size:
            raise ValueError(
                '{}: expected image of shape {}, got {}'.format(
                    image_name, image_size, img.shape))
        imgs[i, 0] = img
        if (i + 1) % 1000 == 0:
            print('processed {} images!'.format(i + 1))

    # splitext copes with directories containing dots and with names that
    # have no extension, unlike splitting on '.'.
    setname = os.path.splitext(filename)[0]
    h5_filename = '{}.h5'.format(setname)
    with h5py.File(h5_filename, 'w') as h:
        h.create_dataset('data', data=imgs)
        h.create_dataset('label', data=scores)

    with open('{}_h5.txt'.format(setname), 'w') as f:
        f.write(h5_filename)

def write_hdf5(data, labels, output_filename):
    """
    Save image data and its label(s) to an HDF5 file for Caffe.

    Samples are shuffled with a fixed seed, then two files are written
    (stem = `output_filename` without its extension):

    * `<stem>.h5` with dataset 'data' of shape (n, c, h, w) and dataset
      'label' (the shuffled labels), both float32;
    * `<stem>_h5.txt` containing the name of the .h5 file.

    Args:
        data: images with shape (n, h, w, c) -- channel-last -- or
            (n, h, w) for single-channel images. The channel axis is
            reversed (presumably RGB -> BGR for Caffe -- confirm) and moved
            in front of height and width.
        labels: one entry/row per image, e.g. shape (n, labels).
        output_filename: name used to derive the output file names.

    Raises:
        ValueError: if `data` is not 3-D or 4-D, or if `data` and `labels`
            hold a different number of samples.
    """
    import numpy as np

    X = np.asarray(data, dtype=np.float32)
    y = np.asarray(labels, dtype=np.float32)

    if X.ndim == 3:
        # Single-channel images without an explicit channel axis.
        X = X[:, :, :, np.newaxis]
    if X.ndim != 4:
        raise ValueError(
            'data must have shape (n, h, w) or (n, h, w, c), got {}'.format(
                X.shape))
    if y.ndim == 0 or X.shape[0] != y.shape[0]:
        raise ValueError(
            'data has {} samples but labels has shape {}'.format(
                X.shape[0], y.shape))

    # Imported only after the inputs are known to be usable.
    import h5py

    # One shared fixed-seed permutation keeps every image paired with its labels.
    order = np.arange(X.shape[0])
    np.random.RandomState(42).shuffle(order)

    # Reverse the channel axis and go from (n, h, w, c) to (n, c, h, w) for
    # all samples at once; output sizes follow the input instead of being
    # fixed to 96x96 images and 30 labels.
    imgs = np.ascontiguousarray(
        X[order][:, :, :, ::-1].transpose((0, 3, 1, 2)))
    scores = y[order]

    print('processed {} images!'.format(imgs.shape[0]))

    # splitext copes with directories containing dots and with names that
    # have no extension, unlike splitting on '.'.
    setname = os.path.splitext(output_filename)[0]
    h5_filename = '{}.h5'.format(setname)
    with h5py.File(h5_filename, 'w') as h:
        h.create_dataset('data', data=imgs)
        h.create_dataset('label', data=scores)

    with open('{}_h5.txt'.format(setname), 'w') as f:
        f.write(h5_filename)