show-attend-and-tell-tensorflow source code walkthrough: preprocess.py

from scipy import ndimage
# scipy.ndimage: multi-dimensional image processing. More powerful image-processing libraries include OpenCV, scikit-image, etc.
from collections import Counter
# The collections module provides several container classes; Counter is one of them. It is a simple counter that tallies how many times each element occurs, and it is a subclass of dict.
from core.vggnet import Vgg19
# The core.vggnet.Vgg19 class reads the pre-trained parameters from imagenet-vgg-verydeep-19.mat and uses them to build the VGG19 computation graph.
from core.utils import *
import tensorflow as tf
import numpy as np
import pandas as pd
import hickle
# hickle and pickle are common serialization/deserialization modules, used to save program results or to load files containing information the program needs.
import os
# The os module lets the program interact with the operating system, e.g. to access directories.
import json
# json serves a similar purpose to hickle and pickle.

Above are the imports of preprocess.py. Since this Python file contains several functions and quite a lot of code, we need to sort things out and find the program entry point, which is the main() function.
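As a small aside before reading main(): the collections.Counter imported above is the natural tool for counting how often each word appears in the captions, which is exactly what the vocabulary-building step below needs. A minimal, self-contained sketch (the captions here are made up for illustration):

from collections import Counter

# two made-up captions, already lower-cased; split on spaces to get word tokens
captions = ["a dog runs on the grass", "a cat sits on the grass"]

counter = Counter()
for caption in captions:
    counter.update(caption.split(' '))    # add one count per word occurrence

print(counter['the'])            # 2
print(counter.most_common(3))    # e.g. [('a', 2), ('on', 2), ('the', 2)]

With that out of the way, here is main():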

def main():
    # batch size for extracting feature vectors from vggnet
    batch_size = 100    # extract the feature vectors of 100 images at a time
    # maximum length of caption (number of word). if caption is longer than max_length, deleted.
    max_length = 15    # a caption is at most 15 words; sentences longer than 15 words are discarded
    # if word occurs less than word_count_threshold in training dataset, the word index is special unknown token.
    word_count_threshold = 1    # if a word occurs fewer than word_count_threshold times in the training set, it is replaced by a special unknown/null token

    # vgg model path
    vgg_model_path = './data/imagenet-vgg-verydeep-19.mat'

    # about 80000 images and 400000 captions for train dataset
    train_dataset = _process_caption_data(caption_file='data/annotations/captions_train2014.json',
                                          image_dir='image/train2014_resized',
                                          max_length=max_length)
    # image_dir is the image folder and caption_file holds the caption sentences plus the links between images and captions.
    # This function (described in detail later) effectively builds the training-set variable. Another point: './data/' == 'data/'.

    # about 40000 images and 200000 captions
    val_dataset = _process_caption_data(caption_file='data/annotations/captions_val2014.json',
                                        image_dir='image/val_resized',
                                        max_length=max_length)
    # This builds the validation-set variable.

    # about 4000 images and 20000 captions for val / test dataset
    val_cutoff = int(0.1 * len(val_dataset))
    test_cutoff = int(0.2 * len(val_dataset))
    print('Finished processing caption data')

    save_pickle(train_dataset, 'data/train/train.annotations.pkl')
    save_pickle(val_dataset[:val_cutoff], 'data/val/val.annotations.pkl')
    save_pickle(val_dataset[val_cutoff:test_cutoff].reset_index(drop=True), 'data/test/test.annotations.pkl')
    """
    save_pickle() is related to the pickle module: files saved with pickle use the .pkl extension, and save_pickle() is an
    extension of pickle.dump() defined in the core.utils module (already imported above); a plausible sketch of
    save_pickle()/load_pickle() is given after the listing.
    The full name of reset_index() is pandas.DataFrame.reset_index(); with drop=True it keeps the old index from becoming
    a data column. So _process_caption_data() evidently returns a pd.DataFrame instance. One open question is why the
    first two calls do not use this method; see the small reset_index() demo after the listing.
    From here on, the same operations are performed, one by one, on the train, val and test files obtained above.
    """
    for split in ['train', 'val', 'test']:
        annotations = load_pickle('./data/%s/%s.annotations.pkl' % (split, split))
        # load_pickle() is analogous to save_pickle(): both live in the core.utils module (the core/utils.py file) and both
        # extend functions from the pickle module; the difference is that load_pickle() extends pickle.load().

        if split == 'train':
            word_to_idx = _build_vocab(annotations=annotations, threshold=word_count_threshold)
            # In the training phase, build the vocabulary, which the later one-hot word encoding and word embedding rely on
            # (a hedged sketch of what _build_vocab() might look like is given after the listing).
            save_pickle(word_to_idx, './data/%s/word_to_idx.pkl' % split)
            # save the vocabulary

        captions = _build_caption_vector(annotations=annotations, word_to_idx=word_to_idx, max_length=max_length)
        # with the vocabulary built, encode the whole sentences
        save_pickle(captions, './data/%s/%s.captions.pkl' % (split, split))
        # save the sentence encoding vectors

        file_names, id_to_idx = _build_file_names(annotations)
        save_pickle(file_names, './data/%s/%s.file.names.pkl' % (split, split))
        image_idxs = _build_image_idxs(annotations, id_to_idx)
        save_pickle(image_idxs, './data/%s/%s.image.idxs.pkl' % (split, split))
        """It is not yet clear exactly what these four lines do, but they presumably extract the relationships between
        image file names, image ids, caption sentences and caption encodings."""

        # prepare reference captions to compute bleu scores later
        image_ids = {}
        feature_to_captions = {}
        i = -1
        for caption, image_id in zip(annotations['caption'], annotations['image_id']):
            if not image_id in image_ids:
                image_ids[image_id] = 0
                i += 1
                feature_to_captions[i] = []
            feature_to_captions[i].append(caption.lower() + ' .')
        save_pickle(feature_to_captions, './data/%s/%s.references.pkl' % (split, split))
        print("finished building %s caption dataset" % split)
    # extract conv5_3 feature vectors
    vggnet = Vgg19(vgg_model_path)
    # load the pre-trained model parameters
    vggnet.build()
    # after loading, build the VGG19 model, i.e. the complete computation graph
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        for split in ['train', 'val', 'test']:
            anno_path = './data/%s/%s.annotations.pkl' % (split, split)
            save_path = './data/%s/%s.features.hkl' % (split, split)
            annotations = load_pickle(anno_path)
            image_path = list(annotations['file_name'].unique())
            n_example = len(image_path)
            
            all_feats = np.ndarray([n_example, 196, 512], dtype=np.float32)
            
            for start, end in zip(range(0, n_example, batch_size), range(batch_size, n_example+batch_size, batch_size)):
                image_batch_file = image_path[start:end]
                image_batch = np.array(map(lambda x: ndimage.imread(x, mode='RGB'), image_batch_file)).astype(np.float32)
                feats = sess.run(vggnet.features, feed_dict={vggnet.images:image_batch})
                all_feats[start:end, :] = feats
                print("Processed %d %s features.." % (end, split))
            
            # use hickle to save huge feature vectors
            hickle.dump(all_feats, save_path)
            print("Saved %s.." % (save_path))