我的貓狗大戰資料集圖片缺失處理
阿新 • • 發佈:2018-11-09
前面找了一份540M的貓狗大戰的資料集,想使用這個資料集在小型資料集上從頭開始訓練一個卷積神經網路,使用了其中的2500個樣本。這個貓狗大戰的資料集總共是25000張圖片,所以在前面2500張圖片有缺失的時候,我就自己從後面的資料集中拷貝圖片補齊前面的。但是發現缺失的圖片比較多,手動去查詢太麻煩,所以乾脆判斷一下檔案是不是存在,寫程式碼來解決這個問題。
其實這個程式碼比較簡單,但是考慮到前面自己在CSDN上分享過這個資料集(https://download.csdn.net/download/lxiao428/10747658),不想坑別人,就把程式碼貼出來,供參考。如果想用全部的資料集作訓練,就不能從後面拷圖片,但是應該可以自己去蒐集圖片來補全缺失,只要將這一部分的程式碼修改一下,把其中缺失的index打印出來,然後去對應地補全,免得自己去找,還是有一定的小作用的。
程式碼如下:
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 27 16:25:41 2018

@author: Lxiao217

Build a small cats-vs-dogs dataset (train / validation / test folders) from
the full training folder.  When a picture is missing from the source folder,
the picture whose index is FALLBACK_OFFSET higher is copied in its place
under the original file name.
"""
import os
import shutil

original_dataset_dir = 'F:\\python\\DeepLearning\\train'
base_dir = 'F:\\python\\DeepLearning\\cats_and_dogs_small'

train_dir = os.path.join(base_dir, 'train')
test_dir = os.path.join(base_dir, 'test')
validation_dir = os.path.join(base_dir, 'validation')

train_cats_dir = os.path.join(train_dir, 'cats')
train_dogs_dir = os.path.join(train_dir, 'dogs')
validation_cats_dir = os.path.join(validation_dir, 'cats')
validation_dogs_dir = os.path.join(validation_dir, 'dogs')
test_cats_dir = os.path.join(test_dir, 'cats')
test_dogs_dir = os.path.join(test_dir, 'dogs')

# A missing picture N is replaced by picture N + FALLBACK_OFFSET.
FALLBACK_OFFSET = 6666

# (label, picture indices, destination folder), in the order they are copied.
SPLITS = (
    ('cat', range(1000), train_cats_dir),             # 1000 training cats
    ('cat', range(1000, 1500), validation_cats_dir),  # 500 validation cats
    ('cat', range(1500, 2000), test_cats_dir),        # 500 test cats
    ('dog', range(1000), train_dogs_dir),             # 1000 training dogs
    ('dog', range(1000, 1500), validation_dogs_dir),  # 500 validation dogs
    ('dog', range(1500, 2000), test_dogs_dir),        # 500 test dogs
)


def picture_name(label, index):
    """Return the dataset file name for a picture, e.g. ``cat.12.jpg``."""
    return '{}.{}.jpg'.format(label, index)


def copy_pictures(src_dir, dst_dir, label, indices, offset=FALLBACK_OFFSET):
    """Copy the pictures ``label.<index>.jpg`` from src_dir into dst_dir.

    If a picture does not exist in src_dir, the picture with index
    ``index + offset`` is copied instead, but it is still stored in dst_dir
    under the name of the missing picture so the numbering stays contiguous.

    Args:
        src_dir: Folder holding the full dataset.
        dst_dir: Folder to copy into; created (with parents) if needed.
        label: File name prefix, 'cat' or 'dog'.
        indices: Iterable of picture indices to copy.
        offset: Index distance to the replacement picture.

    Returns:
        The list of indices whose picture was missing and got substituted.

    Raises:
        FileNotFoundError: If the replacement picture is missing as well
            (raised by shutil.copyfile).
    """
    os.makedirs(dst_dir, exist_ok=True)
    substituted = []
    for index in indices:
        name = picture_name(label, index)
        src = os.path.join(src_dir, name)
        if not os.path.exists(src):
            # Fall back to the picture `offset` positions later.
            src = os.path.join(src_dir, picture_name(label, index + offset))
            substituted.append(index)
        shutil.copyfile(src, os.path.join(dst_dir, name))
    return substituted


def main():
    """Create every target folder, then fill the six splits in order."""
    # Create the whole folder tree up front, before any picture is copied.
    for _, _, dst_dir in SPLITS:
        os.makedirs(dst_dir, exist_ok=True)

    for label, indices, dst_dir in SPLITS:
        substituted = copy_pictures(original_dataset_dir, dst_dir,
                                    label, indices)
        if substituted:
            # Report the gaps so they can be filled with real pictures later.
            print('{}: missing {} indices replaced: {}'.format(
                dst_dir, label, substituted))


if __name__ == '__main__':
    main()