獲取mnist訓練資料集input_data.py

阿新 • • 發佈：2019-02-16

# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions for downloading and reading MNIST data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gzip
import os
import numpy
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
# Base URL of the MNIST archive; maybe_download() appends a filename to it.
SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
def maybe_download(filename, work_directory):
  """Download the data from Yann's website, unless it's already here.

  Args:
    filename: Name of the file. It is appended to SOURCE_URL to form the
      download URL and joined onto `work_directory` to form the local path.
    work_directory: Directory that holds the local copy. It is created,
      together with any missing parent directories, when it does not exist.

  Returns:
    Path of the local file.
  """
  # makedirs (unlike mkdir) also creates missing parent directories, so a
  # nested location such as 'data/mnist' works on a fresh checkout.
  if not os.path.exists(work_directory):
    os.makedirs(work_directory)
  filepath = os.path.join(work_directory, filename)
  # Only hit the network when no local copy is present.
  if not os.path.exists(filepath):
    filepath, _ = urllib.request.urlretrieve(SOURCE_URL + filename, filepath)
    statinfo = os.stat(filepath)
    print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
  return filepath
def _read32(bytestream):
  """Read four bytes from `bytestream` and return them as a big-endian uint32."""
  big_endian_u32 = numpy.dtype('>u4')
  word = numpy.frombuffer(bytestream.read(4), dtype=big_endian_u32)
  # Newly added [0]: hand back the scalar, not the one-element array.
  return word[0]
def extract_images(filename):
  """Load a gzipped MNIST image file as a uint8 array [index, y, x, depth].

  The file starts with four 32-bit header words: the magic number 2051, the
  image count, the row count and the column count. The pixel bytes follow.

  Raises:
    ValueError: if the magic number is not 2051.
  """
  print('Extracting', filename)
  with gzip.open(filename) as bytestream:
    header_magic = _read32(bytestream)
    if header_magic != 2051:
      raise ValueError(
          'Invalid magic number %d in MNIST image file: %s' %
          (header_magic, filename))
    # The three remaining header words, in file order.
    num_images, rows, cols = (_read32(bytestream) for _ in range(3))
    raw = bytestream.read(rows * cols * num_images)
  pixels = numpy.frombuffer(raw, dtype=numpy.uint8)
  # Append a trailing depth axis of size 1.
  return pixels.reshape(num_images, rows, cols, 1)
def dense_to_one_hot(labels_dense, num_classes=10):
  """Convert class labels from scalars to one-hot vectors.

  Args:
    labels_dense: numpy integer array of class indices. It is flattened, and
      its first dimension gives the number of output rows.
    num_classes: Width of each one-hot row.

  Returns:
    A numpy array of shape (labels_dense.shape[0], num_classes) holding 1 at
    each row's label position and 0 elsewhere.

  Raises:
    ValueError: if any label lies outside [0, num_classes).
  """
  num_labels = labels_dense.shape[0]
  flat_labels = labels_dense.ravel()
  # With flat-offset indexing an out-of-range label would silently land in a
  # neighbouring row, so reject bad labels up front.
  if flat_labels.size and (flat_labels.min() < 0 or
                           flat_labels.max() >= num_classes):
    raise ValueError(
        'Labels must lie in [0, %d); got values from %d to %d.' %
        (num_classes, flat_labels.min(), flat_labels.max()))
  index_offset = numpy.arange(num_labels) * num_classes
  labels_one_hot = numpy.zeros((num_labels, num_classes))
  # Row i starts at flat position i * num_classes; add the label to reach
  # the column that must be set.
  labels_one_hot.flat[index_offset + flat_labels] = 1
  return labels_one_hot
def extract_labels(filename, one_hot=False):
  """Load a gzipped MNIST label file.

  The file starts with two 32-bit header words: the magic number 2049 and the
  item count. One byte per label follows.

  Returns:
    A 1D uint8 array of labels, or the result of dense_to_one_hot() on that
    array when `one_hot` is true.

  Raises:
    ValueError: if the magic number is not 2049.
  """
  print('Extracting', filename)
  with gzip.open(filename) as bytestream:
    header_magic = _read32(bytestream)
    if header_magic != 2049:
      raise ValueError(
          'Invalid magic number %d in MNIST label file: %s' %
          (header_magic, filename))
    item_count = _read32(bytestream)
    raw = bytestream.read(item_count)
  dense = numpy.frombuffer(raw, dtype=numpy.uint8)
  return dense_to_one_hot(dense) if one_hot else dense
class DataSet(object):
  """Images and labels held in memory and handed out in sequential batches.

  Real images arrive as [num examples, rows, columns, 1]. They are stored
  flattened to [num examples, rows*columns] as float32 scaled by 1/255.
  With `fake_data` the inputs are stored untouched and the example count is
  fixed at 10000.
  """

  def __init__(self, images, labels, fake_data=False):
    if fake_data:
      example_count = 10000
    else:
      assert images.shape[0] == labels.shape[0], (
          "images.shape: %s labels.shape: %s" % (images.shape,
                                                 labels.shape))
      example_count = images.shape[0]
      # Only single-channel images can be flattened this way.
      assert images.shape[3] == 1
      flat_shape = (images.shape[0], images.shape[1] * images.shape[2])
      images = images.reshape(flat_shape)
      # Rescale from [0, 255] to [0.0, 1.0].
      images = images.astype(numpy.float32) * (1.0 / 255.0)
    self._num_examples = example_count
    self._images = images
    self._labels = labels
    self._epochs_completed = 0
    self._index_in_epoch = 0

  @property
  def images(self):
    """The stored images."""
    return self._images

  @property
  def labels(self):
    """The stored labels."""
    return self._labels

  @property
  def num_examples(self):
    """Number of examples in the set."""
    return self._num_examples

  @property
  def epochs_completed(self):
    """How many times next_batch() has run past the end of the data."""
    return self._epochs_completed

  def _begin_new_epoch(self):
    """Count the finished epoch and shuffle images and labels together."""
    self._epochs_completed += 1
    order = numpy.arange(self._num_examples)
    numpy.random.shuffle(order)
    self._images = self._images[order]
    self._labels = self._labels[order]

  def next_batch(self, batch_size, fake_data=False):
    """Return the next `batch_size` examples from this data set."""
    if fake_data:
      fake_image = [1.0] * 784
      return [fake_image] * batch_size, [0] * batch_size
    begin = self._index_in_epoch
    stop = begin + batch_size
    self._index_in_epoch = stop
    if stop > self._num_examples:
      # Not enough examples left: reshuffle and serve from the front. The
      # unread tail of the old epoch is skipped.
      self._begin_new_epoch()
      begin, stop = 0, batch_size
      self._index_in_epoch = stop
      assert batch_size <= self._num_examples
    return self._images[begin:stop], self._labels[begin:stop]
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   validation_size=5000):
  """Download (if needed) and load MNIST as train/validation/test DataSets.

  Args:
    train_dir: Directory where the four MNIST archives are stored or
      downloaded to.
    fake_data: If true, skip all file access and return three fake DataSets.
    one_hot: Passed to extract_labels() to select one-hot label encoding.
    validation_size: Number of leading training examples that are split off
      into the validation set. Defaults to 5000.

  Returns:
    An object with `train`, `validation` and `test` attributes, each a
    DataSet.

  Raises:
    ValueError: if `validation_size` is negative or larger than the number
      of training images.
  """
  class DataSets(object):
    pass
  data_sets = DataSets()
  if fake_data:
    data_sets.train = DataSet([], [], fake_data=True)
    data_sets.validation = DataSet([], [], fake_data=True)
    data_sets.test = DataSet([], [], fake_data=True)
    return data_sets
  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
  local_file = maybe_download(TRAIN_IMAGES, train_dir)
  train_images = extract_images(local_file)
  # A negative or oversized split would slice silently into a wrong
  # partition, so fail loudly before doing any more work.
  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and %d. Received: %d.' %
        (len(train_images), validation_size))
  local_file = maybe_download(TRAIN_LABELS, train_dir)
  train_labels = extract_labels(local_file, one_hot=one_hot)
  local_file = maybe_download(TEST_IMAGES, train_dir)
  test_images = extract_images(local_file)
  local_file = maybe_download(TEST_LABELS, train_dir)
  test_labels = extract_labels(local_file, one_hot=one_hot)
  # The first `validation_size` training examples become the validation set;
  # the remainder stays as training data.
  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]
  data_sets.train = DataSet(train_images, train_labels)
  data_sets.validation = DataSet(validation_images, validation_labels)
  data_sets.test = DataSet(test_images, test_labels)
  return data_sets

呼叫方法：新建py檔案

# Load MNIST from "MNIST_data/" (downloading any missing archive) with
# one-hot encoded labels.
import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

執行報錯：

TypeError: only integer scalar arrays can be converted to a scalar index

排查：

http://blog.csdn.net/md5xwl/article/details/71159486

def _read32(bytestream):
    # Original version: returns the one-element array from frombuffer as-is.
    dt = numpy.dtype(numpy.uint32).newbyteorder('>')
    return numpy.frombuffer(bytestream.read(4), dtype=dt)

在新版 NumPy 下需改為如下寫法：在 return 語句末尾加上 [0]，取出陣列中唯一的元素作為標量返回。

def _read32(bytestream):
    # Fixed version: [0] extracts the single uint32 value as a scalar.
    dt = numpy.dtype(numpy.uint32).newbyteorder('>')
    return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]

獲取mnist訓練資料集input_data.py

神經網路演算法學習---獲取常用訓練資料集

下載地址：http://vision.stanford.edu/Datasets/OlympicSports/ 7. UIUC action dataset 這個資料庫已經做到98%了，建議不要去做了。下載地址：http://vision.cs.uiuc.edu/projects/activity/

本地匯入Mnist的資料集的方法

完整程式碼的下載路徑：https://download.csdn.net/download/lxiao428/10714886 很多人在介紹Mnist資料集的時候都是通過庫在網上下載，我以前也是這麼做的，但是今天發現遠端伺服器關閉連線了，而我本地又有這個Mnist資料集，我就想怎麼講

求助Tensorflow下跑mnist手寫體資料集遇到Cuda compute capability問題

在Python下裝了tensorflow-gpu，其中cuda為cuda_8.0.61_windows，cudnn為cudnn-8.0-windows7-x64-v5.1，安裝沒有問題，可以正常跑起來，但是在跑mnist手寫體資料集時遇到以下問題： I c:\tf_jenkins\home\

關於在深度學習中訓練資料集的batch的經驗總結

由於深度學習的網格很大，用來訓練的資料集也很大。因此不可能一下子將所有資料集都輸入到網路中，便引入了batch_size的概念，下面總結自己兩種常用的呼叫batch的方法 1、使用TensorFlow， tf.train.batch（）。 2、 offset = (offset

神經網路中訓練資料集、驗證資料集和測試資料集的區別

whats the difference between train, validation and test set, in neural networks? Answer: The training and validation sets are used during t

電腦科學採用訓練資料集，驗證資料集，測試資料集的方法為什麼不採用統計學中常用的假設檢驗呢？（引數檢驗和非引數檢驗）

如題所說，這個問題作為一個本科讀管理，碩士讀計算機卻旁修經濟學，博士在讀計算機的我來說感覺比較迷惑的。在管理學，經濟學，計算機這三門學科在解決優化問題的時候採用的方法大致相同，其核心都是統計學，管理學，電腦科學中採用的基礎方法，如線性迴歸，多元線性迴歸，廣義線性迴歸，決策樹，SVM,ID3,KNN等分類方法

pyspark 用fit訓練資料集的時候出現"Params must be either a param map or a list/tuple of param maps, "

在anaconda用決策樹訓練資料， from pyspark.ml.classification import DecisionTreeClassifier dt=DecisionTreeClassifier(labelCol="label",features

Deeplearning4j 實戰（12）：Mnist替代資料集Fashion Mnist在CNN上的實驗及結果

Mnist資料集的分類問題一直被認為是深度學習的Hello World。利用2層卷積網路，經過若干輪的訓練後，在相應測試集上的準確率可以達到95%以上。經過調參後，甚至可以達到99%以上。其實，即使不用卷積層提取特徵，而是用傳統的全連線網路也同樣可以達到非常高的準確率。在Mnist資料集的官網上(

Tensorflow Object Detection 生成自己的tfrecord訓練資料集

Object Detection API谷歌該文章部分參考別的大佬的，由於忘了內容出處，所以沒有加轉載連結，請諒解，有原創作者看到可以聯絡我新增。 ========轉載請註明出處========== 此python檔案放在dataset_tools下面生成自己訓練

(十)訓練資料集建立

Caffe2 - 訓練資料集建立 caffe2 使用二值 DB 儲存模型訓練的資料，以 key-value 格式儲存， key1 value1 key2 value2 key3 value3 ... DB 中，將 keys 和 values 儲存為 s

機器學習中訓練資料集，交叉驗證資料集，測試資料集的作用

#1. 簡介在Andrew Ng的機器學習教程裡，會將給定的資料集分為三部分：訓練資料集（training set）、交叉驗證資料集（cross validation set）、測試資料集（test set）。三者分別佔總資料集的60%、20%、20%。那麼

yolo生成和訓練資料集

第二代yolo的效能明顯比第一代yolo有所提升，在博主實際測試中。基於3000張的資料集來說，yolo的效果還是不錯的，tiny-yolo的效能稍差，但是時間也縮短了很多。大約tiny的時間是yolo的1/5，YOLO的官方網站上也有所介紹，對於67的fps還

在ubuntu16.04+opencv3.0環境下使用mnist手寫體資料集編寫相關程式

因為最近可能做專案需要，因此搜尋了手寫體數字檢測部落格，在查看了大量部落格後總結了一些自己的學習小體會。但是但是-----敲黑板、劃重點了。呵呵，就是還是先把參考的幾篇好部落格的分享給大家。 http://www.itnose.net/detail/6525

訓練資料集報錯-NaN

報錯資訊：Input contains NaN, infinity or a value too large for dtype(‘float32’). 原因：資料集中有NaN空值解決方法：填補缺失資料 melbourne_data = pd.rea

opencv_haartraining.exe 訓練資料集自己經歷過的坑

1、opencv_haartraining.exe 在新版本裡面沒有，之前在opencv3.3.1 和 3.1.0 中都沒找到，看網上用的是2.4.9 ，然後才用 opencv249完成。 2、將圖片名稱輸入到 txt 檔案，最開始直接進入樣本圖片的資料夾，直接 dir /b

Alink漫談(七) : 如何劃分訓練資料集和測試資料集

# Alink漫談(七) : 如何劃分訓練資料集和測試資料集 [TOC] ## 0x00 摘要 Alink 是阿里巴巴基於實時計算引擎 Flink 研發的新一代機器學習演算法平臺，是業界首個同時支援批式演算法、流式演算法的機器學習平臺。本文將為大家展現Alink如何劃分訓練資料集和測試資料集。 ##

Mnist資料集以及input_data.py的程式碼

Mnist作為tensorflow的入門，但是很多人都在Mnist的資料集上就已經卡住了。有的人找不到input_data.py的程式碼。所以在此給那些找不到input_data.py的人提供程式碼。僅供學習。原始碼來自於https://tensorflow.

關於TensorFlow的MNIST資料集下載指令碼input_data.py的坑

今天用github上的程式碼入門tensorflow但是發現似乎要下載資料集，但是這個我弄了一會才明白是怎麼下的，所以把經驗寫在下面：（ubuntu14.04環境）用github上的input_da

【MNIST/Python】手寫體數字訓練/測試資料集(圖片格式)下載及分割預處理

MNIST手寫體數字資料集 MNIST是一個手寫數字資料庫，它有60000個訓練樣本集和10000個測試樣本集由Yann LeCun等人建立，是NIST資料庫的一個子集官方網址連結：Link 官網上的資料庫檔案形式如下： train-images-idx3-ubyte.

獲取mnist訓練資料集input_data.py

相關推薦