Chainer 入门教程(7)数据集模块介绍

import random

import numpy as np
import chainer
from chainer import cuda, Function, gradient_check, report, training, utils, Variable
from chainer import datasets, iterators, optimizers, serializers
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from chainer.training import extensions
import chainer.dataset
import chainer.datasets



from chainer.datasets import TupleDataset

x = np.arange(10)
t = x * x

data = TupleDataset(x, t)

print('data type: {}, len: {}'.format(type(data), len(data)))
data type: <class 'chainer.datasets.tuple_dataset.TupleDataset'>, len: 10

第i个数据可以通过data[i]访问,是一个元组($x_i$, $t_i$, …)

# get fourth data -> x=3, t=9
data[3]
(3, 9)


当通过切片索引访问TupleDataset时,例如data[i:j], 返回一个元组列表 $[(x_i, t_i), …, (x_{j-1}, t_{j-1})]$

# Get the 1st through 4th examples at once (the slice end index is exclusive).
examples = data[0:4]

# Show the raw list of (x, t) tuples, then summarize its type and length.
print(examples)
print('examples type: {}, len: {}'
      .format(type(examples), len(examples)))
[(0, 0), (1, 1), (2, 4), (3, 9)]
examples type: <class 'list'>, len: 4

要将示例转换为小批量格式,可以使用chainer.dataset中的concat_examples函数。返回值的格式是 ([x_array], [t_array], ...)

from chainer.dataset import concat_examples

data_minibatch = concat_examples(examples)

#print('data_minibatch type: {}, len: {}'
#      .format(type(data_minibatch), len(data_minibatch)))

x_minibatch, t_minibatch = data_minibatch
# Now it is array format, which has shape
print('x_minibatch = {}, type: {}, shape: {}'.format(x_minibatch, type(x_minibatch), x_minibatch.shape))
print('t_minibatch = {}, type: {}, shape: {}'.format(t_minibatch, type(t_minibatch), t_minibatch.shape))
x_minibatch = [0 1 2 3], type: <class 'numpy.ndarray'>, shape: (4,)
t_minibatch = [0 1 4 9], type: <class 'numpy.ndarray'>, shape: (4,)


from chainer.datasets import DictDataset

x = np.arange(10)
t = x * x

# To construct `DictDataset`, you can specify each key-value pair by passing "key=value" in kwargs.
data = DictDataset(x=x, t=t)

print('data type: {}, len: {}'.format(type(data), len(data)))
data type: <class 'chainer.datasets.dict_dataset.DictDataset'>, len: 10
# Get the 3rd example (index 2).
example = data[2]

# Show the example itself, then summarize its type and length.
print(example)
print('examples type: {}, len: {}'
      .format(type(example), len(example)))

# You can access each value via key
print('x: {}, t: {}'.format(example['x'], example['t']))
{'x': 2, 't': 4}
examples type: <class 'dict'>, len: 2
x: 2, t: 4



ImageDataset 只加载图像;如果您还需要标签信息(例如,正在处理图像分类任务),请使用LabeledImageDataset。


import os

from chainer.datasets import ImageDataset

# print('Current direcotory: ', os.path.abspath(os.curdir))

filepath = './data/images.dat'
image_dataset = ImageDataset(filepath, root='./data/images')

print('image_dataset type: {}, len: {}'.format(type(image_dataset), len(image_dataset)))
image_dataset type: <class 'chainer.datasets.image_dataset.ImageDataset'>, len: 10

我们已经创建了上面的image_dataset,但是图像还没有加载到内存中。每次通过索引访问时,图像数据才会从存储器加载到内存中,以便高效地使用内存。

# Access i-th image by image_dataset[i].
# image data is loaded here. for only 0-th image.
img = image_dataset[0]

# img is numpy array, already aligned as (channels, height, width), 
# which is the standard shape format to feed into convolutional layer.
print('img', type(img), img.shape)
img <class 'numpy.ndarray'> (3, 426, 640)


这是图像数据集的应用工具类。它与ImageDataset类似,允许在运行时将图像文件从存储器加载到内存中。不同之处在于它包含了标签信息,通常用于图像分类任务。您需要创建一个文本文件,其中包含要使用LabeledImageDataset的图像路径和标签列表。 具体参见如下:

cute-animal-degu-octodon-71487.jpeg 0
guinea-pig-pet-nager-rodent-47365.jpeg 0
kittens-cat-cat-puppy-rush-45170.jpeg 1
kitty-cat-kitten-pet-45201.jpeg 1
pexels-photo-96938.jpeg 1
pexels-photo-126407.jpeg 1
pexels-photo-206931.jpeg 0
pexels-photo-208845.jpeg 1
pexels-photo-209079.jpeg 0
rat-pets-eat-51340.jpeg 0
import os

from chainer.datasets import LabeledImageDataset

# print('Current direcotory: ', os.path.abspath(os.curdir))

filepath = './data/images_labels.dat'
labeled_image_dataset = LabeledImageDataset(filepath, root='./data/images')

print('labeled_image_dataset type: {}, len: {}'.format(type(labeled_image_dataset), len(labeled_image_dataset)))
labeled_image_dataset type: <class 'chainer.datasets.image_dataset.LabeledImageDataset'>, len: 10

我们已经创建了上面的labeled_image_dataset,但是图像还没有加载到内存中。每次通过索引访问时,图像数据才会从存储器加载到内存中,以便高效地使用内存。

# Access the i-th image and label by labeled_image_dataset[i].
# The image data is loaded here, for the 0-th example only.
img, label = labeled_image_dataset[0]

print('img', type(img), img.shape)
print('label', type(label), label)
img <class 'numpy.ndarray'> (3, 426, 640)
label <class 'numpy.ndarray'> 0






import os
import numpy as np
import pandas as pd

DATA_DIR = 'data'

def black_box_fn(x_data):
    """Return sin(x_data) perturbed by Gaussian noise.

    The noise is drawn from a normal distribution with mean 0 and standard
    deviation 0.1, one sample per element of ``x_data``.

    Args:
        x_data: numpy array of input values.

    Returns:
        numpy array with the same shape as ``x_data``.
    """
    noise = np.random.normal(0, 0.1, x_data.shape)
    clean_signal = np.sin(x_data)
    return clean_signal + noise

# Sample x on [-5, 5) with step 0.01 and generate noisy targets from it.
x = np.arange(-5, 5, 0.01)
t = black_box_fn(x)
# Pass an ordered list for `columns`: a set has no defined order, so the CSV
# column order would be arbitrary, and newer pandas versions reject a set here.
df = pd.DataFrame({'x': x, 't': t}, columns=['x', 't'])
# Make sure the output directory exists before writing the CSV.
os.makedirs(DATA_DIR, exist_ok=True)
df.to_csv(os.path.join(DATA_DIR, 'my_data.csv'), index=False)


我采用简单的sin函数加上一点高斯噪声,从x生成t。(你可以尝试修改black_box_fn函数,来改变要估计的函数。)






  • __init__(self, *args) 编写初始化代码。

  • __len__(self) 训练器模块(迭代器)访问此属性来计算每个epoch中训练的进度。

  • get_example(self, i) 返回第i个数据

import numpy as np
import pandas as pd
import chainer
class MyDataset(chainer.dataset.DatasetMixin):
    """Dataset that serves the rows of a CSV file one example at a time.

    The whole file is read once in ``__init__`` and kept in memory as a
    float32 numpy array in ``self.data``.

    Args:
        filepath: Path of the CSV file to load.
        debug: When true, print the loaded array after reading it.
    """

    def __init__(self, filepath, debug=False):
        self.debug = debug
        # Load the data in initialization
        df = pd.read_csv(filepath)
        self.data = df.values.astype(np.float32)
        if self.debug:
            print('[DEBUG] data: \n{}'.format(self.data))

    def __len__(self):
        """Return the number of rows in this dataset."""
        return len(self.data)

    def get_example(self, i):
        """Return the i-th row as a pair of one-element lists ``([x], [t])``."""
        # Unpacking requires each row to hold exactly two values.
        x, t = self.data[i]
        return [x], [t]




  1. 在初始化代码的__init__函数中加载准备好的数据data/my_data.csv(设置为filepath),并将扩展数组(严格来说,pandas.DataFrame类)设置为self.data。

  2. 返回第i个数据xi和ti作为get_example(self,i)中大小为1的向量。



也可以通过切片或一维矢量进行访问 dataset[i:j]从而返回[dataset[i],dataset[i + 1],...,dataset[j-1]]

dataset = MyDataset('data/my_data.csv', debug=True)

print('Access by index dataset[1] = ', dataset[1])
print('Access by slice dataset[:3] = ', dataset[:3])
print('Access by list dataset[[3, 5]] = ', dataset[[3, 5]])
index = np.arange(3)
print('Access by numpy array dataset[[0, 1, 2]] = ', dataset[index])
# Randomly take 3 data
index = np.random.permutation(len(dataset))[:3]
print('dataset[{}] = {}'.format(index, dataset[index]))
[DEBUG] data: 
[[ 0.95193064 -5.        ]
 [ 0.97486413 -4.98999977]
 [ 1.05177033 -4.98000002]
 [-1.08878708  4.96999979]
 [-0.98387295  4.98000002]
 [-0.89990532  4.98999977]]
Access by index dataset[1] =  ([0.97486413], [-4.9899998])
Access by slice dataset[:3] =  [([0.95193064], [-5.0]), ([0.97486413], [-4.9899998]), ([1.0517703], [-4.98])]
Access by list dataset[[3, 5]] =  [([1.0441649], [-4.9699998]), ([0.87154579], [-4.9499998])]
Access by numpy array dataset[[0, 1, 2]] =  [([0.95193064], [-5.0]), ([0.97486413], [-4.9899998]), ([1.0517703], [-4.98])]
dataset[[834 666 533]] = [([-0.241432], [3.3399999]), ([1.1102532], [1.66]), ([0.29236495], [0.33000001])]

数据集灵活性 - 来自存储的动态加载,预处理,数据增强


  1. 数据增强


class PreprocessedDataset(chainer.dataset.DatasetMixin):
    """Labeled image dataset that crops, flips and rescales on access.

    Wraps a ``LabeledImageDataset`` and applies the preprocessing each time
    an example is requested.

    Args:
        path: Path of the image/label list file passed to
            ``LabeledImageDataset``.
        root: Root directory of the image files.
        mean: Mean image array; it is indexed as ``mean[:, top:bottom,
            left:right]`` and subtracted from the cropped image.
        crop_size: Side length of the square crop.
        random: If true, crop at a random position and randomly flip
            horizontally; otherwise crop the center.
    """

    def __init__(self, path, root, mean, crop_size, random=True):
        self.base = chainer.datasets.LabeledImageDataset(path, root)
        self.mean = mean.astype('f')
        self.crop_size = crop_size
        self.random = random

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.base)

    def get_example(self, i):
        """Read the i-th image/label pair and return the preprocessed pair.

        Preprocessing steps:
            - Cropping (random or center square)
            - Random horizontal flip (only when ``self.random`` is true)
            - Mean subtraction, then scaling by 1/255
        """
        crop_size = self.crop_size

        image, label = self.base[i]
        _, h, w = image.shape

        if self.random:
            # Randomly crop a region and flip the image.
            # randint is inclusive on both ends, so the largest valid offset
            # is (size - crop_size); this also works when they are equal.
            top = random.randint(0, h - crop_size)
            left = random.randint(0, w - crop_size)
            if random.randint(0, 1):
                image = image[:, :, ::-1]
        else:
            # Crop the center
            top = (h - crop_size) // 2
            left = (w - crop_size) // 2
        bottom = top + crop_size
        right = left + crop_size

        image = image[:, top:bottom, left:right]
        image -= self.mean[:, top:bottom, left:right]
        image *= (1.0 / 255.0)  # Scale by 1/255
        return image, label
  1. 从存储动态加载





from chainer.datasets import TransformDataset

x = np.arange(10)
t = x * x - x

original_dataset = TupleDataset(x, t)

def transform_function(in_data):
    """Add Gaussian noise to the target of a single ``(x, t)`` example.

    Args:
        in_data: A two-element example ``(x_i, t_i)``.

    Returns:
        Tuple ``(x_i, t_i + noise)`` where ``noise`` is one draw from a
        normal distribution with mean 0 and standard deviation 0.1.
    """
    sample_x, sample_t = in_data
    noise = np.random.normal(0, 0.1)
    return sample_x, sample_t + noise

transformed_dataset = TransformDataset(original_dataset, transform_function)

[(0, 0), (1, 0), (2, 2)]
# Now Gaussian noise is added (in transform_function) to the original_dataset.
[(0, -0.10313827057174003), (1, 0.13332423623441678), (2, 2.0453149576361631)]


from chainer import reporter
class MyMLP(chainer.Chain):
    """Four-layer perceptron for regression with a single output unit.

    Args:
        n_units: Number of units in each of the three hidden layers.
    """

    def __init__(self, n_units):
        super(MyMLP, self).__init__()
        with self.init_scope():
            # the size of the inputs to each layer will be inferred
            self.l1 = L.Linear(n_units)  # n_in -> n_units
            self.l2 = L.Linear(n_units)  # n_units -> n_units
            self.l3 = L.Linear(n_units)  # n_units -> n_units
            self.l4 = L.Linear(1)    # n_units -> n_out

    def __call__(self, *args):
        """Compute, report and return the mean squared error loss.

        Args:
            *args: ``args[0]`` is the input batch x, ``args[1]`` the target t.
        """
        # Calculate loss
        h = self.forward(*args)
        t = args[1]
        self.loss = F.mean_squared_error(h, t)
        # Report the loss under the key 'loss' with this link as the observer.
        reporter.report({'loss': self.loss}, self)
        return self.loss

    def forward(self, *args):
        """Run the network on ``args[0]`` and return the output of ``l4``."""
        # Common code for both loss (__call__) and predict
        x = args[0]
        h = F.sigmoid(self.l1(x))
        h = F.sigmoid(self.l2(h))
        h = F.sigmoid(self.l3(h))
        h = self.l4(h)
        return h





  • chainer.datasets.split_dataset(dataset, split_at, order=None)
  • chainer.datasets.split_dataset_random(dataset, first_size, seed=None)
  • chainer.datasets.get_cross_validation_datasets(dataset, n_fold, order=None)
  • chainer.datasets.get_cross_validation_datasets_random(dataset, n_fold, seed=None)



 # Load the dataset and separate to train data and test data
dataset = MyDataset('data/my_data.csv')
train_ratio = 0.7
train_size = int(len(dataset) * train_ratio)
train, test = chainer.datasets.split_dataset_random(dataset, train_size, seed=13)




epoch       main/loss   validation/main/loss  elapsed_time
1           8.7217      13.2216               0.264993      
     total [###...............................................]  7.14%
this epoch [#####################.............................] 42.86%
       100 iter, 1 epoch / 20 epochs
       inf iters/sec. Estimated time to finish: 0:00:00.
2           8.7564      8.27661               0.62847       
     total [#######...........................................] 14.29%
this epoch [##########################################........] 85.71%
       200 iter, 2 epoch / 20 epochs
    214.07 iters/sec. Estimated time to finish: 0:00:05.605751.
3           8.47132     8.20647               0.99818       
4           8.19539     8.48856               1.37226       
     total [##########........................................] 21.43%
this epoch [##############....................................] 28.57%
       300 iter, 4 epoch / 20 epochs
     186.1 iters/sec. Estimated time to finish: 0:00:05.910877.
5           8.26764     8.48402               1.73545       
     total [##############....................................] 28.57%
this epoch [###################################...............] 71.43%
       400 iter, 5 epoch / 20 epochs
    192.58 iters/sec. Estimated time to finish: 0:00:05.192770.
6           8.35916     7.82453               2.1203        
7           8.22192     8.26731               2.47891       
     total [#################.................................] 35.71%
this epoch [#######...........................................] 14.29%
       500 iter, 7 epoch / 20 epochs
       186 iters/sec. Estimated time to finish: 0:00:04.838621.
8           8.21255     7.90139               2.84666       
     total [#####################.............................] 42.86%
this epoch [############################......................] 57.14%
       600 iter, 8 epoch / 20 epochs
    185.53 iters/sec. Estimated time to finish: 0:00:04.311946.
9           8.1826      7.86489               3.29141       
10          8.20058     8.18055               3.6595        
     total [#########################.........................] 50.00%
this epoch [..................................................]  0.00%
       700 iter, 10 epoch / 20 epochs
    182.82 iters/sec. Estimated time to finish: 0:00:03.828946.
11          8.23385     7.83185               4.02586       
     total [############################......................] 57.14%
this epoch [#####################.............................] 42.86%
       800 iter, 11 epoch / 20 epochs
    185.26 iters/sec. Estimated time to finish: 0:00:03.238664.
12          8.13546     8.0219                4.40651       
     total [################################..................] 64.29%
this epoch [##########################################........] 85.71%
       900 iter, 12 epoch / 20 epochs
    188.43 iters/sec. Estimated time to finish: 0:00:02.653515.
13          8.1298      7.78307               4.77653       
14          8.26764     7.91379               5.1378        
     total [###################################...............] 71.43%
this epoch [##############....................................] 28.57%
      1000 iter, 14 epoch / 20 epochs
    185.62 iters/sec. Estimated time to finish: 0:00:02.154961.
15          8.23635     7.92182               5.50792       
     total [#######################################...........] 78.57%
this epoch [###################################...............] 71.43%
      1100 iter, 15 epoch / 20 epochs
    188.09 iters/sec. Estimated time to finish: 0:00:01.594948.
16          8.27431     7.98348               5.8799        
17          8.16515     7.83324               6.24324       
     total [##########################################........] 85.71%
this epoch [#######...........................................] 14.29%
      1200 iter, 17 epoch / 20 epochs
    185.75 iters/sec. Estimated time to finish: 0:00:01.076714.
18          8.30931     8.15014               6.6156        
     total [##############################################....] 92.86%
this epoch [############################......................] 57.14%
      1300 iter, 18 epoch / 20 epochs
    187.82 iters/sec. Estimated time to finish: 0:00:00.532415.
19          8.15276     7.89404               6.98574       
20          8.04605     9.16781               7.36121       
     total [##################################################] 100.00%
this epoch [..................................................]  0.00%
      1400 iter, 20 epoch / 20 epochs
    186.08 iters/sec. Estimated time to finish: 0:00:00.



from chainer.dataset import concat_examples

class MyMLP(chainer.Chain):
    """Four-layer perceptron for regression, with prediction helpers.

    Args:
        n_units: Number of units in each of the three hidden layers.
    """

    def __init__(self, n_units):
        super(MyMLP, self).__init__()
        with self.init_scope():
            # the size of the inputs to each layer will be inferred
            self.l1 = L.Linear(n_units)  # n_in -> n_units
            self.l2 = L.Linear(n_units)  # n_units -> n_units
            self.l3 = L.Linear(n_units)  # n_units -> n_units
            self.l4 = L.Linear(1)    # n_units -> n_out

    def __call__(self, *args):
        """Compute, report and return the mean squared error loss.

        Args:
            *args: ``args[0]`` is the input batch x, ``args[1]`` the target t.
        """
        # Calculate loss
        h = self.forward(*args)
        t = args[1]
        self.loss = F.mean_squared_error(h, t)
        # Report the loss under the key 'loss' with this link as the observer.
        reporter.report({'loss': self.loss}, self)
        return self.loss

    def forward(self, *args):
        """Run the network on ``args[0]`` and return the output of ``l4``."""
        # Common code for both loss (__call__) and predict
        x = args[0]
        h = F.sigmoid(self.l1(x))
        h = F.sigmoid(self.l2(h))
        h = F.sigmoid(self.l3(h))
        h = self.l4(h)
        return h

    def predict(self, *args):
        """Run ``forward`` with train mode off and backprop disabled."""
        with chainer.using_config('train', False):
            with chainer.no_backprop_mode():
                return self.forward(*args)

    def predict2(self, *args, batchsize=32):
        """Predict over a whole dataset in minibatches.

        Args:
            *args: ``args[0]`` is the dataset; slicing it must yield a list
                of ``(x, t)`` examples accepted by ``concat_examples``.
            batchsize: Number of examples per minibatch.

        Returns:
            Tuple ``(x_array, y_array, t_array)`` holding column 0 of the
            concatenated inputs, predictions and targets.
        """
        data = args[0]
        x_list = []
        y_list = []
        t_list = []
        for i in range(0, len(data), batchsize):
            x, t = concat_examples(data[i:i + batchsize])
            y = self.predict(x)
            # Collect each minibatch; the prediction is a Variable, so keep
            # its underlying array.
            y_list.append(y.data)
            x_list.append(x)
            t_list.append(t)

        # assumes every batch array is 2-D with the value in column 0
        # (as produced by examples of the form ([x], [t])) — TODO confirm
        x_array = np.concatenate(x_list)[:, 0]
        y_array = np.concatenate(y_list)[:, 0]
        t_array = np.concatenate(t_list)[:, 0]
        return x_array, y_array, t_array



  • 函数行为


  • 反向传播是没有必要的




chainer.dataset.concat_examples(batch, device=None, padding=None)

concat_examples 将数据集列表转换为可以输入到神经网络中的每个特征(这里是x和y)的小批量。

通常,当我们通过切片索引访问数据集时,例如dataset[i:j],它会返回一个连续的数据列表。 concat_examples分隔数据的每个元素并连接它以生成小批量。


