The version of numpy data
import numpy as np
class Dataset:
def __init__(self, data):
self._index_in_epoch = 0
self._epochs_completed = 0
self._data = data
self._num_examples = data.shape[0]
pass
@property
def data(self):
return self._data
def next_batch(self, batch_size, shuffle=True):
start = self._index_in_epoch
if start == 0 and self._epochs_completed == 0:
idx = np.arange(0, self._num_examples)
np.random.shuffle(idx) # shuffle indexe
self._data = self.data[idx] # get the shuffled data
# go to the data of next batch
if start + batch_size > self._num_examples:
'''
note: when start == self._num_examples, data_rest_part = np.array([])
'''
self._epochs_completed += 1
# print(self.data)
rest_num_examples = self._num_examples - start
data_rest_part = self.data[start:self._num_examples]
idx_update = np.arange(0, self._num_examples)
np.random.shuffle(idx_update)
self._data = self.data[idx_update] # get another shuffled data
start = 0
self._index_in_epoch = batch_size - rest_num_examples
end = self._index_in_epoch
data_new_part = self._data[start:end]
return np.concatenate((data_rest_part, data_new_part), axis=0)
else:
self._index_in_epoch += batch_size
end = self._index_in_epoch
return self._data[start:end]
dataset = Dataset(np.arange(0, 10))
for i in range(10):
print(dataset.next_batch(6))
print(dataset.data)
The version of pandas data
import numpy as np
import pandas as pd
class Dataset:
def __init__(self, data):
self._index_in_epoch = 0
self._epochs_completed = 0
self._data = data
self._num_examples = data.shape[0]
pass
@property
def data(self):
return self._data
def next_batch(self, batch_size, shuffle=True):
start = self._index_in_epoch
if start == 0 and self._epochs_completed == 0:
idx = np.arange(0, self._num_examples)
np.random.shuffle(idx) # shuffle index
self._data = self.data.iloc[idx,:] # get the shuffled data
# go to the data of next batch
if start + batch_size > self._num_examples:
'''
note: when start == self._num_examples, data_rest_part = np.array([])
'''
self._epochs_completed += 1
# print(self.data) # this is for debug
rest_num_examples = self._num_examples - start
data_rest_part = self.data.iloc[start:self._num_examples,:]
idx_update = np.arange(0, self._num_examples)
np.random.shuffle(idx_update)
self._data = self.data.iloc[idx_update,:] # get another shuffled data
start = 0
self._index_in_epoch = batch_size - rest_num_examples
end = self._index_in_epoch
data_new_part = self._data.iloc[start:end,:]
return pd.concat((data_rest_part, data_new_part), axis=0)
else:
self._index_in_epoch += batch_size
end = self._index_in_epoch
return self._data[start:end]
df = pd.DataFrame()
df['a']=np.arange(10)
df['b']=np.arange(10)*10
dataset = Dataset(df)
for i in range(10):
print(dataset.next_batch(5))
print(dataset.data)