Hossein Hossein - 1 month ago 12
Python Question

How can I get around memory limitation in this script?

I'm trying to normalize my dataset which is

1.7 Gigabyte
. I have
14Gig of RAM
and I hit my limit very quickly.

This happens when computing the
mean/std
of the training data. The training data takes up the majority of the memory when loaded into
RAM(13.8Gig)
,thus the mean gets calculated, but when it reaches to the next line while calculating the
std
, it crashes.

Follows the script:

import caffe
import leveldb
import numpy as np
from caffe.proto import caffe_pb2
import cv2
import sys
import time

direct = 'examples/svhn/'
db_train = leveldb.LevelDB(direct+'svhn_train_leveldb')
db_test = leveldb.LevelDB(direct+'svhn_test_leveldb')
datum = caffe_pb2.Datum()

#using the whole dataset for training which is 604,388
size_train = 604388 #normal training set is 73257
size_test = 26032
data_train = np.zeros((size_train, 3, 32, 32))
label_train = np.zeros(size_train, dtype=int)

print 'Reading training data...'
i = -1
for key, value in db_train.RangeIter():
i = i + 1
if i % 1000 == 0:
print i
if i == size_train:
break
datum.ParseFromString(value)
label = datum.label
data = caffe.io.datum_to_array(datum)
data_train[i] = data
label_train[i] = label

print 'Computing statistics...'
print 'calculating mean...'
mean = np.mean(data_train, axis=(0,2,3))
print 'calculating std...'
std = np.std(data_train, axis=(0,2,3))

#np.savetxt('mean_svhn.txt', mean)
#np.savetxt('std_svhn.txt', std)

print 'Normalizing training'
for i in range(3):
print i
data_train[:, i, :, :] = data_train[:, i, :, :] - mean[i]
data_train[:, i, :, :] = data_train[:, i, :, :]/std[i]


print 'Outputting training data'
leveldb_file = direct + 'svhn_train_leveldb_normalized'
batch_size = size_train

# create the leveldb file
db = leveldb.LevelDB(leveldb_file)
batch = leveldb.WriteBatch()
datum = caffe_pb2.Datum()

for i in range(size_train):
if i % 1000 == 0:
print i

# save in datum
datum = caffe.io.array_to_datum(data_train[i], label_train[i])
keystr = '{:0>5d}'.format(i)
batch.Put( keystr, datum.SerializeToString() )

# write batch
if(i + 1) % batch_size == 0:
db.Write(batch, sync=True)
batch = leveldb.WriteBatch()
print (i + 1)

# write last batch
if (i+1) % batch_size != 0:
db.Write(batch, sync=True)
print 'last batch'
print (i + 1)
#explicitly freeing memory to avoid hitting the limit!
#del data_train
#del label_train

print 'Reading test data...'
data_test = np.zeros((size_test, 3, 32, 32))
label_test = np.zeros(size_test, dtype=int)
i = -1
for key, value in db_test.RangeIter():
i = i + 1
if i % 1000 == 0:
print i
if i ==size_test:
break
datum.ParseFromString(value)
label = datum.label
data = caffe.io.datum_to_array(datum)
data_test[i] = data
label_test[i] = label

print 'Normalizing test'
for i in range(3):
print i
data_test[:, i, :, :] = data_test[:, i, :, :] - mean[i]
data_test[:, i, :, :] = data_test[:, i, :, :]/std[i]

#Zero Padding
#print 'Padding...'
#npad = ((0,0), (0,0), (4,4), (4,4))
#data_train = np.pad(data_train, pad_width=npad, mode='constant', constant_values=0)
#data_test = np.pad(data_test, pad_width=npad, mode='constant', constant_values=0)

print 'Outputting test data'
leveldb_file = direct + 'svhn_test_leveldb_normalized'
batch_size = size_test

# create the leveldb file
db = leveldb.LevelDB(leveldb_file)
batch = leveldb.WriteBatch()
datum = caffe_pb2.Datum()

for i in range(size_test):
# save in datum
datum = caffe.io.array_to_datum(data_test[i], label_test[i])
keystr = '{:0>5d}'.format(i)
batch.Put( keystr, datum.SerializeToString() )

# write batch
if(i + 1) % batch_size == 0:
db.Write(batch, sync=True)
batch = leveldb.WriteBatch()
print (i + 1)

# write last batch
if (i+1) % batch_size != 0:
db.Write(batch, sync=True)
print 'last batch'
print (i + 1)


How can I make it consume less memory so that I can get to run the script?

Answer

The culprit that cause so much issues and constant crashing due to insufficient memory , was due to batch size being the size of whole training set :

print 'Outputting test data'
leveldb_file = direct + 'svhn_test_leveldb_normalized'
batch_size = size_test

This apparently was the cause, nothing would get committed and saved to the disk until the whole dataset was read and loaded into one huge transaction, this is also the case when using np.float32 suggested by @BillCheatham didnt work properly.
For calculating the mean/std I used this approach and it worked:

  1. I divided the training set into two (or more it doesn't matter).
  2. I then load different parts one after the other, calculate mean/std respectively
  3. Then I add the fractions par together and the resulting numbers are very close to the actual value

the memorymap solution wouldnt work for me for some reason and I used the solution I mentioned above.

ps: Later on, I completely changed to float32, fixed the batch_size and ran the whole thing all together, thats how I could say my former solution (devide and add the fractions together) works and gives the exact number up to 2 decimals.