
Is there a way to reduce the amount of code for RMSProp

I have some code for a simple recurrent neural network and would like to know if there is a way to reduce the amount of code needed for my update stage. The code I have so far:

import numpy as np

class RNN(object):
    def __init__(self, data, hidden_size, eps=0.0001):
        self.data = data
        self.hidden_size = hidden_size
        self.weights_hidden = np.random.rand(hidden_size, hidden_size) * 0.1   # W
        self.weights_input = np.random.rand(hidden_size, len(data[0])) * 0.1   # U
        self.weights_output = np.random.rand(len(data[0]), hidden_size) * 0.1  # V
        self.bias_hidden = np.array([np.random.rand(hidden_size)]).T   # b
        self.bias_output = np.array([np.random.rand(len(data[0]))]).T  # c

        self.cache_w_hid, self.cache_w_in, self.cache_w_out = 0, 0, 0
        self.cache_b_hid, self.cache_b_out = 0, 0
        self.eps = eps

    def train(self, seq_length, epochs, eta, decay_rate=0.9, learning_decay=0.0):
        # Other stuff
        self.update(seq, epoch, eta, decay_rate, learning_decay)
        # Other stuff

    def update(self, seq, epoch, eta, decay_rate, learning_decay):
        """Updates the network's weights and biases by applying gradient
        descent using backpropagation through time and RMSProp.
        """
        delta_nabla_c, delta_nabla_b, \
        delta_nabla_V, delta_nabla_W, delta_nabla_U = self.backward_pass(seq)

        eta = eta * np.exp(-epoch * learning_decay)

        # RMSProp
        self.cache_w_hid = decay_rate * self.cache_w_hid \
                           + (1 - decay_rate) * delta_nabla_W**2
        self.weights_hidden -= eta * delta_nabla_W / (np.sqrt(self.cache_w_hid) + self.eps)

        self.cache_w_in = decay_rate * self.cache_w_in \
                          + (1 - decay_rate) * delta_nabla_U**2
        self.weights_input -= eta * delta_nabla_U / (np.sqrt(self.cache_w_in) + self.eps)

        self.cache_w_out = decay_rate * self.cache_w_out \
                           + (1 - decay_rate) * delta_nabla_V**2
        self.weights_output -= eta * delta_nabla_V / (np.sqrt(self.cache_w_out) + self.eps)

        self.cache_b_hid = decay_rate * self.cache_b_hid \
                           + (1 - decay_rate) * delta_nabla_b**2
        self.bias_hidden -= eta * delta_nabla_b / (np.sqrt(self.cache_b_hid) + self.eps)

        self.cache_b_out = decay_rate * self.cache_b_out \
                           + (1 - decay_rate) * delta_nabla_c**2
        self.bias_output -= eta * delta_nabla_c / (np.sqrt(self.cache_b_out) + self.eps)


Every variable under the # RMSProp comment follows the update rule, namely:

cache = decay_rate * cache + (1 - decay_rate) * dx**2
x += - learning_rate * dx / (np.sqrt(cache) + eps)
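
To be concrete, one step of this rule applied to a single scalar parameter (arbitrary numbers, just to illustrate) would be:

import numpy as np

decay_rate, eps, learning_rate = 0.9, 1e-4, 0.1
x, cache, dx = 1.0, 0.0, 0.5   # parameter, running cache, gradient

cache = decay_rate * cache + (1 - decay_rate) * dx**2   # 0.025
x += - learning_rate * dx / (np.sqrt(cache) + eps)      # x is now roughly 0.684
print(cache, x)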


I have each cache_ variable declared, followed by the corresponding self.weights_ or self.bias_ update, and would like to write this more compactly. I was looking at using zip(), but I'm not sure how to go about it.

Answer

Judging from your question, I am guessing that you are trying to improve readability/elegance rather than pursue any other kind of optimization here.

You can introduce a function that implements the update rule and call it once for each variable. The trick here is that Python lets you access attributes by name, so you can pass in the names of your cache and weight attributes instead of their values, and the function can write the updated values back for future passes.
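
As a quick standalone illustration of the getattr/setattr trick (the Point class below is just a toy, not part of your network):

class Point(object):
    def __init__(self):
        self.x = 1.0

p = Point()
value = getattr(p, 'x')        # read the attribute named 'x' -> 1.0
setattr(p, 'x', value + 2.0)   # write it back by name
print(p.x)                     # 3.0

Applied to your class, the helper and the calls could look like this: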

def update_rule(self, cache_attr, x_attr, decay_rate, learning_rate, dx):
    cache = getattr(self, cache_attr)
    cache = decay_rate * cache + (1 - decay_rate) * dx**2
    setattr(self, cache_attr, cache)

    x = getattr(self, x_attr)
    x += - learning_rate * dx / (np.sqrt(cache) + self.eps)
    setattr(self, x_attr, x)

def update(self, seq, epoch, eta, decay_rate, learning_decay):
    """Updates the network's weights and biases by applying gradient
    descent using backpropagation through time and RMSPROP. 
    """
    delta_nabla_c, delta_nabla_b,\
    delta_nabla_V, delta_nabla_W, delta_nabla_U = self.backward_pass(seq)

    eta = eta*np.exp(-epoch*learning_decay)

    self.update_rule('cache_w_hid', 'weights_hidden', decay_rate, eta, delta_nabla_W)
    self.update_rule('cache_w_in', 'weights_input', decay_rate, eta, delta_nabla_U)
    self.update_rule('cache_w_out', 'weights_output', decay_rate, eta, delta_nabla_V)
    self.update_rule('cache_b_hid', 'bias_hidden', decay_rate, eta, delta_nabla_b)
    self.update_rule('cache_b_out', 'bias_output', decay_rate, eta, delta_nabla_c)

In fact, you can save yourself a couple of parameters and avoid exposing what is basically a private method by putting update_rule inside update. The nested function can see update's namespace when it is called, so you do not have to pass in decay_rate and eta:

def update(self, seq, epoch, eta, decay_rate, learning_decay):
    """Updates the network's weights and biases by applying gradient
    descent using backpropagation through time and RMSPROP. 
    """

    def update_rule(cache_attr, x_attr, dx):
        cache = getattr(self, cache_attr)
        cache = decay_rate * cache + (1 - decay_rate) * dx**2
        setattr(self, cache_attr, cache)

        x = getattr(self, x_attr)
        x += - eta * dx / (np.sqrt(cache) + self.eps)
        setattr(self, x_attr, x)

    delta_nabla_c, delta_nabla_b,\
    delta_nabla_V, delta_nabla_W, delta_nabla_U = self.backward_pass(seq)

    eta = eta*np.exp(-epoch*learning_decay)

    update_rule('cache_w_hid', 'weights_hidden', delta_nabla_W)
    update_rule('cache_w_in', 'weights_input', delta_nabla_U)
    update_rule('cache_w_out', 'weights_output', delta_nabla_V)
    update_rule('cache_b_hid', 'bias_hidden', delta_nabla_b)
    update_rule('cache_b_out', 'bias_output', delta_nabla_c)
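
If you still want to use the zip() idea from your question, the same helper can also be driven by parallel lists of attribute names and gradients. A minimal sketch, reusing the nested update_rule defined above in place of the five explicit calls:

    grads  = [delta_nabla_W, delta_nabla_U, delta_nabla_V, delta_nabla_b, delta_nabla_c]
    caches = ['cache_w_hid', 'cache_w_in', 'cache_w_out', 'cache_b_hid', 'cache_b_out']
    params = ['weights_hidden', 'weights_input', 'weights_output', 'bias_hidden', 'bias_output']

    # One update per (cache, parameter, gradient) triple.
    for cache_attr, x_attr, dx in zip(caches, params, grads):
        update_rule(cache_attr, x_attr, dx)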