I am currently a bit astonished that this takes so long. I want to calculate the sum of matrix elements, weighted by their distance to the diagonal. The square matrix contains only nonnegative integer elements.
#!/usr/bin/env python
"""Calculate a score for a square matrix."""
import random
random.seed(0)
def calculate_score(cm):
    """
    Calculate a score how close big elements of cm are to the diagonal.

    Every element is multiplied by its distance ``abs(i - j)`` to the main
    diagonal (row index ``i``, column index ``j``) and the products are
    summed, so elements on the diagonal contribute nothing.

    Parameters
    ----------
    cm : iterable of iterables of numbers
        The matrix, given row by row (e.g. a 2-D numpy array or a list of
        lists).

    Returns
    -------
    score
        Sum of ``cm[i][j] * abs(i - j)`` over all elements; ``0`` when ``cm``
        has no elements.

    Examples
    --------
    >>> import numpy as np
    >>> cm = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
    >>> int(calculate_score(cm))
    32
    """
    score = 0
    for i, line in enumerate(cm):
        for j, el in enumerate(line):
            # abs(i - j) is the distance of position (i, j) to the diagonal.
            score += el * abs(i - j)
    return score
def main(n):
    """
    Benchmark calculate_score on random n x n matrices.

    Generates 1000 random integer matrices, times only the call to
    ``calculate_score`` for each of them and prints the resulting throughput
    in scores per second.

    Parameters
    ----------
    n : int
        Number of rows and columns of the random test matrices.
    """
    import time
    import numpy as np
    score_calculations = 10**3
    t = 0
    for _ in range(score_calculations):
        # NOTE: the module-level random.seed(0) does not seed numpy's
        # generator, so these matrices differ from run to run.
        cm = np.random.randint(0, 150000, size=(n, n))
        # perf_counter is a monotonic high-resolution clock; time.time() can
        # be too coarse for short calls and may make the total come out as 0.
        t0 = time.perf_counter()
        calculate_score(cm)
        t1 = time.perf_counter()
        t += (t1 - t0)
    print("{:0.2f} scores / sec".format(score_calculations / t))
# Run the benchmark only when executed as a script, not when imported.
if __name__ == '__main__':
    main(369)
Here's a vectorized approach that uses broadcasting to compute the weights and then matrix multiplication with np.tensordot for the sum-reductions:

def calculate_score_vectorized(cm):
    """
    Calculate the diagonal-distance score of a matrix without Python loops.

    Parameters
    ----------
    cm : array_like, shape (m, n)
        2-D matrix of numbers.

    Returns
    -------
    numpy.ndarray
        0-dimensional array holding the sum of ``cm[i, j] * abs(i - j)``
        over all elements.
    """
    cm = np.asarray(cm)
    m, n = cm.shape
    # Broadcasting the column indices (row vector) against the row indices
    # (column vector) yields the (m, n) grid of distances |j - i|.
    wghts = np.abs(np.arange(n) - np.arange(m)[:, None])
    # Contract both axes at once: element-wise product followed by a full sum.
    return np.tensordot(cm, wghts, axes=((0, 1), (0, 1)))
The final sum-reduction step could also be computed with np.einsum:

np.einsum('ij,ij',cm,wghts)
Or simply with element-wise multiplication followed by a summation:
(cm*wghts).sum()
Runtime test 
In [104]: n = 369
In [105]: cm = np.random.randint(0, 150000, size=(n, n))
In [106]: calculate_score(cm)
Out[106]: 1257948732168
In [107]: calculate_score_vectorized(cm)
Out[107]: array(1257948732168)
In [108]: %timeit calculate_score(cm)
10 loops, best of 3: 31.4 ms per loop
In [109]: %timeit calculate_score_vectorized(cm)
1000 loops, best of 3: 675 µs per loop
In [110]: 31400/675.0
Out[110]: 46.51851851851852
That is a 46x+ speedup for the given dataset size.
As mentioned in the comments, if the shape of the input arrays stays the same, we can compute the weights wghts once and reuse them with the sum-reduction methods discussed earlier for a further boost.
#!/usr/bin/env python
"""Calculate a score for a square matrix."""
import random
random.seed(0)
import numpy as np
def calculate_score(cm, weights):
    """
    Calculate a score how close big elements of cm are to the diagonal.

    Parameters
    ----------
    cm : numpy.ndarray, shape (n, n)
        The matrix to score.
    weights : numpy.ndarray, shape (n, n)
        Weight of every position, e.g. as returned by
        ``calculate_weight_matrix``.

    Returns
    -------
    int
        Sum of the element-wise products ``cm[i, j] * weights[i, j]``.

    Examples
    --------
    >>> cm = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
    >>> weights = calculate_weight_matrix(3)
    >>> calculate_score(cm, weights)
    32
    """
    # Contracting both axes of both operands is an element-wise multiply
    # followed by a full sum; int() unwraps the resulting 0-d array.
    return int(np.tensordot(cm, weights, axes=((0, 1), (0, 1))))
def calculate_weight_matrix(n):
    """
    Calculate the weights for each position.

    The weight is the distance to the diagonal.

    Parameters
    ----------
    n : int
        Number of rows and columns of the weight matrix.

    Returns
    -------
    numpy.ndarray, shape (n, n)
        Integer matrix whose entry ``[i, j]`` is ``abs(j - i)``.
    """
    indices = np.arange(n)
    # Row vector minus column vector broadcasts to the full (n, n) grid.
    weights = np.abs(indices - indices[:, None])
    return weights
def measure_time(n):
    """
    Measure the time of calculate_score for n x n matrices.

    The weight matrix is computed once up front and reused for all 1000
    random matrices; only the calls to ``calculate_score`` are timed. The
    resulting throughput is printed in scores per second.

    Parameters
    ----------
    n : int
        Number of rows and columns of the random test matrices.
    """
    import time
    import numpy as np
    score_calculations = 10**3
    t = 0
    weights = calculate_weight_matrix(n)
    for _ in range(score_calculations):
        # NOTE: the module-level random.seed(0) does not seed numpy's
        # generator, so these matrices differ from run to run.
        cm = np.random.randint(0, 150000, size=(n, n))
        # perf_counter is a monotonic high-resolution clock; time.time() can
        # be too coarse for sub-millisecond calls and may make the total
        # come out as 0, breaking the division below.
        t0 = time.perf_counter()
        calculate_score(cm, weights)
        t1 = time.perf_counter()
        t += (t1 - t0)
    print("{:0.2f} scores / sec".format(score_calculations / t))
# When run as a script: verify the docstring examples, then benchmark.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
    measure_time(369)
This gives 10044.31 scores / sec – 10381.71 scores / sec
(before: 32.47 scores / sec). That is a 309× speedup!