I am currently a bit astonished that this takes so long. I want to calculate the sum of matrix elements, weighted by their distance to the diagonal. The square matrix contains only nonnegative integer elements.
#!/usr/bin/env python
"""Calculate a score for a square matrix."""
import random
random.seed(0)
def calculate_score(cm):
"""
Calculate a score how close big elements of cm are to the diagonal.
Examples

>>> cm = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
>>> calculate_score(cm)
32
"""
score = 0
for i, line in enumerate(cm):
for j, el in enumerate(line):
score += el * abs(i  j)
return score
def main(n):
import time
import numpy as np
score_calculations = 10**3
t = 0
for step in range(score_calculations):
cm = np.random.randint(0, 150000, size=(n, n))
t0 = time.time()
calculate_score(cm)
t1 = time.time()
t += (t1  t0)
print("{:0.2f} scores / sec".format(score_calculations / t))
if __name__ == '__main__':
main(369)
kernprof l v main.py
range(n)
Wrote profile results to main.py.lprof
Timer unit: 1e06 s
Total time: 109.124 s
File: main.py
Function: calculate_score at line 9
Line # Hits Time Per Hit % Time Line Contents
==============================================================
9 @profile
10 def calculate_score(cm):
11 """
12 Calculate a score how close big elements of cm are to the diagonal.
13
14 Examples
15 
16 >>> cm = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
17 >>> calculate_score(cm)
18 32
19 """
20 1000 619 0.6 0.0 score = 0
21 370000 180693 0.5 0.2 for i, line in enumerate(cm):
22 136530000 43691655 0.3 40.0 for j, el in enumerate(line):
23 136161000 65250190 0.5 59.8 score += el * abs(i  j)
24 1000 386 0.4 0.0 return score
Here's a vectorized approach using broadcasting
to compute those weights
and then using matrixmultiplication
with np.tensordot
for those sumreductions

def calculate_score_vectorized(cm):
m,n = cm.shape
wghts = np.abs(np.arange(n)  np.arange(m)[:,None])
return np.tensordot(cm,wghts, axes=((0,1),(0,1)))
The last step of sumreduction
could also be computed with np.einsum

np.einsum('ij,ij',cm,wghts)
Also simply with elementwise multiplication and summation 
(cm*wghts).sum()
Runtime test 
In [104]: n = 369
In [105]: cm = np.random.randint(0, 150000, size=(n, n))
In [106]: calculate_score(cm)
Out[106]: 1257948732168
In [107]: calculate_score_vectorized(cm)
Out[107]: array(1257948732168)
In [108]: %timeit calculate_score(cm)
10 loops, best of 3: 31.4 ms per loop
In [109]: %timeit calculate_score_vectorized(cm)
1000 loops, best of 3: 675 µs per loop
In [110]: 31400/675.0
Out[110]: 46.51851851851852
46x+
speedup there for the given dataset sizes.
As mentioned in the comments, if the shape of input arrays stays the same, we could save the weights wghts
and reuse them with those sumreduction
methods discussed earlier for further boost.
#!/usr/bin/env python
"""Calculate a score for a square matrix."""
import random
random.seed(0)
import numpy as np
def calculate_score(cm, weights):
"""
Calculate a score how close big elements of cm are to the diagonal.
Examples

>>> cm = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
>>> weights = calculate_weight_matrix(3)
>>> calculate_score(cm, weights)
32
"""
return int(np.tensordot(cm, weights, axes=((0, 1), (0, 1))))
def calculate_weight_matrix(n):
"""
Calculate the weights for each position.
The weight is the distance to the diagonal.
"""
weights = np.abs(np.arange(n)  np.arange(n)[:, None])
return weights
def measure_time(n):
"""Measure the time of calculate_score for n x n matrices."""
import time
import numpy as np
score_calculations = 10**3
t = 0
weights = calculate_weight_matrix(n)
for step in range(score_calculations):
cm = np.random.randint(0, 150000, size=(n, n))
t0 = time.time()
calculate_score(cm, weights)
t1 = time.time()
t += (t1  t0)
print("{:0.2f} scores / sec".format(score_calculations / t))
if __name__ == '__main__':
import doctest
doctest.testmod()
measure_time(369)
This gives 10044.31 scores / sec  10381.71 scores / sec
(before: 32.47 scores / sec). That is a 309× speedup!