I'd like to know how I might be able to transform this problem to reduce the overhead of the
np.sum()
input
shape=(1000, 36)
nodes_nbrs
nodes_nbrs = {0: [0, 1],
1: [1, 0, 2],
2: [2, 1],
...}
0
0
1
1
1
0
2
nodes_nbrs
output = np.zeros(shape=input.shape)
for k, v in nodes_nbrs.items():
output[k] = np.sum(input[v], axis=0)
shape=(1000, 36)
shape=(~1E(56), 36)
np.sum
cython
for
cython
np.sum
np.sum
npsum
npsum
for
np.sum
np.add.reduce
npsum
npsum
for
autograd
numba
nodes_nbrs
numpy
np.dot
shape=(10^n, 10^n)
scipy
autograd
dot
scipy
If scipy.sparse is not an option, one way you might approach this would be to massage your data so that you can use vectorized functions to do everything in the compiled layer. If you change your neighbors dictionary into a twodimensional array with appropriate flags for missing values, you can use np.take
to extract the data you want and then do a single sum()
call.
Here's an example of what I have in mind:
import numpy as np
def make_data(N=100):
X = np.random.randint(1, 20, (N, 36))
connections = np.random.randint(2, 5, N)
nbrs = {i: list(np.random.choice(N, c))
for i, c in enumerate(connections)}
return X, nbrs
def original_solution(X, nbrs):
output = np.zeros(shape=X.shape)
for k, v in nbrs.items():
output[k] = np.sum(X[v], axis=0)
return output
def vectorized_solution(X, nbrs):
# Make neighbors all the same length, filling with 1
new_nbrs = np.full((X.shape[0], max(map(len, nbrs.values()))), 1, dtype=int)
for i, v in nbrs.items():
new_nbrs[i, :len(v)] = v
# add a row of zeros to X
new_X = np.vstack([X, 0 * X[0]])
# compute the sums
return new_X.take(new_nbrs, 0).sum(1)
Now we can confirm that the results match:
>>> X, nbrs = make_data(100)
>>> np.allclose(original_solution(X, nbrs),
vectorized_solution(X, nbrs))
True
And we can time things to see the speedup:
X, nbrs = make_data(1000)
%timeit original_solution(X, nbrs)
%timeit vectorized_solution(X, nbrs)
# 100 loops, best of 3: 13.7 ms per loop
# 100 loops, best of 3: 1.89 ms per loop
Going up to larger sizes:
X, nbrs = make_data(100000)
%timeit original_solution(X, nbrs)
%timeit vectorized_solution(X, nbrs)
1 loop, best of 3: 1.42 s per loop
1 loop, best of 3: 249 ms per loop
It's about a factor of 510 faster, which may be good enough for your purposes (though this will heavily depend on the exact characteristics of your nbrs
dictionary).
Edit: Just for fun, I tried a couple other approaches, one using numpy.add.reduceat
, one using pandas.groupby
, and one using scipy.sparse
. It seems that the vectorized approach I originally proposed above is probably the best bet. Here they are for reference:
from itertools import chain
def reduceat_solution(X, nbrs):
ind, j = np.transpose([[i, len(v)] for i, v in nbrs.items()])
i = list(chain(*(nbrs[i] for i in ind)))
j = np.concatenate([[0], np.cumsum(j)[:1]])
return np.add.reduceat(X[i], j)[ind]
np.allclose(original_solution(X, nbrs),
reduceat_solution(X, nbrs))
# True

import pandas as pd
def groupby_solution(X, nbrs):
i, j = np.transpose([[k, vi] for k, v in nbrs.items() for vi in v])
return pd.groupby(pd.DataFrame(X[j]), i).sum().values
np.allclose(original_solution(X, nbrs),
groupby_solution(X, nbrs))
# True

from scipy.sparse import csr_matrix
from itertools import chain
def sparse_solution(X, nbrs):
items = (([i]*len(col), col, [1]*len(col)) for i, col in nbrs.items())
rows, cols, data = (np.array(list(chain(*a))) for a in zip(*items))
M = csr_matrix((data, (rows, cols)))
return M.dot(X)
np.allclose(original_solution(X, nbrs),
sparse_solution(X, nbrs))
# True
And all the timings together:
X, nbrs = make_data(100000)
%timeit original_solution(X, nbrs)
%timeit vectorized_solution(X, nbrs)
%timeit reduceat_solution(X, nbrs)
%timeit groupby_solution(X, nbrs)
%timeit sparse_solution(X, nbrs)
# 1 loop, best of 3: 1.46 s per loop
# 1 loop, best of 3: 268 ms per loop
# 1 loop, best of 3: 416 ms per loop
# 1 loop, best of 3: 657 ms per loop
# 1 loop, best of 3: 282 ms per loop