I want to call mkl.mkl_scsrmultcsr from python. The goal is to calculate a sparse matrix C in compressed sparse row format. Sparse matrix C is the matrix product between A and transpose of A, where A is also a sparse matrix in csr format. When calculating C = A dot (A.T) with scipy, scipy seems (?) to allocate new memory for holding transpose of A (A.T), and definitely allocates memory for a new C matrix (This means I can't use an existing C matrix). So, I want to try to use the mkl c function directly to decrease memory usage.
The following doesn't work because the program didn't modify the C matrix.
How to fix that?
Here is an answer that works for another mkl function. In that answer, mkl function was faster by 4 times.
import scipy.sparse as spsp
import numpy as np
import ctypes
from ctypes import *
import multiprocessing as mp
# Load the share library
mkl = ctypes.cdll.LoadLibrary("libmkl_rt.so")
class ctype_csr_mat(object):
def __init__(self, arr_in, c_type=c_float, clear=False):
dtype_to_ctype = {}
dtype_to_ctype[c_float] = np.float32
dtype_to_ctype[c_double] = np.float64
dtype_to_ctype[c_int] = np.int16
dtype = dtype_to_ctype[c_type]
self.csr = spsp.csr_matrix(arr_in, dtype=dtype)
# Cannot use .astype method on self.csr
# after this line, or converting to ctype
# array won't work as expected.
if clear == True:
self.csr.data[:] = 0
self.csr.indptr[:] = 0
self.csr.indices[:] = 0
self.c_data = self.csr.data.ctypes.data_as(POINTER(c_type)) #*self.csr.data.size
self.c_indptr = self.csr.indptr.ctypes.data_as (POINTER(c_int))
self.c_indices = self.csr.indices.ctypes.data_as(POINTER(c_int))
def show(self, indent=4):
sp = ' '*indent
to_print = [ self.csr.data,
self.c_data[0:self.csr.data.size],
self.c_indptr[0:self.csr.indptr.size],
self.c_indices[0:self.csr.indices.size] ]
name = ['csr data','c_data', 'c_indptr', 'c_indices']
for i in range(4):
print sp + name[i] + ': ' + str(to_print[i])
def mkl_dot(self,B, C, mkl_pass):
# calculate A.T * B and put answer in C
# by several passes.
assert type(B) == ctype_csr_mat
assert type(C) == ctype_csr_mat
request_list = [c_int(0),c_int(1), c_int(2)]
request1 = request_list[mkl_pass]; request = byref(request1)
(a, ja, ia) = (self.c_data, self.c_indptr, self.c_indices)
(b, jb, ib) = (B.c_data, B.c_indptr, B.c_indices)
(c, jc, ic) = (C.c_data, C.c_indptr, C.c_indices)
(mm, nn) = self.csr.shape
# Transpose matrix A before calculating
# matrix product.
trans1 = c_char('T') ; trans = byref(trans1)
sort1 = c_int(7) ; sort = byref(sort1)
m = byref(c_int(mm)) ; n = byref(c_int(nn)) ; k = byref(c_int(mm))
nzmax1 = c_int(1) ; nzmax = byref(nzmax1)
info1 = c_int(0) ; info = byref(info1)
mkl.mkl_scsrmultcsr(trans, request, sort,
m, n, k,
a, ja, ia,
b, jb, ib,
c, jc, ic,
nzmax, info)
return info1.value
def test():
# Make Test Data
Ao = [[1,0,0],
[1,1,1],
[0,0,1]]
AA = np.array(Ao, dtype=np.single) # save the original matrix
A = ctype_csr_mat(Ao)
B = A
C = ctype_csr_mat(np.ones((3,3)), clear=True)
def show_all(A,B,C):
print("A:")
A.show()
print("B:")
B.show()
print("C:")
C.show()
show_all(A,B,C)
for mkl_pass in [0,1,2]:
print "=== pass: %d ===" % (mkl_pass)
info = A.mkl_dot(B,C,mkl_pass)
print("info:")
print(" %d" % info)
show_all(A,B,C)
print "=== Correct answer ==="
title = ["A dot A.T:", "A dot A:"]
ans = [(AA.dot(AA.T)), (AA.dot(AA))]
for i in range(2):
ans_mat = ctype_csr_mat(ans[i])
print(title[i])
print(ans[i])
ans_mat.show()
test()
A:
csr data: [ 1. 1. 1. 1. 1.]
c_data: [1.0, 1.0, 1.0, 1.0, 1.0]
c_indptr: [0, 1, 4, 5]
c_indices: [0, 0, 1, 2, 2]
B:
csr data: [ 1. 1. 1. 1. 1.]
c_data: [1.0, 1.0, 1.0, 1.0, 1.0]
c_indptr: [0, 1, 4, 5]
c_indices: [0, 0, 1, 2, 2]
C:
csr data: [ 0. 0. 0. 0. 0. 0. 0. 0. 0.]
c_data: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
c_indptr: [0, 0, 0, 0]
c_indices: [0, 0, 0, 0, 0, 0, 0, 0, 0]
=== pass: 0 ===
info:
0
A:
csr data: [ 1. 1. 1. 1. 1.]
c_data: [1.0, 1.0, 1.0, 1.0, 1.0]
c_indptr: [0, 1, 4, 5]
c_indices: [0, 0, 1, 2, 2]
B:
csr data: [ 1. 1. 1. 1. 1.]
c_data: [1.0, 1.0, 1.0, 1.0, 1.0]
c_indptr: [0, 1, 4, 5]
c_indices: [0, 0, 1, 2, 2]
C:
csr data: [ 0. 0. 0. 0. 0. 0. 0. 0. 0.]
c_data: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
c_indptr: [0, 0, 0, 0]
c_indices: [1, 1, 1, 1, 0, 0, 0, 0, 0]
=== pass: 1 ===
info:
0
A:
csr data: [ 1. 1. 1. 1. 1.]
c_data: [1.0, 1.0, 1.0, 1.0, 1.0]
c_indptr: [0, 1, 4, 5]
c_indices: [0, 0, 1, 2, 2]
B:
csr data: [ 1. 1. 1. 1. 1.]
c_data: [1.0, 1.0, 1.0, 1.0, 1.0]
c_indptr: [0, 1, 4, 5]
c_indices: [0, 0, 1, 2, 2]
C:
csr data: [ 0. 0. 0. 0. 0. 0. 0. 0. 0.]
c_data: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
c_indptr: [0, 0, 0, 0]
c_indices: [1, 1, 1, 1, 0, 0, 0, 0, 0]
=== pass: 2 ===
info:
0
A:
csr data: [ 1. 1. 1. 1. 1.]
c_data: [1.0, 1.0, 1.0, 1.0, 1.0]
c_indptr: [0, 1, 4, 5]
c_indices: [0, 0, 1, 2, 2]
B:
csr data: [ 1. 1. 1. 1. 1.]
c_data: [1.0, 1.0, 1.0, 1.0, 1.0]
c_indptr: [0, 1, 4, 5]
c_indices: [0, 0, 1, 2, 2]
C:
csr data: [ 0. 0. 0. 0. 0. 0. 0. 0. 0.]
c_data: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
c_indptr: [0, 0, 0, 0]
c_indices: [1, 1, 1, 1, 0, 0, 0, 0, 0]
=== Correct answer ===
A dot A.T:
[[ 1. 1. 0.]
[ 1. 3. 1.]
[ 0. 1. 1.]]
csr data: [ 1. 1. 1. 3. 1. 1. 1.]
c_data: [1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0]
c_indptr: [0, 2, 5, 7]
c_indices: [0, 1, 0, 1, 2, 1, 2]
A dot A:
[[ 1. 0. 0.]
[ 2. 1. 2.]
[ 0. 0. 1.]]
csr data: [ 1. 2. 1. 2. 1.]
c_data: [1.0, 2.0, 1.0, 2.0, 1.0]
c_indptr: [0, 1, 4, 5]
c_indices: [0, 0, 1, 2, 2]
from ctypes import *
import scipy.sparse as spsp
import numpy as np
# Load the share library
mkl = cdll.LoadLibrary("libmkl_rt.so")
def print_mat(mat, m, n,indent=8):
for i in xrange(0,m):
print " "*indent,
for j in xrange(0,n):
print mat[i*n+j],
# Initialize scalar data
trans_pointer = byref(c_char('T')) # trans = 'T' or 't' or 'C' or 'c', then C := A'*B.
request_l = [c_int(0),
c_int(1),
c_int(2)] # somehow, the calculation is performed in two passes
sort_pointer = byref(c_int(0))
m_pointer = byref(c_int(3)) # Number of rows of matrix A
n_pointer = byref(c_int(3)) # Number of columns of matrix A
k_pointer = byref(c_int(3)) # Number of columns of matrix B
a_type = c_double*5 # Array containing non-zero elements of the matrix A.
# This corresponds to data array of csr_matrix
# Its length is equal to #non zero elements in A
# (Can this be longer than actual #non-zero elements?)
a = a_type(1.0, 1.0, 1.0, 1.0, 1.0)
a_pointer = byref(a)
ja_type = c_int*5 # Array of column indices of all non-zero elements of A.
# This corresponds to the indices array of csr_matrix
ja = ja_type(1, 1, 2, 3, 3)
ja_pointer = byref(ja)
ia_type = c_int*4 # Array of length m+1.
# a[ia[i]:ia[i+1]] is the value of nonzero entries of
# the ith row of A.
# ja[ia[i]:ia[i+1]] is the column indices of nonzero
# entries of the ith row of A
# This corresponds to the indptr array of csr_matrix
ia = ia_type(1, 2, 5, 6)
ia_pointer = byref(ia)
####
b_pointer = a_pointer
jb_pointer = ja_pointer
ib_pointer = ia_pointer
#b = a
#jb = ja
#ib = ia
####
nzmax_pointer = byref(c_int(3*3)) # length of arrays c and jc. (which are data and
# indices of csr_matrix). So this is the number of
# nonzero elements of matrix C
#
# This parameter is used only if request=0.
# The routine stops calculation if the number of
# elements in the result matrix C exceeds the
# specified value of nzmax.
nz = 9 # Number of nonzero elements
c_type = c_double*nz # Array of nonzero entries of matrix C
c = c_type(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
c_pointer = byref(c)
jc_type = c_int*nz
jc = jc_type(0, 0, 0, 0, 0, 0, 0, 0, 0)
jc_pointer = byref(jc)
ic_type = c_int*9 # Array of length m + 1 when trans = 'N' or 'n', or n + 1 otherwise.
ic = ic_type(0, 0, 0, 0, 0, 0, 0, 0, 0)
ic_pointer = byref(ic)
print('Original')
print(' c')
print_mat(c, 1, 9)
print(' jc')
print_mat(jc, 1, 9)
print(' ic')
print_mat(ic, 1, 9)
for ii in [0,1,2]:
request_pointer = byref(request_l[ii])
print('request %d' % (ii))
info = c_int(-3)
info_pointer = byref(info)
mkl.mkl_dcsrmultcsr(trans_pointer, request_pointer, sort_pointer,
m_pointer, n_pointer, k_pointer,
a_pointer, ja_pointer, ia_pointer,
b_pointer, jb_pointer, ib_pointer,
c_pointer, jc_pointer, ic_pointer,
nzmax_pointer, info_pointer)
print(' info %d' % (info.value))
print(' c')
print_mat(c, 1, 9)
print(' jc')
print_mat(jc, 1, 9)
print(' ic')
print_mat(ic, 1, 9)
A_data = np.array([1,1,1,1,1])
A_indptr = np.array([0,1,4,5])
A_indices = np.array([0,0,1,2,2])
A = spsp.csr_matrix( (A_data, A_indices, A_indptr), shape=(3,3))
print( ("A matrix\n%s"%A.todense()).replace('\n','\n'+' ') )
print("C = (A.T).dot(A)")
C = (A.T).dot(A)
print( ("C matrix\n%s"%C.todense()).replace('\n','\n'+' ') )
print( "C.data : %s" % C.data )
print( "C.indices + 1: %s" % (C.indices+1) )
print( "C.indptr + 1 : %s" % (C.indptr+1) )
Original
c
0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
jc
0 0 0 0 0 0 0 0 0
ic
0 0 0 0 0 0 0 0 0
request 0
info 0
c
2.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0
jc
1 2 3 1 2 3 1 2 3
ic
1 4 7 10 0 0 0 0 0
request 1
info 0
c
2.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0
jc
1 2 3 1 2 3 1 2 3
ic
1 4 7 10 0 0 0 0 0
request 2
info 0
c
2.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0
jc
1 2 3 1 2 3 1 2 3
ic
1 4 7 10 0 0 0 0 0
A matrix
[[1 0 0]
[1 1 1]
[0 0 1]]
C = (A.T).dot(A)
C matrix
[[2 1 1]
[1 1 1]
[1 1 2]]
C.data : [1 1 2 1 1 1 2 1 1]
C.indices + 1: [3 2 1 3 2 1 3 2 1]
C.indptr + 1 : [ 1 4 7 10]
mkl_scsrmultcsr(&ta, &r[1], &sort, &m, &m, &m, a, ja, ia, a, ja, ia, c, jc, ic, &nzmax, &info);
Look at the Python code for the scipy sparse product. Notice that it calls the compiled code in 2 passes.
It looks like the mkl code does the same thing
https://software.intel.com/en-us/node/468640
If request=1, the routine computes only values of the array ic of length m + 1, the memory for this array must be allocated beforehand. On exit the value ic(m+1) - 1 is the actual number of the elements in the arrays c and jc.
If request=2, the routine has been called previously with the parameter request=1, the output arrays jc and c are allocated in the calling program and they are of the length ic(m+1) - 1 at least.
You first allocated ic
based on the number of rows of C
(you know that from the inputs), and call the mkl code with request=1
.
For request=2
you have to allocate c
and jc
arrays, based on the size in ic(m+1) - 1
. This is not the same as the number of nnz
in the input arrays.
You are using request1 = c_int(0)
, which requires that the c
arrays be the correct size - which you don't know without actually performing the dot
(or a request 1
).
==================
File: /usr/lib/python3/dist-packages/scipy/sparse/compressed.py
Definition: A._mul_sparse_matrix(self, other)
pass 1 allocates indptr
(note size), and passes the pointers (data doesn't matter at this pass)
indptr = np.empty(major_axis + 1, dtype=idx_dtype)
fn = getattr(_sparsetools, self.format + '_matmat_pass1')
fn(M, N,
np.asarray(self.indptr, dtype=idx_dtype),
np.asarray(self.indices, dtype=idx_dtype),
np.asarray(other.indptr, dtype=idx_dtype),
np.asarray(other.indices, dtype=idx_dtype),
indptr)
nnz = indptr[-1]
pass 2 allocates indptr
(different size), and based on nnz
indices
and data
.
indptr = np.asarray(indptr, dtype=idx_dtype)
indices = np.empty(nnz, dtype=idx_dtype)
data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
fn = getattr(_sparsetools, self.format + '_matmat_pass2')
fn(M, N, np.asarray(self.indptr, dtype=idx_dtype),
np.asarray(self.indices, dtype=idx_dtype),
self.data,
np.asarray(other.indptr, dtype=idx_dtype),
np.asarray(other.indices, dtype=idx_dtype),
other.data,
indptr, indices, data)
Last make a new array using these arrays.
return self.__class__((data,indices,indptr),shape=(M,N))
The mkl
library should be used in the same way.
===================
https://github.com/scipy/scipy/blob/master/scipy/sparse/sparsetools/csr.h
has c code for csr_matmat_pass1
and csr_matmat_pass2
====================
In case it helps, here's a pure Python implementation of these passes. A literal translation without any attempt to take advantage of any array operations.
def pass1(A, B):
nrow,ncol=A.shape
Aptr=A.indptr
Bptr=B.indptr
Cp=np.zeros(nrow+1,int)
mask=np.zeros(ncol,int)-1
nnz=0
for i in range(nrow):
row_nnz=0
for jj in range(Aptr[i],Aptr[i+1]):
j=A.indices[jj]
for kk in range(Bptr[j],Bptr[j+1]):
k=B.indices[kk]
if mask[k]!=i:
mask[k]=i
row_nnz += 1
nnz += row_nnz
Cp[i+1]=nnz
return Cp
def pass2(A,B,Cnnz):
nrow,ncol=A.shape
Ap,Aj,Ax=A.indptr,A.indices,A.data
Bp,Bj,Bx=B.indptr,B.indices,B.data
next = np.zeros(ncol,int)-1
sums = np.zeros(ncol,A.dtype)
Cp=np.zeros(nrow+1,int)
Cj=np.zeros(Cnnz,int)
Cx=np.zeros(Cnnz,A.dtype)
nnz = 0
for i in range(nrow):
head = -2
length = 0
for jj in range(Ap[i], Ap[i+1]):
j, v = Aj[jj], Ax[jj]
for kk in range(Bp[j], Bp[j+1]):
k = Bj[kk]
sums[k] += v*Bx[kk]
if (next[k]==-1):
next[k], head=head, k
length += 1
print(i,sums, next)
for _ in range(length):
if sums[head] !=0:
Cj[nnz], Cx[nnz] = head, sums[head]
nnz += 1
temp = head
head = next[head]
next[temp], sums[temp] = -1, 0
Cp[i+1] = nnz
return Cp, Cj, Cx
def pass0(A,B):
Cp = pass1(A,B)
nnz = Cp[-1]
Cp,Cj,Cx=pass2(A,B,nnz)
N,M=A.shape[0], B.shape[1]
C=sparse.csr_matrix((Cx, Cj, Cp), shape=(N,M))
return C
Both A
and B
have to be csr
. It using a transpose it needs conversion, e.g. B = A.T.tocsr()
.