@@ -147,63 +147,104 @@ end
147
147
* (A:: Adjoint{<:Any,<:SparseMatrixCSC{Tv,Ti}} , B:: Adjoint{<:Any,<:SparseMatrixCSC{Tv,Ti}} ) where {Tv,Ti} = spmatmul (copy (A), copy (B))
148
148
* (A:: Transpose{<:Any,<:SparseMatrixCSC{Tv,Ti}} , B:: Transpose{<:Any,<:SparseMatrixCSC{Tv,Ti}} ) where {Tv,Ti} = spmatmul (copy (A), copy (B))
149
149
150
- function spmatmul (A:: SparseMatrixCSC{Tv,Ti} , B:: SparseMatrixCSC{Tv,Ti} ;
151
- sortindices:: Symbol = :sortcols ) where {Tv,Ti}
150
+ # Gustavsen's matrix multiplication algorithm revisited.
151
+ # The result rowval vector is already sorted by construction.
152
+ # The auxiliary Vector{Ti} xb is replaced by a Vector{Bool} of same length.
153
+ # The optional argument controlling a sorting algorithm is obsolete.
154
+ # depending on expected execution speed the sorting of the result column is
155
+ # done by a quicksort of the row indices or by a full scan of the dense result vector.
156
+ # The last is faster, if more than ≈ 1/32 of the result column is nonzero.
157
+ # TODO : extend to SparseMatrixCSCUnion to allow for SubArrays (view(X, :, r)).
158
+ function spmatmul (A:: SparseMatrixCSC{Tv,Ti} , B:: SparseMatrixCSC{Tv,Ti} ) where {Tv,Ti}
152
159
mA, nA = size (A)
153
- mB, nB = size (B)
154
- nA== mB || throw (DimensionMismatch ())
160
+ nB = size (B, 2 )
161
+ nA == size (B, 1 ) || throw (DimensionMismatch ())
155
162
156
- colptrA = A. colptr; rowvalA = A. rowval; nzvalA = A. nzval
157
- colptrB = B. colptr; rowvalB = B. rowval; nzvalB = B. nzval
158
- # TODO : Need better estimation of result space
159
- nnzC = min (mA* nB, length (nzvalA) + length (nzvalB))
163
+ rowvalA = rowvals (A); nzvalA = nonzeros (A)
164
+ rowvalB = rowvals (B); nzvalB = nonzeros (B)
165
+ nnzC = max (estimate_mulsize (mA, nnz (A), nA, nnz (B), nB) * 11 ÷ 10 , mA)
160
166
colptrC = Vector {Ti} (undef, nB+ 1 )
161
167
rowvalC = Vector {Ti} (undef, nnzC)
162
168
nzvalC = Vector {Tv} (undef, nnzC)
169
+ nzpercol = nnzC ÷ max (nB, 1 )
163
170
164
171
@inbounds begin
165
172
ip = 1
166
- xb = zeros (Ti, mA)
167
- x = zeros (Tv, mA)
173
+ xb = fill (false , mA)
168
174
for i in 1 : nB
169
175
if ip + mA - 1 > nnzC
170
- resize! (rowvalC, nnzC + max (nnzC,mA) )
171
- resize! (nzvalC , nnzC + max (nnzC,mA) )
172
- nnzC = length (nzvalC)
176
+ nnzC += max (mA, nnzC >> 2 )
177
+ resize! (rowvalC , nnzC)
178
+ resize! (nzvalC, nnzC )
173
179
end
174
- colptrC[i] = ip
175
- for jp in colptrB[i]: (colptrB[i+ 1 ] - 1 )
180
+ colptrC[i] = ip0 = ip
181
+ k0 = ip - 1
182
+ for jp in nzrange (B, i)
176
183
nzB = nzvalB[jp]
177
184
j = rowvalB[jp]
178
- for kp in colptrA[j] : (colptrA[j + 1 ] - 1 )
185
+ for kp in nzrange (A, j )
179
186
nzC = nzvalA[kp] * nzB
180
187
k = rowvalA[kp]
181
- if xb[k] != i
188
+ if xb[k]
189
+ nzvalC[k+ k0] += nzC
190
+ else
191
+ nzvalC[k+ k0] = nzC
192
+ xb[k] = true
182
193
rowvalC[ip] = k
183
194
ip += 1
184
- xb[k] = i
185
- x[k] = nzC
186
- else
187
- x[k] += nzC
188
195
end
189
196
end
190
197
end
191
- for vp in colptrC[i]: (ip - 1 )
192
- nzvalC[vp] = x[rowvalC[vp]]
198
+ if ip > ip0
199
+ if prefer_sort (ip- k0, mA)
200
+ # in-place sort of indices. Effort: O(nnz*ln(nnz)).
201
+ sort! (rowvalC, ip0, ip- 1 , QuickSort, Base. Order. Forward)
202
+ for vp = ip0: ip- 1
203
+ k = rowvalC[vp]
204
+ xb[k] = false
205
+ nzvalC[vp] = nzvalC[k+ k0]
206
+ end
207
+ else
208
+ # scan result vector (effort O(mA))
209
+ for k = 1 : mA
210
+ if xb[k]
211
+ xb[k] = false
212
+ rowvalC[ip0] = k
213
+ nzvalC[ip0] = nzvalC[k+ k0]
214
+ ip0 += 1
215
+ end
216
+ end
217
+ end
193
218
end
194
219
end
195
220
colptrC[nB+ 1 ] = ip
196
221
end
197
222
198
- deleteat ! (rowvalC, colptrC[ end ] : length (rowvalC) )
199
- deleteat ! (nzvalC, colptrC[ end ] : length (nzvalC) )
223
+ resize ! (rowvalC, ip - 1 )
224
+ resize ! (nzvalC, ip - 1 )
200
225
201
- # The Gustavson algorithm does not guarantee the product to have sorted row indices.
202
- Cunsorted = SparseMatrixCSC (mA, nB, colptrC, rowvalC, nzvalC)
203
- C = SparseArrays. sortSparseMatrixCSC! (Cunsorted, sortindices= sortindices)
226
+ # This modification of Gustavson algorithm has sorted row indices
227
+ C = SparseMatrixCSC (mA, nB, colptrC, rowvalC, nzvalC)
204
228
return C
205
229
end
206
230
231
+ # estimated number of non-zeros in matrix product
232
+ # it is assumed, that the non-zero indices are distributed independently and uniformly
233
+ # in both matrices. Over-estimation is possible if that is not the case.
234
+ function estimate_mulsize (m:: Integer , nnzA:: Integer , n:: Integer , nnzB:: Integer , k:: Integer )
235
+ p = (nnzA / (m * n)) * (nnzB / (n * k))
236
+ p >= 1 ? m* k : p > 0 ? Int (ceil (- expm1 (log1p (- p) * n)* m* k)) : 0 # (1-(1-p)^n)*m*k
237
+ end
238
+
239
+ # determine if sort! shall be used or the whole column be scanned
240
+ # based on empirical data on i7-3610QM CPU
241
+ # measuring runtimes of the scanning and sorting loops of the algorithm.
242
+ # The parameters 6 and 3 might be modified for different architectures.
243
+ prefer_sort (nz:: Integer , m:: Integer ) = m > 6 && 3 * ilog2 (nz) * nz < m
244
+
245
+ # minimal number of bits required to represent integer; ilog2(n) >= log2(n)
246
+ ilog2 (n:: Integer ) = sizeof (n)<< 3 - leading_zeros (n)
247
+
207
248
# Frobenius dot/inner product: trace(A'B)
208
249
function dot (A:: SparseMatrixCSC{T1,S1} ,B:: SparseMatrixCSC{T2,S2} ) where {T1,T2,S1,S2}
209
250
m, n = size (A)
0 commit comments