@@ -30,13 +30,25 @@ import org.apache.spark.storage.StorageLevel
30
30
*
31
31
* @param numRowBlocks Number of blocks that form the rows of the matrix.
32
32
* @param numColBlocks Number of blocks that form the columns of the matrix.
33
+ * @param suggestedNumPartitions Number of partitions to partition the rdd into. The final number
34
+ * of partitions will be set to `min(suggestedNumPartitions,
35
+ * numRowBlocks * numColBlocks)`, because setting the number of
36
+ * partitions greater than the number of sub matrices is not useful.
33
37
*/
34
38
private [mllib] class GridPartitioner (
35
39
val numRowBlocks : Int ,
36
40
val numColBlocks : Int ,
37
- val numParts : Int ) extends Partitioner {
41
+ suggestedNumPartitions : Int ) extends Partitioner {
38
42
// Having the number of partitions greater than the number of sub matrices does not help
39
- override val numPartitions = math.min(numParts, numRowBlocks * numColBlocks)
43
+ override val numPartitions = math.min(suggestedNumPartitions, numRowBlocks * numColBlocks)
44
+
45
+ val totalBlocks = numRowBlocks.toLong * numColBlocks
46
+ // Gives the number of blocks that need to be in each partition
47
+ val targetNumBlocksPerPartition = math.ceil(totalBlocks * 1.0 / numPartitions).toInt
48
+ // Number of neighboring blocks to take in each row
49
+ val numRowBlocksPerPartition = math.ceil(numRowBlocks * 1.0 / targetNumBlocksPerPartition).toInt
50
+ // Number of neighboring blocks to take in each column
51
+ val numColBlocksPerPartition = math.ceil(numColBlocks * 1.0 / targetNumBlocksPerPartition).toInt
40
52
41
53
/**
42
54
* Returns the index of the partition the SubMatrix belongs to. Tries to achieve block wise
@@ -51,27 +63,20 @@ private[mllib] class GridPartitioner(
51
63
override def getPartition (key : Any ): Int = {
52
64
key match {
53
65
case (blockRowIndex : Int , blockColIndex : Int ) =>
54
- getBlockId (blockRowIndex, blockColIndex)
66
+ getPartitionId (blockRowIndex, blockColIndex)
55
67
case (blockRowIndex : Int , innerIndex : Int , blockColIndex : Int ) =>
56
- getBlockId (blockRowIndex, blockColIndex)
68
+ getPartitionId (blockRowIndex, blockColIndex)
57
69
case _ =>
58
70
throw new IllegalArgumentException (s " Unrecognized key. key: $key" )
59
71
}
60
72
}
61
73
62
74
/** Partitions sub-matrices as blocks with neighboring sub-matrices. */
63
- private def getBlockId (blockRowIndex : Int , blockColIndex : Int ): Int = {
64
- val totalBlocks = numRowBlocks * numColBlocks
65
- // Gives the number of blocks that need to be in each partition
66
- val partitionRatio = math.ceil(totalBlocks * 1.0 / numPartitions).toInt
67
- // Number of neighboring blocks to take in each row
68
- val subBlocksPerRow = math.ceil(numRowBlocks * 1.0 / partitionRatio).toInt
69
- // Number of neighboring blocks to take in each column
70
- val subBlocksPerCol = math.ceil(numColBlocks * 1.0 / partitionRatio).toInt
75
+ private def getPartitionId (blockRowIndex : Int , blockColIndex : Int ): Int = {
71
76
// Coordinates of the block
72
- val i = blockRowIndex / subBlocksPerRow
73
- val j = blockColIndex / subBlocksPerCol
74
- val blocksPerRow = math.ceil(numRowBlocks * 1.0 / subBlocksPerRow ).toInt
77
+ val i = blockRowIndex / numRowBlocksPerPartition
78
+ val j = blockColIndex / numColBlocksPerPartition
79
+ val blocksPerRow = math.ceil(numRowBlocks * 1.0 / numRowBlocksPerPartition ).toInt
75
80
j * blocksPerRow + i
76
81
}
77
82
@@ -91,10 +96,10 @@ private[mllib] class GridPartitioner(
91
96
* Represents a distributed matrix in blocks of local matrices.
92
97
*
93
98
* @param rdd The RDD of SubMatrices (local matrices) that form this matrix
94
- * @param nRows Number of rows of this matrix
95
- * @param nCols Number of columns of this matrix
96
- * @param numRowBlocks Number of blocks that form the rows of this matrix
97
- * @param numColBlocks Number of blocks that form the columns of this matrix
99
+ * @param nRows Number of rows of this matrix. If the supplied value is less than or equal to zero,
100
+ * the number of rows will be calculated when `numRows` is invoked.
101
+ * @param nCols Number of columns of this matrix. If the supplied value is less than or equal to
102
+ * zero, the number of columns will be calculated when `numCols` is invoked.
98
103
* @param rowsPerBlock Number of rows that make up each block. The blocks forming the final
99
104
* rows are not required to have the given number of rows
100
105
* @param colsPerBlock Number of columns that make up each block. The blocks forming the final
@@ -104,8 +109,6 @@ class BlockMatrix(
104
109
val rdd : RDD [((Int , Int ), Matrix )],
105
110
private var nRows : Long ,
106
111
private var nCols : Long ,
107
- val numRowBlocks : Int ,
108
- val numColBlocks : Int ,
109
112
val rowsPerBlock : Int ,
110
113
val colsPerBlock : Int ) extends DistributedMatrix with Logging {
111
114
@@ -115,25 +118,18 @@ class BlockMatrix(
115
118
* Alternate constructor for BlockMatrix without the input of the number of rows and columns.
116
119
*
117
120
* @param rdd The RDD of SubMatrices (local matrices) that form this matrix
118
- * @param numRowBlocks Number of blocks that form the rows of this matrix
119
- * @param numColBlocks Number of blocks that form the columns of this matrix
120
121
* @param rowsPerBlock Number of rows that make up each block. The blocks forming the final
121
122
* rows are not required to have the given number of rows
122
123
* @param colsPerBlock Number of columns that make up each block. The blocks forming the final
123
124
* columns are not required to have the given number of columns
124
125
*/
125
126
def this (
126
127
rdd : RDD [((Int , Int ), Matrix )],
127
- numRowBlocks : Int ,
128
- numColBlocks : Int ,
129
128
rowsPerBlock : Int ,
130
129
colsPerBlock : Int ) = {
131
- this (rdd, 0L , 0L , numRowBlocks, numColBlocks, rowsPerBlock, colsPerBlock)
130
+ this (rdd, 0L , 0L , rowsPerBlock, colsPerBlock)
132
131
}
133
132
134
- private [mllib] var partitioner : GridPartitioner =
135
- new GridPartitioner (numRowBlocks, numColBlocks, rdd.partitions.length)
136
-
137
133
private lazy val dims : (Long , Long ) = getDim
138
134
139
135
override def numRows (): Long = {
@@ -146,48 +142,21 @@ class BlockMatrix(
146
142
nCols
147
143
}
148
144
145
+ val numRowBlocks = math.ceil(numRows() * 1.0 / rowsPerBlock).toInt
146
+ val numColBlocks = math.ceil(numCols() * 1.0 / colsPerBlock).toInt
147
+
148
+ private [mllib] var partitioner : GridPartitioner =
149
+ new GridPartitioner (numRowBlocks, numColBlocks, rdd.partitions.length)
150
+
151
+
152
+
149
153
/** Returns the dimensions of the matrix. */
150
154
private def getDim : (Long , Long ) = {
151
- case class MatrixMetaData (var rowIndex : Int , var colIndex : Int ,
152
- var numRows : Int , var numCols : Int )
153
- // picks the sizes of the matrix with the maximum indices
154
- def pickSizeByGreaterIndex (example : MatrixMetaData , base : MatrixMetaData ): MatrixMetaData = {
155
- if (example.rowIndex > base.rowIndex) {
156
- base.rowIndex = example.rowIndex
157
- base.numRows = example.numRows
158
- }
159
- if (example.colIndex > base.colIndex) {
160
- base.colIndex = example.colIndex
161
- base.numCols = example.numCols
162
- }
163
- base
164
- }
165
-
166
- // Aggregate will return an error if the rdd is empty
167
- val lastRowCol = rdd.treeAggregate(new MatrixMetaData (0 , 0 , 0 , 0 ))(
168
- seqOp = (c, v) => (c, v) match { case (base, ((blockXInd, blockYInd), mat)) =>
169
- pickSizeByGreaterIndex(
170
- new MatrixMetaData (blockXInd, blockYInd, mat.numRows, mat.numCols), base)
171
- },
172
- combOp = (c1, c2) => (c1, c2) match {
173
- case (res1, res2) =>
174
- pickSizeByGreaterIndex(res1, res2)
175
- })
176
- // We add the size of the edge matrices, because they can be less than the specified
177
- // rowsPerBlock or colsPerBlock.
178
- (lastRowCol.rowIndex.toLong * rowsPerBlock + lastRowCol.numRows,
179
- lastRowCol.colIndex.toLong * colsPerBlock + lastRowCol.numCols)
180
- }
155
+ val (rows, cols) = rdd.map { case ((blockRowIndex, blockColIndex), mat) =>
156
+ (blockRowIndex * rowsPerBlock + mat.numRows, blockColIndex * colsPerBlock + mat.numCols)
157
+ }.reduce((x0, x1) => (math.max(x0._1, x1._1), math.max(x0._2, x1._2)))
181
158
182
- /** Returns the Frobenius Norm of the matrix */
183
- def normFro (): Double = {
184
- math.sqrt(rdd.map { mat => mat._2 match {
185
- case sparse : SparseMatrix =>
186
- sparse.values.map(x => math.pow(x, 2 )).sum
187
- case dense : DenseMatrix =>
188
- dense.values.map(x => math.pow(x, 2 )).sum
189
- }
190
- }.reduce(_ + _))
159
+ (math.max(rows, nRows), math.max(cols, nCols))
191
160
}
192
161
193
162
/** Cache the underlying RDD. */
@@ -210,14 +179,14 @@ class BlockMatrix(
210
179
s " Int.MaxValue. Currently numCols: ${numCols()}" )
211
180
val nRows = numRows().toInt
212
181
val nCols = numCols().toInt
213
- val mem = nRows * nCols * 8 / 1000000
182
+ val mem = nRows.toLong * nCols / 125000
214
183
if (mem > 500 ) logWarning(s " Storing this matrix will require $mem MB of memory! " )
215
184
216
- val parts = rdd.collect().sortBy(x => (x._1._2, x._1._1))
185
+ val parts = rdd.collect()
217
186
val values = new Array [Double ](nRows * nCols)
218
- parts.foreach { case ((rowIndex, colIndex ), block) =>
219
- val rowOffset = rowIndex * rowsPerBlock
220
- val colOffset = colIndex * colsPerBlock
187
+ parts.foreach { case ((blockRowIndex, blockColIndex ), block) =>
188
+ val rowOffset = blockRowIndex * rowsPerBlock
189
+ val colOffset = blockColIndex * colsPerBlock
221
190
var j = 0
222
191
val mat = block.toArray
223
192
while (j < block.numCols) {
0 commit comments