@@ -35,7 +35,7 @@ import scala.language.existentials
 * representation. The resulting pseudo-eigenvector provides effective clustering - as
 * performed by Parallel KMeans.
 */
-object PIClustering {
+object PowerIterationClustering {

  private val logger = Logger.getLogger(getClass.getName())

@@ -44,32 +44,32 @@ object PIClustering {
  type DGraph = Graph[Double, Double]
  type IndexedVector[Double] = (Long, BDV[Double])

+
  // Terminate iteration when norm changes by less than this value
-  private[mllib] val DefaultMinNormChange: Double = 1e-11
+  private[mllib] val defaultMinNormChange: Double = 1e-11

-  // Default σ for Gaussian Distance calculations
-  private[mllib] val DefaultSigma = 1.0
+  // Default sigma for Gaussian Distance calculations
+  private[mllib] val defaultSigma = 1.0

  // Default number of iterations for PIC loop
-  private[mllib] val DefaultIterations: Int = 20
+  private[mllib] val defaultIterations: Int = 20

  // Default minimum affinity between points - lower than this it is considered
  // zero and no edge will be created
-  private[mllib] val DefaultMinAffinity = 1e-11
+  private[mllib] val defaultMinAffinity = 1e-11

  // Do not allow divide by zero: change to this value instead
-  val DefaultDivideByZeroVal: Double = 1e-15
+  val defaultDivideByZeroVal: Double = 1e-15

  // Default number of runs by the KMeans.run() method
-  val DefaultKMeansRuns = 10
+  val defaultKMeansRuns = 10

  /**
   *
   * Run a Power Iteration Clustering
   *
   * @param sc Spark Context
-   * @param points Input Points in format of [(VertexId,(x,y)]
-   *               where VertexId is a Long
+   * @param G Affinity Matrix in a Sparse Graph structure
   * @param nClusters Number of clusters to create
   * @param nIterations Number of iterations of the PIC algorithm
   *                    that calculates primary PseudoEigenvector and Eigenvalue
@@ -83,30 +83,13 @@ object PIClustering {
   *         Seq[(VertexId, ClusterID Membership)]
   */
  def run(sc: SparkContext,
-          points: Points,
+          G: Graph[Double, Double],
          nClusters: Int,
-          nIterations: Int = DefaultIterations,
-          sigma: Double = DefaultSigma,
-          minAffinity: Double = DefaultMinAffinity,
-          nRuns: Int = DefaultKMeansRuns)
+          nIterations: Int = defaultIterations,
+          sigma: Double = defaultSigma,
+          minAffinity: Double = defaultMinAffinity,
+          nRuns: Int = defaultKMeansRuns)
  : (Seq[(Int, Vector)], Seq[((VertexId, Vector), Int)]) = {
-    val vidsRdd = sc.parallelize(points.map(_._1).sorted)
-    val nVertices = points.length
-
-    val (wRdd, rowSums) = createNormalizedAffinityMatrix(sc, points, sigma)
-    val initialVt = createInitialVector(sc, points.map(_._1), rowSums)
-    if (logger.isDebugEnabled) {
-      logger.debug(s"Vt(0)=${
-        printVector(new BDV(initialVt.map {
-          _._2
-        }.toArray))
-      }")
-    }
-    val edgesRdd = createSparseEdgesRdd(sc, wRdd, minAffinity)
-    val G = createGraphFromEdges(sc, edgesRdd, points.size, Some(initialVt))
-    if (logger.isDebugEnabled) {
-      logger.debug(printMatrixFromEdges(G.edges))
-    }
    val (gUpdated, lambda, vt) = getPrincipalEigen(sc, G, nIterations)
    // TODO: avoid local collect and then sc.parallelize.
    val localVt = vt.collect.sortBy(_._1)
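For context on the API change in this hunk: run no longer builds the affinity graph itself; the caller constructs it first (for example with the createGaussianAffinityMatrix helper introduced below) and passes it in. A minimal driver sketch, assuming Points is an alias for Seq[(Long, BDV[Double])] and using made-up vertex ids and coordinates:

import org.apache.spark.{SparkConf, SparkContext}
import breeze.linalg.{DenseVector => BDV}

// Hypothetical caller code, not part of this patch.
val sc = new SparkContext(new SparkConf().setAppName("pic-example"))
val points: Seq[(Long, BDV[Double])] = Seq(
  (0L, BDV(0.0, 0.0)),
  (1L, BDV(0.1, 0.1)),
  (2L, BDV(5.0, 5.0)))
// Build the sparse Gaussian affinity graph first, then run PIC on it.
val g = PowerIterationClustering.createGaussianAffinityMatrix(sc, points)
val (centers, assignments) = PowerIterationClustering.run(sc, g, nClusters = 2)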
@@ -140,36 +123,43 @@ object PIClustering {
  }

  /**
-   * Read Points from an input file in the following format:
-   * Vertex1Id Coord11 Coord12 CoordX13 .. Coord1D
-   * Vertex2Id Coord21 Coord22 CoordX23 .. Coord2D
-   * ..
-   * VertexNId CoordN1 CoordN2 CoordN23 .. CoordND
-   *
-   * Where N is the number of observations, each a D-dimension point
   *
-   * E.g.
+   * Create an affinity matrix
   *
-   * 19 1.8035177495 0.7460582552 0.2361611395 -0.8645567427 -0.8613062
-   * 10 0.5534111111 1.0456386879 1.7045663273 0.7281759816 1.0807487792
-   * 911 1.200749626 1.8962364439 2.5117192131 -0.4034737281 -0.9069696484
-   *
-   * Which represents three 5-dimensional input Points with VertexIds 19,10, and 911
-   * @param verticesFile Local filesystem path to the Points input file
-   * @return Set of Vertices in format appropriate for consumption by the PIC algorithm
+   * @param sc Spark Context
+   * @param points Input Points in format of [(VertexId, (x, y))]
+   *               where VertexId is a Long
+   * @param sigma Sigma for Gaussian distribution calculation according to
+   *              [1/2 * sqrt(pi*sigma)] * exp(-(x-y)**2 / (2*sigma**2))
+   * @param minAffinity Minimum Affinity between two Points in the input dataset: below
+   *                    this threshold the affinity will be considered "close to" zero and
+   *                    no Edge will be created between those Points in the sparse matrix
+   * @return Tuple of (Seq[(Cluster Id,Cluster Center)],
+   *         Seq[(VertexId, ClusterID Membership)]
   */
-  def readVerticesfromFile(verticesFile: String): Points = {
-
-    import scala.io.Source
-    val vertices = Source.fromFile(verticesFile).getLines.map { l =>
-      val toks = l.split("\t")
-      val arr = new BDV(toks.slice(1, toks.length).map(_.toDouble))
-      (toks(0).toLong, arr)
-    }.toSeq
+  def createGaussianAffinityMatrix(sc: SparkContext,
+                                   points: Points,
+                                   sigma: Double = defaultSigma,
+                                   minAffinity: Double = defaultMinAffinity)
+  : Graph[Double, Double] = {
+    val vidsRdd = sc.parallelize(points.map(_._1).sorted)
+    val nVertices = points.length
+
+    val (wRdd, rowSums) = createNormalizedAffinityMatrix(sc, points, sigma)
+    val initialVt = createInitialVector(sc, points.map(_._1), rowSums)
+    if (logger.isDebugEnabled) {
+      logger.debug(s"Vt(0)=${
+        printVector(new BDV(initialVt.map {
+          _._2
+        }.toArray))
+      }")
+    }
+    val edgesRdd = createSparseEdgesRdd(sc, wRdd, minAffinity)
+    val G = createGraphFromEdges(sc, edgesRdd, points.size, Some(initialVt))
    if (logger.isDebugEnabled) {
-      logger.debug(s"Read in ${vertices.length} from $verticesFile")
+      logger.debug(printMatrixFromEdges(G.edges))
    }
-    vertices
+    G
  }

/**
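The Gaussian affinity referred to in the new doc comment above can be sketched locally as follows. This only illustrates the similarity formula and the minAffinity cutoff, assuming the unnormalized exp(-||x-y||^2 / (2*sigma^2)) form; it is not the distributed code in createNormalizedAffinityMatrix and createSparseEdgesRdd:

import breeze.linalg.{DenseVector => BDV, norm}

// Illustrative only: Gaussian similarity between two points; entries below
// minAffinity are treated as zero, so no edge would be created for them.
def gaussianAffinity(x: BDV[Double], y: BDV[Double], sigma: Double): Double = {
  val d = norm(x - y)
  math.exp(-(d * d) / (2.0 * sigma * sigma))
}

def affinityOrZero(x: BDV[Double], y: BDV[Double],
                   sigma: Double = 1.0, minAffinity: Double = 1e-11): Double = {
  val a = gaussianAffinity(x, y, sigma)
  if (a >= minAffinity) a else 0.0
}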
@@ -205,7 +195,7 @@ object PIClustering {
   */
  def getPrincipalEigen(sc: SparkContext,
                        G: DGraph,
-                        nIterations: Int = DefaultIterations,
+                        nIterations: Int = defaultIterations,
                        optMinNormChange: Option[Double] = None
                        ): (DGraph, Double, VertexRDD[Double]) = {
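getPrincipalEigen runs the power iteration over the graph G. As a non-distributed reference for what it computes, a local power iteration on a dense affinity matrix might look like the sketch below; the real method does this over GraphX vertices and edges, with defaultMinNormChange as the default stopping tolerance:

import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, sum}

// Local sketch only: repeatedly apply W to v and renormalize by the 1-norm,
// stopping after nIterations or when the estimate stops changing.
def localPrincipalEigen(w: BDM[Double],
                        nIterations: Int = 20,
                        minNormChange: Double = 1e-11): (Double, BDV[Double]) = {
  var v = BDV.fill(w.cols)(1.0 / w.cols)
  var lambda = 0.0
  var delta = Double.MaxValue
  var iter = 0
  while (iter < nIterations && delta > minNormChange) {
    val wv: BDV[Double] = w * v
    lambda = sum(wv.map(math.abs))   // 1-norm serves as the eigenvalue estimate
    val vNext = wv / lambda
    delta = sum((vNext - v).map(math.abs))
    v = vNext
    iter += 1
  }
  (lambda, v)
}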
@@ -312,7 +302,7 @@ object PIClustering {
   * @return
   */
  private[mllib] def createSparseEdgesRdd(sc: SparkContext, wRdd: RDD[IndexedVector[Double]],
-                                          minAffinity: Double = DefaultMinAffinity) = {
+                                          minAffinity: Double = defaultMinAffinity) = {
    val labels = wRdd.map { case (vid, vect) => vid}.collect
    val edgesRdd = wRdd.flatMap { case (vid, vect) =>
      for ((dval, ix) <- vect.toArray.zipWithIndex
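The flatMap above turns each row of the normalized affinity matrix into GraphX edges. A rough local equivalent for a single row, assuming labels(ix) maps a column index back to its VertexId and that entries below minAffinity are skipped, as the doc comments describe:

import org.apache.spark.graphx.Edge
import breeze.linalg.{DenseVector => BDV}

// Illustrative only: one Edge per affinity entry that survives the cutoff.
def edgesForRow(vid: Long,
                vect: BDV[Double],
                labels: Array[Long],
                minAffinity: Double = 1e-11): Seq[Edge[Double]] =
  for {
    (dval, ix) <- vect.toArray.zipWithIndex.toSeq
    if dval >= minAffinity
  } yield Edge(vid, labels(ix), dval)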
@@ -387,7 +377,7 @@ object PIClustering {

  }

-  private[mllib] def makeNonZero(dval: Double, tol: Double = DefaultDivideByZeroVal) = {
+  private[mllib] def makeNonZero(dval: Double, tol: Double = defaultDivideByZeroVal) = {
    if (Math.abs(dval) < tol) {
      Math.signum(dval) * tol
    } else {