
SPARK-8918 Add @since tags to mllib.clustering #8256

Closed · wants to merge 3 commits
mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -43,6 +43,7 @@ import org.apache.spark.sql.{SQLContext, Row}
* the weight for Gaussian i, and weights.sum == 1
* @param gaussians Array of MultivariateGaussian where gaussians(i) represents
* the Multivariate Gaussian (Normal) Distribution for Gaussian i
* @since 1.3.0
*/
@Experimental
class GaussianMixtureModel(
@@ -53,32 +54,48 @@ class GaussianMixtureModel(

override protected def formatVersion = "1.0"

/**
* @since 1.4.0
*/
override def save(sc: SparkContext, path: String): Unit = {
GaussianMixtureModel.SaveLoadV1_0.save(sc, path, weights, gaussians)
}

/** Number of gaussians in mixture */
/**
* Number of gaussians in mixture
* @since 1.3.0
*/
def k: Int = weights.length

/** Maps given points to their cluster indices. */
/**
* Maps given points to their cluster indices.
* @since 1.3.0
*/
def predict(points: RDD[Vector]): RDD[Int] = {
val responsibilityMatrix = predictSoft(points)
responsibilityMatrix.map(r => r.indexOf(r.max))
}

/** Maps given point to its cluster index. */
/**
* Maps given point to its cluster index.
* @since 1.4.0
*/
def predict(point: Vector): Int = {
val r = computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k)
r.indexOf(r.max)
}

/** Java-friendly version of [[predict()]] */
/**
* Java-friendly version of [[predict()]]
* @since 1.4.0
*/
def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] =
predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]]

/**
* Given the input vectors, return the membership value of each vector
* to all mixture components.
* @since 1.3.0
*/
def predictSoft(points: RDD[Vector]): RDD[Array[Double]] = {
val sc = points.sparkContext
@@ -91,6 +108,7 @@ class GaussianMixtureModel(

/**
* Given the input vector, return the membership values to all mixture components.
* @since 1.4.0
*/
def predictSoft(point: Vector): Array[Double] = {
computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k)
@@ -115,6 +133,9 @@ class GaussianMixtureModel(
}
}

/**
* @since 1.4.0
*/
@Experimental
object GaussianMixtureModel extends Loader[GaussianMixtureModel] {

@@ -147,6 +168,9 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] {
sc.parallelize(dataArray, 1).toDF().write.parquet(Loader.dataPath(path))
}

/**
* @since 1.4.0
*/
def load(sc: SparkContext, path: String): GaussianMixtureModel = {
val dataPath = Loader.dataPath(path)
val sqlContext = new SQLContext(sc)
@@ -165,6 +189,9 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] {
}
}

/**
* @since 1.4.0
*/
override def load(sc: SparkContext, path: String) : GaussianMixtureModel = {
val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path)
implicit val formats = DefaultFormats
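For reference, here is a minimal usage sketch of the `GaussianMixtureModel` API these tags annotate. It assumes an existing `SparkContext` named `sc`, toy two-blob data, and an illustrative save path; `GaussianMixture` is the companion estimator that produces the model.

```scala
import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel}
import org.apache.spark.mllib.linalg.Vectors

// Toy data: two well-separated blobs (illustrative only).
val data = sc.parallelize(Seq(
  Vectors.dense(0.0, 0.1), Vectors.dense(0.1, 0.0),
  Vectors.dense(9.0, 9.1), Vectors.dense(9.1, 9.0)))

// Fit a two-component mixture.
val model: GaussianMixtureModel = new GaussianMixture().setK(2).run(data)

// Hard assignments (predict, since 1.3.0) and per-component
// responsibilities (predictSoft, since 1.3.0).
val clusters = model.predict(data)
val responsibilities = model.predictSoft(data)

// Round-trip through the save/load pair tagged @since 1.4.0 above.
model.save(sc, "/tmp/gmm-model") // path is illustrative
val reloaded = GaussianMixtureModel.load(sc, "/tmp/gmm-model")
```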
mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -430,11 +430,14 @@ class KMeans private (

/**
* Top-level methods for calling K-means clustering.
* @since 0.8.0
*/
object KMeans {

// Initialization mode names
/** @since 0.8.0 */
val RANDOM = "random"
/** @since 0.8.0 */
val K_MEANS_PARALLEL = "k-means||"

/**
@@ -446,6 +449,7 @@ object KMeans {
* @param runs number of parallel runs, defaults to 1. The best model is returned.
* @param initializationMode initialization model, either "random" or "k-means||" (default).
* @param seed random seed value for cluster initialization
* @since 1.3.0
*/
def train(
data: RDD[Vector],
@@ -470,6 +474,7 @@ object KMeans {
* @param maxIterations max number of iterations
* @param runs number of parallel runs, defaults to 1. The best model is returned.
* @param initializationMode initialization model, either "random" or "k-means||" (default).
* @since 0.8.0
*/
def train(
data: RDD[Vector],
@@ -486,6 +491,7 @@ object KMeans {

/**
* Trains a k-means model using specified parameters and the default values for unspecified.
* @since 0.8.0
*/
def train(
data: RDD[Vector],
@@ -496,6 +502,7 @@ object KMeans {

/**
* Trains a k-means model using specified parameters and the default values for unspecified.
* @since 0.8.0
*/
def train(
data: RDD[Vector],
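A quick sketch of the `KMeans.train` overloads tagged above, again assuming an existing `SparkContext` named `sc` and toy data. The `seed` parameter name comes straight from the Scaladoc in this diff.

```scala
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

val points = sc.parallelize(Seq(
  Vectors.dense(0.0, 0.0), Vectors.dense(0.0, 0.1),
  Vectors.dense(9.0, 9.0), Vectors.dense(9.0, 9.1)))

// Simplest overload (since 0.8.0): data, k, maxIterations.
val model = KMeans.train(points, 2, 20)

// Full overload (since 1.3.0) adds runs, the initialization mode
// constants tagged above, and a seed for reproducible centers.
val seeded = KMeans.train(points, 2, 20, 1, KMeans.K_MEANS_PARALLEL, seed = 42L)
```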
mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
@@ -34,35 +34,52 @@ import org.apache.spark.sql.Row

/**
* A clustering model for K-means. Each point belongs to the cluster with the closest center.
* @since 0.8.0
*/
class KMeansModel (
val clusterCenters: Array[Vector]) extends Saveable with Serializable with PMMLExportable {

/** A Java-friendly constructor that takes an Iterable of Vectors. */
/**
* A Java-friendly constructor that takes an Iterable of Vectors.
* @since 1.4.0
*/
def this(centers: java.lang.Iterable[Vector]) = this(centers.asScala.toArray)

/** Total number of clusters. */
/**
* Total number of clusters.
* @since 0.8.0
*/
def k: Int = clusterCenters.length

/** Returns the cluster index that a given point belongs to. */
/**
* Returns the cluster index that a given point belongs to.
* @since 0.8.0
*/
def predict(point: Vector): Int = {
KMeans.findClosest(clusterCentersWithNorm, new VectorWithNorm(point))._1
}

/** Maps given points to their cluster indices. */
/**
* Maps given points to their cluster indices.
* @since 1.0.0
*/
def predict(points: RDD[Vector]): RDD[Int] = {
val centersWithNorm = clusterCentersWithNorm
val bcCentersWithNorm = points.context.broadcast(centersWithNorm)
points.map(p => KMeans.findClosest(bcCentersWithNorm.value, new VectorWithNorm(p))._1)
}

/** Maps given points to their cluster indices. */
/**
* Maps given points to their cluster indices.
* @since 1.0.0
*/
def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] =
predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]]

/**
* Return the K-means cost (sum of squared distances of points to their nearest center) for this
* model on the given data.
* @since 0.8.0
*/
def computeCost(data: RDD[Vector]): Double = {
val centersWithNorm = clusterCentersWithNorm
@@ -73,13 +90,20 @@ class KMeansModel (
private def clusterCentersWithNorm: Iterable[VectorWithNorm] =
clusterCenters.map(new VectorWithNorm(_))

/**
* @since 1.4.0
*/
override def save(sc: SparkContext, path: String): Unit = {
KMeansModel.SaveLoadV1_0.save(sc, this, path)
}

/** @since 1.4.0 */
override protected def formatVersion: String = "1.0"
}

/**
* @since 1.4.0
*/
object KMeansModel extends Loader[KMeansModel] {
override def load(sc: SparkContext, path: String): KMeansModel = {
KMeansModel.SaveLoadV1_0.load(sc, path)
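Finally, a sketch exercising the `KMeansModel` methods annotated above (same assumptions: an existing `sc`, toy data, and an illustrative path).

```scala
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

val data = sc.parallelize(Seq(
  Vectors.dense(0.0, 0.1), Vectors.dense(9.0, 9.1)))
val model: KMeansModel = KMeans.train(data, 2, 20)

// Single-point (since 0.8.0) and RDD (since 1.0.0) predictions.
val cluster = model.predict(Vectors.dense(0.2, 0.2))
val clusters = model.predict(data)

// Sum of squared distances to the nearest centers (since 0.8.0).
val cost = model.computeCost(data)

// save/load are the 1.4.0 additions tagged above.
model.save(sc, "/tmp/kmeans-model") // path is illustrative
val sameModel = KMeansModel.load(sc, "/tmp/kmeans-model")
```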