From 2a21d6d4ab294c515ec23980cd9eda4508576a05 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sun, 8 Feb 2015 22:11:22 +0000 Subject: [PATCH 1/7] Initial draft of FeatureAttributes class --- .../apache/spark/ml/attribute/Attribute.scala | 64 ++++++++++++ .../ml/attribute/CategoricalAttribute.scala | 59 +++++++++++ .../ml/attribute/ContinuousAttribute.scala | 58 +++++++++++ .../ml/attribute/FeatureAttributes.scala | 97 +++++++++++++++++++ .../spark/ml/attribute/FeatureType.scala | 31 ++++++ 5 files changed, 309 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala new file mode 100644 index 0000000000000..8d41187b3e917 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.attribute + +import org.apache.spark.ml.attribute.FeatureType.FeatureType +import org.apache.spark.sql.types.{MetadataBuilder, Metadata} +/** + * Base description of a single ML feature. Construction fails (via `require`) when `index` is + * negative or `dimension` is less than 1. + * + * @param index index of the feature; must be >= 0 + * @param name optional feature name + * @param dimension dimension of the feature; must be >= 1 + */ +abstract class Attribute(val index: Int, + val name: Option[String], + val dimension: Int) { + + require(index >= 0) + require(dimension >= 1) + /** Type of this feature. */ + def featureType: FeatureType + /** Converts this attribute to a [[Metadata]] instance. */ + def toMetadata(): Metadata + /** + * Creates a builder holding the fields shared by all attributes: "index" always, "name" only + * when defined, and "dimension" only when greater than 1. + * + * NOTE(review): no "type" entry is written here, while `Attribute.fromMetadata` reads the + * "type" key; confirm where that key is expected to be added. + */ + private[attribute] def toBaseMetadata(): MetadataBuilder = { + val builder = new MetadataBuilder() + builder.putLong("index", index) + if (name.isDefined) { + builder.putString("name", name.get) + } + if (dimension > 1) { + builder.putLong("dimension", dimension) + } + builder + } + +} +/** Factory and parsing helpers for [[Attribute]]. */ +object Attribute { + /** + * Creates an attribute from metadata, dispatching on the [[FeatureType]] value named by the + * string stored under "type". + */ + def fromMetadata(metadata: Metadata): Attribute = { + FeatureType.withName(metadata.getString("type")) match { + case FeatureType.CATEGORICAL => CategoricalAttribute.fromMetadata(metadata) + case FeatureType.CONTINUOUS => ContinuousAttribute.fromMetadata(metadata) + } + } + /** + * Reads the shared fields as `(index, name, dimension)`. `name` is None when the "name" key is + * absent and `dimension` is 1 when the "dimension" key is absent. + */ + private[attribute] def parseBaseMetadata(metadata: Metadata): (Int, Option[String], Int) = { + val index = metadata.getLong("index").toInt + val name = if (metadata.contains("name")) Some(metadata.getString("name")) else None + val dimension = if (metadata.contains("dimension")) metadata.getLong("dimension").toInt else 1 + (index, name, dimension) + } + +} \ No newline at end of file diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala new file mode 100644 index 0000000000000..2b317d735a906 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.attribute + +import org.apache.spark.ml.attribute.FeatureType.FeatureType +import org.apache.spark.sql.types.Metadata +/** + * [[Attribute]] for a categorical feature. The constructor is private; instances are created by + * `CategoricalAttribute.fromMetadata`. + * + * @param categories optional category values; when defined, the array must not be empty + */ +class CategoricalAttribute private ( + override val index: Int, + override val name: Option[String], + override val dimension: Int, + val categories: Option[Array[String]]) extends Attribute(index, name, dimension) { + + require(!categories.isDefined || categories.get.nonEmpty) + + override def featureType: FeatureType = FeatureType.CATEGORICAL + /** Number of category values, or None when `categories` is not defined. */ + def numCategories: Option[Int] = + if (categories.isDefined) Some(categories.get.length) else None + /** Writes the shared base fields plus "categories" when the category values are defined. */ + override def toMetadata(): Metadata = { + val builder = toBaseMetadata() + if (categories.isDefined) { + builder.putStringArray("categories", categories.get) + } + builder.build() + } + +} +/** Factory for [[CategoricalAttribute]]. */ +private[attribute] object CategoricalAttribute { + /** + * Creates a categorical attribute from the shared base fields and the optional "categories" + * string array. + */ + def fromMetadata(metadata: Metadata): CategoricalAttribute = { + val (index, name, dimension) = Attribute.parseBaseMetadata(metadata) + val categories = + if (metadata.contains("categories")) { + Some(metadata.getStringArray("categories")) + } else { + None + } + new CategoricalAttribute(index, name, dimension, categories) + } + +} \ No newline at end of file diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala new file mode 100644 index 0000000000000..f56ae57c315e7 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.attribute + +import org.apache.spark.ml.attribute.FeatureType.FeatureType +import org.apache.spark.sql.types.Metadata +/** + * [[Attribute]] for a continuous feature. The constructor is private; instances are created by + * `ContinuousAttribute.fromMetadata`. + * + * @param min optional minimum value + * @param max optional maximum value; when both bounds are defined, `min` must be <= `max` + */ +class ContinuousAttribute private ( + override val index: Int, + override val name: Option[String], + override val dimension: Int, + val min: Option[Double], + val max: Option[Double]) extends Attribute(index, name, dimension) { + + if (min.isDefined && max.isDefined) { + require(min.get <= max.get) + } + /** + * Always CONTINUOUS. + * + * NOTE(review): written with `()` here, whereas the abstract member in Attribute is declared + * without parentheses; confirm which form is intended. + */ + override def featureType(): FeatureType = FeatureType.CONTINUOUS + /** Writes the shared base fields plus "min" and "max", each only when defined. */ + override def toMetadata(): Metadata = { + val builder = toBaseMetadata() + if (min.isDefined) { + builder.putDouble("min", min.get) + } + if (max.isDefined) { + builder.putDouble("max", max.get) + } + builder.build() + } + +} +/** Factory for [[ContinuousAttribute]]. */ +private[attribute] object ContinuousAttribute { + /** + * Creates a continuous attribute from the shared base fields and the optional "min" and "max" + * double values. + */ + def fromMetadata(metadata: Metadata): ContinuousAttribute = { + val (index, name, dimension) = Attribute.parseBaseMetadata(metadata) + val min = if (metadata.contains("min")) Some(metadata.getDouble("min")) else None + val max = if (metadata.contains("max")) Some(metadata.getDouble("max")) else None + new ContinuousAttribute(index, name, dimension, min, max) + } + +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala new file mode 100644 index 0000000000000..6a947a3d50dc8 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.attribute + +import org.apache.spark.sql.types.{MetadataBuilder, Metadata} + +/** + * Wrapper around [[Metadata]] with specialized methods for accessing information about + * data as machine learning features, and their associated attributes, like: + * + * - type (continuous, categorical, etc.) as [[FeatureType]] + * - for categorical features, the category values + * + * This information is stored as a [[Metadata]] under key "features", and contains an array of + * [[Metadata]] inside that for each feature for which metadata is defined. Example: + * + * {{{ + * { + * ... + * "features" : [ + * { + * "index": 0, + * "name": "age", + * "type": "CONTINUOUS", + * "min": 0 + * }, + * { + * "index": 5, + * "name": "gender", + * "type": "CATEGORICAL", + * "categories" : [ "male", "female" ] + * }, + * { + * "index": 7, + * "name": "percentAllocations", + * "type": "CONTINUOUS", + * "dimension": 10, + * "min": 0, + * "max": 1 + * } + * ], + * "producer": "..." + * ... 
+ * } + * }}} + */ +class FeatureAttributes private (val attributes: Array[Attribute], + val producer: Option[String]) { + /* + * Lookup tables derived once from `attributes`: name to index (only attributes that define a + * name), index to attribute, and the indices of attributes whose type is CATEGORICAL. + */ + private val nameToIndex: Map[String,Int] = + attributes.filter(_.name.isDefined).map(att => (att.name.get, att.index)).toMap + private val indexToAttribute: Map[Int,Attribute] = + attributes.map(att => (att.index, att)).toMap + private val categoricalIndices: Array[Int] = + attributes.filter(_.featureType == FeatureType.CATEGORICAL).map(_.index) + /** Attribute registered under the given feature index, or None if there is none. */ + def getFeatureAttribute(index: Int): Option[Attribute] = indexToAttribute.get(index) + /** Index of the feature with the given name, or None if no attribute has that name. */ + def getFeatureIndex(featureName: String): Option[Int] = nameToIndex.get(featureName) + /** + * Indices of the categorical features. The internal array is returned as is (no copy is made). + */ + def categoricalFeatureIndices(): Array[Int] = categoricalIndices + /** + * Converts to metadata: every attribute's metadata in an array under "features", plus + * "producer" when defined. + */ + def toMetadata(): Metadata = { + val builder = new MetadataBuilder() + builder.putMetadataArray("features", attributes.map(_.toMetadata())) + if (producer.isDefined) { + builder.putString("producer", producer.get) + } + builder.build() + } + +} +/** Factory for [[FeatureAttributes]]. */ +object FeatureAttributes { + /** + * Creates an instance from the metadata array under "features" (each element parsed with + * `Attribute.fromMetadata`) and the optional "producer" string. + */ + def fromMetadata(metadata: Metadata): FeatureAttributes = { + val attributes = metadata.getMetadataArray("features").map(Attribute.fromMetadata(_)) + val producer = + if (metadata.contains("producer")) Some(metadata.getString("producer")) else None + new FeatureAttributes(attributes, producer) + } + +} \ No newline at end of file diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala new file mode 100644 index 0000000000000..d3f23ef7cf12a --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.attribute + +/** + * Enumeration of machine learning feature types. + * + * `Attribute.fromMetadata` resolves the string stored under "type" with `FeatureType.withName`, + * so the value names double as the stored type names. + */ +object FeatureType extends Enumeration { + /** Alias for the enumeration's value type, importable as `FeatureType.FeatureType`. */ + type FeatureType = Value + + // CATEGORICAL = discrete, unordered value + // CONTINUOUS = ordered numeric value; also used for discrete numeric values now + val CATEGORICAL, CONTINUOUS = Value + +} From 7c944da8b2a1aa9ec9d70fabe77b17d54c08b291 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 21 Feb 2015 12:54:31 +0000 Subject: [PATCH 2/7] Add FeatureType hierarchy and categorical cardinality --- .../apache/spark/ml/attribute/Attribute.scala | 7 ++-- .../ml/attribute/CategoricalAttribute.scala | 34 ++++++++++++++----- .../ml/attribute/ContinuousAttribute.scala | 3 +- .../ml/attribute/FeatureAttributes.scala | 18 ++++++++-- .../spark/ml/attribute/FeatureType.scala | 23 ++++++++----- 5 files changed, 59 insertions(+), 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala index 8d41187b3e917..afb03b488ac56 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala @@ -17,7 +17,6 @@ package org.apache.spark.ml.attribute -import org.apache.spark.ml.attribute.FeatureType.FeatureType import org.apache.spark.sql.types.{MetadataBuilder, 
Metadata} abstract class Attribute(val index: Int, @@ -48,9 +47,9 @@ abstract class Attribute(val index: Int, object Attribute { def fromMetadata(metadata: Metadata): Attribute = { - FeatureType.withName(metadata.getString("type")) match { - case FeatureType.CATEGORICAL => CategoricalAttribute.fromMetadata(metadata) - case FeatureType.CONTINUOUS => ContinuousAttribute.fromMetadata(metadata) + FeatureTypes.withName(metadata.getString("type")) match { + case Categorical => CategoricalAttribute.fromMetadata(metadata) + case Continuous => ContinuousAttribute.fromMetadata(metadata) } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala index 2b317d735a906..4c54839863211 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala @@ -17,27 +17,28 @@ package org.apache.spark.ml.attribute -import org.apache.spark.ml.attribute.FeatureType.FeatureType import org.apache.spark.sql.types.Metadata class CategoricalAttribute private ( override val index: Int, override val name: Option[String], override val dimension: Int, - val categories: Option[Array[String]]) extends Attribute(index, name, dimension) { + val categories: Option[Array[String]], + val cardinality: Option[Int]) extends Attribute(index, name, dimension) { require(!categories.isDefined || categories.get.nonEmpty) + require(!cardinality.isDefined || cardinality.get > 0) - override def featureType: FeatureType = FeatureType.CATEGORICAL - - def numCategories: Option[Int] = - if (categories.isDefined) Some(categories.get.length) else None + override def featureType: FeatureType = Categorical override def toMetadata(): Metadata = { val builder = toBaseMetadata() if (categories.isDefined) { builder.putStringArray("categories", categories.get) } + if (cardinality.isDefined) { + 
builder.putLong("cardinality", cardinality.get) + } builder.build() } @@ -47,13 +48,28 @@ private[attribute] object CategoricalAttribute { def fromMetadata(metadata: Metadata): CategoricalAttribute = { val (index, name, dimension) = Attribute.parseBaseMetadata(metadata) - val categories = + + var cardinality: Option[Int] = + if (metadata.contains("cardinality")) { + Some(metadata.getLong("cardinality").toInt) + } else { + None + } + + val categories: Option[Array[String]] = if (metadata.contains("categories")) { - Some(metadata.getStringArray("categories")) + val theCategories = Some(metadata.getStringArray("categories")) + if (cardinality.isDefined) { + require(theCategories.get.size <= cardinality.get) + } else { + cardinality = Some(theCategories.get.size) + } + theCategories } else { None } - new CategoricalAttribute(index, name, dimension, categories) + + new CategoricalAttribute(index, name, dimension, categories, cardinality) } } \ No newline at end of file diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala index f56ae57c315e7..137cc62921863 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala @@ -17,7 +17,6 @@ package org.apache.spark.ml.attribute -import org.apache.spark.ml.attribute.FeatureType.FeatureType import org.apache.spark.sql.types.Metadata class ContinuousAttribute private ( @@ -31,7 +30,7 @@ class ContinuousAttribute private ( require(min.get <= max.get) } - override def featureType(): FeatureType = FeatureType.CONTINUOUS + override def featureType(): FeatureType = Continuous override def toMetadata(): Metadata = { val builder = toBaseMetadata() diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala 
index 6a947a3d50dc8..fe69a6dc27203 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala @@ -20,11 +20,14 @@ package org.apache.spark.ml.attribute import org.apache.spark.sql.types.{MetadataBuilder, Metadata} /** - * Wrapper around [[Metadata]] with specialized methods for accessing information about - * data as machine learning features, and their associated attributes, like: + * Representation of specialized information in a [[Metadata]] concerning + * data as machine learning features, with methods to access their associated attributes, like: * * - type (continuous, categorical, etc.) as [[FeatureType]] + * - optional feature name * - for categorical features, the category values + * - for continuous values, maximum and minimum value + * - dimension for vector-valued features * * This information is stored as a [[Metadata]] under key "features", and contains an array of * [[Metadata]] inside that for each feature for which metadata is defined. 
Example: @@ -46,6 +49,12 @@ import org.apache.spark.sql.types.{MetadataBuilder, Metadata} * "categories" : [ "male", "female" ] * }, * { + * "index": 6, + * "name": "customerType", + * "type": "CATEGORICAL", + * "cardinality": 10 + * }, + * { * "index": 7, * "name": "percentAllocations", * "type": "CONTINUOUS", @@ -66,7 +75,10 @@ class FeatureAttributes private (val attributes: Array[Attribute], private val indexToAttribute: Map[Int,Attribute] = attributes.map(att => (att.index, att)).toMap private val categoricalIndices: Array[Int] = - attributes.filter(_.featureType == FeatureType.CATEGORICAL).map(_.index) + attributes.filter(_.featureType match { + case c: CategoricalFeatureType => true + case _ => false + }).map(_.index) def getFeatureAttribute(index: Int): Option[Attribute] = indexToAttribute.get(index) diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala index d3f23ef7cf12a..1ec9599be4696 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala @@ -17,15 +17,22 @@ package org.apache.spark.ml.attribute -/** - * Enumeration of machine learning feature types. 
- */ -object FeatureType extends Enumeration { +sealed trait FeatureType - type FeatureType = Value +sealed trait ContinuousFeatureType extends FeatureType +sealed trait CategoricalFeatureType extends FeatureType +sealed trait DiscreteFeatureType extends ContinuousFeatureType - // CATEGORICAL = discrete, unordered value - // CONTINUOUS = ordered numeric value; also used for discrete numeric values now - val CATEGORICAL, CONTINUOUS = Value +case object Continuous extends ContinuousFeatureType +case object Categorical extends CategoricalFeatureType +case object Discrete extends DiscreteFeatureType +case object Binary extends DiscreteFeatureType with CategoricalFeatureType +object FeatureTypes { + def withName(name: String): FeatureType = name match { + case "CONTINUOUS" => Continuous + case "CATEGORICAL" => Categorical + case "DISCRETE" => Discrete + case "BINARY" => Binary + } } From e7ab467707fbea5a3a7481b863712653a02a6f8d Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Thu, 5 Mar 2015 19:12:56 -0800 Subject: [PATCH 3/7] update ML attribute impl --- .../apache/spark/ml/attribute/Attribute.scala | 63 --- .../spark/ml/attribute/AttributeGroup.scala | 223 ++++++++ ...{FeatureType.scala => AttributeKeys.scala} | 35 +- .../spark/ml/attribute/AttributeType.scala | 61 +++ .../ml/attribute/CategoricalAttribute.scala | 75 --- .../ml/attribute/ContinuousAttribute.scala | 57 -- .../ml/attribute/FeatureAttributes.scala | 109 ---- .../spark/ml/attribute/attributes.scala | 488 ++++++++++++++++++ .../apache/spark/ml/attribute/package.scala | 44 ++ .../ml/attribute/JavaAttributeSuite.java | 55 ++ .../ml/attribute/AttributeGroupSuite.scala | 49 ++ .../spark/ml/attribute/AttributeSuite.scala | 189 +++++++ 12 files changed, 1126 insertions(+), 322 deletions(-) delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala rename 
mllib/src/main/scala/org/apache/spark/ml/attribute/{FeatureType.scala => AttributeKeys.scala} (56%) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/attribute/package.scala create mode 100644 mllib/src/test/java/org/apache/spark/ml/attribute/JavaAttributeSuite.java create mode 100644 mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeGroupSuite.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala deleted file mode 100644 index afb03b488ac56..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/Attribute.scala +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.attribute - -import org.apache.spark.sql.types.{MetadataBuilder, Metadata} - -abstract class Attribute(val index: Int, - val name: Option[String], - val dimension: Int) { - - require(index >= 0) - require(dimension >= 1) - - def featureType: FeatureType - - def toMetadata(): Metadata - - private[attribute] def toBaseMetadata(): MetadataBuilder = { - val builder = new MetadataBuilder() - builder.putLong("index", index) - if (name.isDefined) { - builder.putString("name", name.get) - } - if (dimension > 1) { - builder.putLong("dimension", dimension) - } - builder - } - -} - -object Attribute { - - def fromMetadata(metadata: Metadata): Attribute = { - FeatureTypes.withName(metadata.getString("type")) match { - case Categorical => CategoricalAttribute.fromMetadata(metadata) - case Continuous => ContinuousAttribute.fromMetadata(metadata) - } - } - - private[attribute] def parseBaseMetadata(metadata: Metadata): (Int, Option[String], Int) = { - val index = metadata.getLong("index").toInt - val name = if (metadata.contains("name")) Some(metadata.getString("name")) else None - val dimension = if (metadata.contains("dimension")) metadata.getLong("dimension").toInt else 1 - (index, name, dimension) - } - -} \ No newline at end of file diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala new file mode 100644 index 0000000000000..d3e616f7f6a7c --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.attribute + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.mllib.linalg.VectorUDT +import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField} + +/** + * Attributes that describe a vector ML column. + * + * @param name name of the attribute group (the ML column name) + * @param numAttributes optional number of attributes. At most one of `numAttributes` and `attrs` + * can be defined. + * @param attrs optional array of attributes. Attributes will be copied with their corresponding + * indices in the array. + */ +class AttributeGroup private ( + val name: String, + val numAttributes: Option[Int], + attrs: Option[Array[Attribute]]) extends Serializable { + + /** + * Creates an attribute group without attribute info. + * @param name name of the attribute group + */ + def this(name: String) = this(name, None, None) + + /** + * Creates an attribute group knowing only the number of attributes. + * @param name name of the attribute group + * @param numAttributes number of attributes + */ + def this(name: String, numAttributes: Int) = this(name, Some(numAttributes), None) + + /** + * Creates an attribute group with attributes. + * @param name name of the attribute group + * @param attrs array of attributes. Attributes will be copied with their corresponding indices in + * the array. 
+ */ + def this(name: String, attrs: Array[Attribute]) = this(name, None, Some(attrs)) + + /** + * Optional array of attributes. At most one of `numAttributes` and `attributes` can be defined. + * Each element is the result of `withIndex(i)` on the input attribute at position `i`. + */ + val attributes: Option[Array[Attribute]] = attrs.map(_.view.zipWithIndex.map { case (attr, i) => + attr.withIndex(i) + }.toArray) + + private lazy val nameToIndex: Map[String, Int] = { + attributes.map(_.view.flatMap { attr => + attr.name.map(_ -> attr.index.get) + }.toMap).getOrElse(Map.empty) + } + + /** Size of the attribute group. Returns -1 if the size is unknown. */ + def size: Int = { + if (numAttributes.isDefined) { + numAttributes.get + } else if (attributes.isDefined) { + attributes.get.length + } else { + -1 + } + } + + /** Test whether this attribute group contains a specific attribute. */ + def hasAttr(attrName: String): Boolean = nameToIndex.contains(attrName) + + /** + * Index of an attribute specified by name. + * Throws NoSuchElementException if no attribute has this name. + */ + def indexOf(attrName: String): Int = nameToIndex(attrName) + + /** Gets an attribute by name. */ + def apply(attrName: String): Attribute = { + attributes.get(indexOf(attrName)) + } + /** Gets an attribute by index. */ + def apply(attrIndex: Int): Attribute = attributes.get(attrIndex) + + /** + * Converts to metadata without name. + * + * When attributes are defined they are grouped by type (numeric, nominal, binary) under + * ATTRIBUTES, and NUM_ATTRIBUTES records the total number of attributes. When only the number + * of attributes is defined, just NUM_ATTRIBUTES is written. + */ + private[attribute] def toMetadata: Metadata = { + import AttributeKeys._ + val bldr = new MetadataBuilder() + if (attributes.isDefined) { + val numericMetadata = ArrayBuffer.empty[Metadata] + val nominalMetadata = ArrayBuffer.empty[Metadata] + val binaryMetadata = ArrayBuffer.empty[Metadata] + attributes.get.foreach { + case numeric: NumericAttribute => + // Skip default numeric attributes. 
+ if (numeric.withoutIndex != NumericAttribute.defaultAttr) { + numericMetadata += numeric.toMetadata(withType = false) + } + case nominal: NominalAttribute => + nominalMetadata += nominal.toMetadata(withType = false) + case binary: BinaryAttribute => + binaryMetadata += binary.toMetadata(withType = false) + } + val attrBldr = new MetadataBuilder + if (numericMetadata.nonEmpty) { + attrBldr.putMetadataArray(AttributeType.Numeric.name, numericMetadata.toArray) + } + if (nominalMetadata.nonEmpty) { + attrBldr.putMetadataArray(AttributeType.Nominal.name, nominalMetadata.toArray) + } + if (binaryMetadata.nonEmpty) { + attrBldr.putMetadataArray(AttributeType.Binary.name, binaryMetadata.toArray) + } + bldr.putMetadata(ATTRIBUTES, attrBldr.build()) + bldr.putLong(NUM_ATTRIBUTES, attributes.get.length) + } else if (numAttributes.isDefined) { + bldr.putLong(NUM_ATTRIBUTES, numAttributes.get) + } + bldr.build() + } + + /** Converts to a StructField with some existing metadata. */ + def toStructField(existingMetadata: Metadata): StructField = { + val newMetadata = new MetadataBuilder() + .withMetadata(existingMetadata) + .putMetadata(AttributeKeys.ML_ATTR, toMetadata) + .build() + StructField(name, new VectorUDT, nullable = false, newMetadata) + } + + /** Converts to a StructField. */ + def toStructField(): StructField = toStructField(Metadata.empty) + /** + * Two groups are equal when their name, numAttributes and attributes are equal. The attribute + * arrays are compared as sequences because arrays themselves compare by reference. + */ + override def equals(other: Any): Boolean = { + other match { + case o: AttributeGroup => + (name == o.name) && + (numAttributes == o.numAttributes) && + (attributes.map(_.toSeq) == o.attributes.map(_.toSeq)) + case _ => + false + } + } + /** Hash code combining the same three components that `equals` compares. */ + override def hashCode: Int = { + var sum = 17 + sum = 37 * sum + name.hashCode + sum = 37 * sum + numAttributes.hashCode + sum = 37 * sum + attributes.map(_.toSeq).hashCode + sum + } +} + +/** Factory methods to create attribute groups. */ +object AttributeGroup { + + import AttributeKeys._ + + /** + * Creates an attribute group from a [[Metadata]] instance with name. 
+ * + * When ATTRIBUTES is present, an array of NUM_ATTRIBUTES slots is filled from the stored + * numeric, nominal and binary attributes by their index; slots left empty are set to + * `NumericAttribute.defaultAttr`. + */ + private[attribute] def fromMetadata(metadata: Metadata, name: String): AttributeGroup = { + import org.apache.spark.ml.attribute.AttributeType._ + if (metadata.contains(ATTRIBUTES)) { + val numAttrs = metadata.getLong(NUM_ATTRIBUTES).toInt + val attributes = new Array[Attribute](numAttrs) + val attrMetadata = metadata.getMetadata(ATTRIBUTES) + if (attrMetadata.contains(Numeric.name)) { + attrMetadata.getMetadataArray(Numeric.name) + .map(NumericAttribute.fromMetadata) + .foreach { attr => + attributes(attr.index.get) = attr + } + } + if (attrMetadata.contains(Nominal.name)) { + attrMetadata.getMetadataArray(Nominal.name) + .map(NominalAttribute.fromMetadata) + .foreach { attr => + attributes(attr.index.get) = attr + } + } + if (attrMetadata.contains(Binary.name)) { + attrMetadata.getMetadataArray(Binary.name) + .map(BinaryAttribute.fromMetadata) + .foreach { attr => + attributes(attr.index.get) = attr + } + } + var i = 0 + while (i < numAttrs) { + if (attributes(i) == null) { + attributes(i) = NumericAttribute.defaultAttr + } + i += 1 + } + new AttributeGroup(name, attributes) + } else if (metadata.contains(NUM_ATTRIBUTES)) { + new AttributeGroup(name, metadata.getLong(NUM_ATTRIBUTES).toInt) + } else { + new AttributeGroup(name) + } + } + + /** + * Creates an attribute group from a [[StructField]] instance. 
+ * + * The field's data type is required to equal `new VectorUDT`. If the field metadata has no + * entry under ML_ATTR, the returned group carries only the field name. + */ + def fromStructField(field: StructField): AttributeGroup = { + require(field.dataType == new VectorUDT) + if (field.metadata.contains(ML_ATTR)) { + fromMetadata(field.metadata.getMetadata(ML_ATTR), field.name) + } else { + new AttributeGroup(field.name) + } + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeKeys.scala similarity index 56% rename from mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala rename to mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeKeys.scala index 1ec9599be4696..f513d40f30a73 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureType.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeKeys.scala @@ -17,22 +17,21 @@ package org.apache.spark.ml.attribute -sealed trait FeatureType - -sealed trait ContinuousFeatureType extends FeatureType -sealed trait CategoricalFeatureType extends FeatureType -sealed trait DiscreteFeatureType extends ContinuousFeatureType - -case object Continuous extends ContinuousFeatureType -case object Categorical extends CategoricalFeatureType -case object Discrete extends DiscreteFeatureType -case object Binary extends DiscreteFeatureType with CategoricalFeatureType - -object FeatureTypes { - def withName(name: String): FeatureType = name match { - case "CONTINUOUS" => Continuous - case "CATEGORICAL" => Categorical - case "DISCRETE" => Discrete - case "BINARY" => Binary - } +/** + * Keys used to store attributes. 
+ * + * ML_ATTR is the key under which `AttributeGroup.toStructField` nests the group's metadata; + * ATTRIBUTES and NUM_ATTRIBUTES are the keys written by `AttributeGroup.toMetadata`. + */ +private[attribute] object AttributeKeys { + final val ML_ATTR: String = "ml_attr" + final val TYPE: String = "type" + final val NAME: String = "name" + final val INDEX: String = "idx" + final val VALUES: String = "vals" + final val MIN: String = "min" + final val MAX: String = "max" + final val STD: String = "std" + final val SPARSITY: String = "sparsity" + final val ORDINAL: String = "ord" + final val CARDINALITY: String = "card" + final val ATTRIBUTES: String = "attrs" + final val NUM_ATTRIBUTES: String = "num_attrs" } diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala new file mode 100644 index 0000000000000..1f414459b67b9 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.attribute + +/** + * An enum-like type for attribute types: [[AttributeType$#Numeric]], [[AttributeType$#Nominal]], + * and [[AttributeType$#Binary]]. + */ +sealed abstract class AttributeType(val name: String) + +object AttributeType { + + /** Numeric type. 
*/ + final val Numeric: AttributeType = { + case object Numeric extends AttributeType("numeric") + Numeric + } + + /** Nominal type. */ + final val Nominal: AttributeType = { + case object Nominal extends AttributeType("nominal") + Nominal + } + + /** Binary type. */ + final val Binary: AttributeType = { + case object Binary extends AttributeType("binary") + Binary + } + + /** + * Gets the [[AttributeType]] object from its name. + * @param name attribute type name: "numeric", "nominal", or "binary" + */ + def fromName(name: String): AttributeType = { + if (name == Numeric.name) { + Numeric + } else if (name == Nominal.name) { + Nominal + } else if (name == Binary.name) { + Binary + } else { + throw new IllegalArgumentException(s"Cannot recognize type $name.") + } + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala deleted file mode 100644 index 4c54839863211..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/CategoricalAttribute.scala +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.ml.attribute - -import org.apache.spark.sql.types.Metadata - -class CategoricalAttribute private ( - override val index: Int, - override val name: Option[String], - override val dimension: Int, - val categories: Option[Array[String]], - val cardinality: Option[Int]) extends Attribute(index, name, dimension) { - - require(!categories.isDefined || categories.get.nonEmpty) - require(!cardinality.isDefined || cardinality.get > 0) - - override def featureType: FeatureType = Categorical - - override def toMetadata(): Metadata = { - val builder = toBaseMetadata() - if (categories.isDefined) { - builder.putStringArray("categories", categories.get) - } - if (cardinality.isDefined) { - builder.putLong("cardinality", cardinality.get) - } - builder.build() - } - -} - -private[attribute] object CategoricalAttribute { - - def fromMetadata(metadata: Metadata): CategoricalAttribute = { - val (index, name, dimension) = Attribute.parseBaseMetadata(metadata) - - var cardinality: Option[Int] = - if (metadata.contains("cardinality")) { - Some(metadata.getLong("cardinality").toInt) - } else { - None - } - - val categories: Option[Array[String]] = - if (metadata.contains("categories")) { - val theCategories = Some(metadata.getStringArray("categories")) - if (cardinality.isDefined) { - require(theCategories.get.size <= cardinality.get) - } else { - cardinality = Some(theCategories.get.size) - } - theCategories - } else { - None - } - - new CategoricalAttribute(index, name, dimension, categories, cardinality) - } - -} \ No newline at end of file diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala deleted file mode 100644 index 137cc62921863..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/ContinuousAttribute.scala +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - 
* contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.attribute - -import org.apache.spark.sql.types.Metadata - -class ContinuousAttribute private ( - override val index: Int, - override val name: Option[String], - override val dimension: Int, - val min: Option[Double], - val max: Option[Double]) extends Attribute(index, name, dimension) { - - if (min.isDefined && max.isDefined) { - require(min.get <= max.get) - } - - override def featureType(): FeatureType = Continuous - - override def toMetadata(): Metadata = { - val builder = toBaseMetadata() - if (min.isDefined) { - builder.putDouble("min", min.get) - } - if (max.isDefined) { - builder.putDouble("max", max.get) - } - builder.build() - } - -} - -private[attribute] object ContinuousAttribute { - - def fromMetadata(metadata: Metadata): ContinuousAttribute = { - val (index, name, dimension) = Attribute.parseBaseMetadata(metadata) - val min = if (metadata.contains("min")) Some(metadata.getDouble("min")) else None - val max = if (metadata.contains("max")) Some(metadata.getDouble("max")) else None - new ContinuousAttribute(index, name, dimension, min, max) - } - -} diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala 
b/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala deleted file mode 100644 index fe69a6dc27203..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/FeatureAttributes.scala +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.attribute - -import org.apache.spark.sql.types.{MetadataBuilder, Metadata} - -/** - * Representation of specialized information in a [[Metadata]] concerning - * data as machine learning features, with methods to access their associated attributes, like: - * - * - type (continuous, categorical, etc.) as [[FeatureType]] - * - optional feature name - * - for categorical features, the category values - * - for continuous values, maximum and minimum value - * - dimension for vector-valued features - * - * This information is stored as a [[Metadata]] under key "features", and contains an array of - * [[Metadata]] inside that for each feature for which metadata is defined. Example: - * - * {{{ - * { - * ... 
- * "features" : [ - * { - * "index": 0, - * "name": "age", - * "type": "CONTINUOUS", - * "min": 0 - * }, - * { - * "index": 5, - * "name": "gender", - * "type": "CATEGORICAL", - * "categories" : [ "male", "female" ] - * }, - * { - * "index": 6, - * "name": "customerType", - * "type": "CATEGORICAL", - * "cardinality": 10 - * }, - * { - * "index": 7, - * "name": "percentAllocations", - * "type": "CONTINUOUS", - * "dimension": 10, - * "min": 0, - * "max": 1 - * ] - * "producer": "..." - * ... - * } - * }}} - */ -class FeatureAttributes private (val attributes: Array[Attribute], - val producer: Option[String]) { - - private val nameToIndex: Map[String,Int] = - attributes.filter(_.name.isDefined).map(att => (att.name.get, att.index)).toMap - private val indexToAttribute: Map[Int,Attribute] = - attributes.map(att => (att.index, att)).toMap - private val categoricalIndices: Array[Int] = - attributes.filter(_.featureType match { - case c: CategoricalFeatureType => true - case _ => false - }).map(_.index) - - def getFeatureAttribute(index: Int): Option[Attribute] = indexToAttribute.get(index) - - def getFeatureIndex(featureName: String): Option[Int] = nameToIndex.get(featureName) - - def categoricalFeatureIndices(): Array[Int] = categoricalIndices - - def toMetadata(): Metadata = { - val builder = new MetadataBuilder() - builder.putMetadataArray("features", attributes.map(_.toMetadata())) - if (producer.isDefined) { - builder.putString("producer", producer.get) - } - builder.build() - } - -} - -object FeatureAttributes { - - def fromMetadata(metadata: Metadata): FeatureAttributes = { - val attributes = metadata.getMetadataArray("features").map(Attribute.fromMetadata(_)) - val producer = - if (metadata.contains("producer")) Some(metadata.getString("producer")) else None - new FeatureAttributes(attributes, producer) - } - -} \ No newline at end of file diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala 
b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala new file mode 100644 index 0000000000000..43a363de6cedf --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala @@ -0,0 +1,488 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.attribute + +import scala.annotation.varargs + +import org.apache.spark.sql.types.{DoubleType, Metadata, MetadataBuilder, StructField} + +/** + * Abstract class for ML attributes. + */ +sealed abstract class Attribute extends Serializable { + + /** Attribute type. */ + def attrType: AttributeType + + /** Name of the attribute. None if it is not set. */ + def name: Option[String] + + /** Copy with a new name. */ + def withName(name: String): Attribute + + /** Copy without the name. */ + def withoutName: Attribute + + /** Index of the attribute. None if it is not set. */ + def index: Option[Int] + + /** Copy with a new index. */ + def withIndex(index: Int): Attribute + + /** Copy without the index. */ + def withoutIndex: Attribute + + /** + * Tests whether this attribute is numeric, true for [[NumericAttribute]] and [[BinaryAttribute]]. 
+ */ + def isNumeric: Boolean + + /** + * Tests whether this attribute is nominal, true for [[NominalAttribute]] and [[BinaryAttribute]]. + */ + def isNominal: Boolean + + /** + * Converts this attribute to [[Metadata]]. + * @param withType whether to include the type info + */ + private[attribute] def toMetadata(withType: Boolean): Metadata + + /** + * Converts this attribute to [[Metadata]]. For numeric attributes, the type info is excluded to + * save space, because numeric type is the default attribute type. For nominal and binary + * attributes, the type info is included. + */ + private[attribute] def toMetadata(): Metadata = { + if (attrType == AttributeType.Numeric) { + toMetadata(withType = false) + } else { + toMetadata(withType = true) + } + } + + /** + * Converts to a [[StructField]] with some existing metadata. + * @param existingMetadata existing metadata to carry over + */ + def toStructField(existingMetadata: Metadata): StructField = { + val newMetadata = new MetadataBuilder() + .withMetadata(existingMetadata) + .putMetadata(AttributeKeys.ML_ATTR, withoutName.withoutIndex.toMetadata()) + .build() + StructField(name.get, DoubleType, nullable = false, newMetadata) + } + + /** Converts to a [[StructField]]. */ + def toStructField(): StructField = toStructField(Metadata.empty) + + override def toString: String = toMetadata(withType = true).toString +} + +/** Trait for ML attribute factories. */ +private[attribute] trait AttributeFactory { + + /** + * Creates an [[Attribute]] from a [[Metadata]] instance. + */ + private[attribute] def fromMetadata(metadata: Metadata): Attribute + + /** + * Creates an [[Attribute]] from a [[StructField]] instance. 
+ */ + def fromStructField(field: StructField): Attribute = { + require(field.dataType == DoubleType) + fromMetadata(field.metadata.getMetadata(AttributeKeys.ML_ATTR)).withName(field.name) + } +} + +object Attribute extends AttributeFactory { + + private[attribute] override def fromMetadata(metadata: Metadata): Attribute = { + import org.apache.spark.ml.attribute.AttributeKeys._ + val attrType = if (metadata.contains(TYPE)) { + metadata.getString(TYPE) + } else { + AttributeType.Numeric.name + } + getFactory(attrType).fromMetadata(metadata) + } + + /** Gets the attribute factory given the attribute type name. */ + private def getFactory(attrType: String): AttributeFactory = { + if (attrType == AttributeType.Numeric.name) { + NumericAttribute + } else if (attrType == AttributeType.Nominal.name) { + NominalAttribute + } else if (attrType == AttributeType.Binary.name) { + BinaryAttribute + } else { + throw new IllegalArgumentException(s"Cannot recognize type $attrType.") + } + } +} + + +/** + * A numeric attribute with optional summary statistics. + * @param name optional name + * @param index optional index + * @param min optional min value + * @param max optional max value + * @param std optional standard deviation + * @param sparsity optional sparsity (ratio of zeros) + */ +class NumericAttribute private[ml] ( + override val name: Option[String] = None, + override val index: Option[Int] = None, + val min: Option[Double] = None, + val max: Option[Double] = None, + val std: Option[Double] = None, + val sparsity: Option[Double] = None) extends Attribute { + + override def attrType: AttributeType = AttributeType.Numeric + + override def withName(name: String): NumericAttribute = copy(name = Some(name)) + override def withoutName: NumericAttribute = copy(name = None) + + override def withIndex(index: Int): NumericAttribute = copy(index = Some(index)) + override def withoutIndex: NumericAttribute = copy(index = None) + + /** Copy with a new min value. 
*/ + def withMin(min: Double): NumericAttribute = copy(min = Some(min)) + + /** Copy without the min value. */ + def withoutMin: NumericAttribute = copy(min = None) + + + /** Copy with a new max value. */ + def withMax(max: Double): NumericAttribute = copy(max = Some(max)) + + /** Copy without the max value. */ + def withoutMax: NumericAttribute = copy(max = None) + + /** Copy with a new standard deviation. */ + def withStd(std: Double): NumericAttribute = copy(std = Some(std)) + + /** Copy without the standard deviation. */ + def withoutStd: NumericAttribute = copy(std = None) + + /** Copy with a new sparsity. */ + def withSparsity(sparsity: Double): NumericAttribute = copy(sparsity = Some(sparsity)) + + /** Copy without the sparsity. */ + def withoutSparsity: NumericAttribute = copy(sparsity = None) + + /** Copy without summary statistics. */ + def withoutSummary: NumericAttribute = copy(min = None, max = None, std = None, sparsity = None) + + override def isNumeric: Boolean = true + + override def isNominal: Boolean = false + + /** Convert this attribute to metadata. */ + private[attribute] override def toMetadata(withType: Boolean): Metadata = { + import org.apache.spark.ml.attribute.AttributeKeys._ + val bldr = new MetadataBuilder() + if (withType) bldr.putString(TYPE, attrType.name) + name.foreach(bldr.putString(NAME, _)) + index.foreach(bldr.putLong(INDEX, _)) + min.foreach(bldr.putDouble(MIN, _)) + max.foreach(bldr.putDouble(MAX, _)) + std.foreach(bldr.putDouble(STD, _)) + sparsity.foreach(bldr.putDouble(SPARSITY, _)) + bldr.build() + } + + /** Creates a copy of this attribute with optional changes. 
*/ + private def copy( + name: Option[String] = name, + index: Option[Int] = index, + min: Option[Double] = min, + max: Option[Double] = max, + std: Option[Double] = std, + sparsity: Option[Double] = sparsity): NumericAttribute = { + new NumericAttribute(name, index, min, max, std, sparsity) + } + + override def equals(other: Any): Boolean = { + other match { + case o: NumericAttribute => + (name == o.name) && + (index == o.index) && + (min == o.min) && + (max == o.max) && + (std == o.std) && + (sparsity == o.sparsity) + case _ => + false + } + } + + override def hashCode: Int = { + var sum = 17 + sum = 37 * sum + name.hashCode + sum = 37 * sum + index.hashCode + sum = 37 * sum + min.hashCode + sum = 37 * sum + max.hashCode + sum = 37 * sum + std.hashCode + sum = 37 * sum + sparsity.hashCode + sum + } +} + +/** + * Factory methods for numeric attributes. + */ +object NumericAttribute extends AttributeFactory { + + /** The default numeric attribute. */ + val defaultAttr: NumericAttribute = new NumericAttribute + + private[attribute] override def fromMetadata(metadata: Metadata): NumericAttribute = { + import org.apache.spark.ml.attribute.AttributeKeys._ + val name = if (metadata.contains(NAME)) Some(metadata.getString(NAME)) else None + val index = if (metadata.contains(INDEX)) Some(metadata.getLong(INDEX).toInt) else None + val min = if (metadata.contains(MIN)) Some(metadata.getDouble(MIN)) else None + val max = if (metadata.contains(MAX)) Some(metadata.getDouble(MAX)) else None + val std = if (metadata.contains(STD)) Some(metadata.getDouble(STD)) else None + val sparsity = if (metadata.contains(SPARSITY)) Some(metadata.getDouble(SPARSITY)) else None + new NumericAttribute(name, index, min, max, std, sparsity) + } +} + +/** + * A nominal attribute. 
+ * @param name optional name + * @param index optional index + * @param isOrdinal whether this attribute is ordinal (optional) + * @param cardinality optional number of values + * @param values optional values + */ +class NominalAttribute private[ml] ( + override val name: Option[String] = None, + override val index: Option[Int] = None, + val isOrdinal: Option[Boolean] = None, + val cardinality: Option[Int] = None, + val values: Option[Array[String]] = None) extends Attribute { + + override def attrType: AttributeType = AttributeType.Nominal + + override def isNumeric: Boolean = false + + override def isNominal: Boolean = true + + private lazy val valueToIndex: Map[String, Int] = { + values.map(_.zipWithIndex.toMap).getOrElse(Map.empty) + } + + /** Index of a specific value. */ + def indexOf(value: String): Int = { + valueToIndex(value) + } + + /** Tests whether this attribute contains a specific value. */ + def hasValue(value: String): Boolean = valueToIndex.contains(value) + + /** Copy with new values. */ + def withValues(values: Array[String]): NominalAttribute = { + copy(cardinality = None, values = Some(values)) + } + + /** Copy with new values. */ + @varargs + def withValues(first: String, others: String*): NominalAttribute = { + copy(cardinality = None, values = Some((first +: others).toArray)) + } + + /** Copy without the values. */ + def withoutValues: NominalAttribute = { + copy(values = None) + } + + /** Copy with a new cardinality. */ + def withCardinality(cardinality: Int): NominalAttribute = { + if (values.isDefined) { + throw new IllegalArgumentException("Cannot copy with cardinality if values are defined.") + } else { + copy(cardinality = Some(cardinality)) + } + } + + /** Copy without the cardinality. */ + def withoutCardinality: NominalAttribute = copy(cardinality = None) + + /** Creates a copy of this attribute with optional changes. 
*/ + private def copy( + name: Option[String] = name, + index: Option[Int] = index, + isOrdinal: Option[Boolean] = isOrdinal, + cardinality: Option[Int] = cardinality, + values: Option[Array[String]] = values): NominalAttribute = { + new NominalAttribute(name, index, isOrdinal, cardinality, values) + } + + override def withName(name: String): NominalAttribute = copy(name = Some(name)) + override def withoutName: NominalAttribute = copy(name = None) + + override def withIndex(index: Int): NominalAttribute = copy(index = Some(index)) + override def withoutIndex: NominalAttribute = copy(index = None) + + private[attribute] override def toMetadata(withType: Boolean): Metadata = { + import org.apache.spark.ml.attribute.AttributeKeys._ + val bldr = new MetadataBuilder() + if (withType) bldr.putString(TYPE, attrType.name) + name.foreach(bldr.putString(NAME, _)) + index.foreach(bldr.putLong(INDEX, _)) + isOrdinal.foreach(bldr.putBoolean(ORDINAL, _)) + cardinality.foreach(bldr.putLong(CARDINALITY, _)) + values.foreach(v => bldr.putStringArray(VALUES, v)) + bldr.build() + } + + override def equals(other: Any): Boolean = { + other match { + case o: NominalAttribute => + (name == o.name) && + (index == o.index) && + (isOrdinal == o.isOrdinal) && + (cardinality == o.cardinality) && + (values.map(_.toSeq) == o.values.map(_.toSeq)) + case _ => + false + } + } + + override def hashCode: Int = { + var sum = 17 + sum = 37 * sum + name.hashCode + sum = 37 * sum + index.hashCode + sum = 37 * sum + isOrdinal.hashCode + sum = 37 * sum + cardinality.hashCode + sum = 37 * sum + values.map(_.toSeq).hashCode + sum + } +} + +/** Factory methods for nominal attributes. */ +object NominalAttribute extends AttributeFactory { + + /** The default nominal attribute. 
*/ + final val defaultAttr: NominalAttribute = new NominalAttribute + + private[attribute] override def fromMetadata(metadata: Metadata): NominalAttribute = { + import org.apache.spark.ml.attribute.AttributeKeys._ + val name = if (metadata.contains(NAME)) Some(metadata.getString(NAME)) else None + val index = if (metadata.contains(INDEX)) Some(metadata.getLong(INDEX).toInt) else None + val isOrdinal = if (metadata.contains(ORDINAL)) Some(metadata.getBoolean(ORDINAL)) else None + val cardinality = + if (metadata.contains(CARDINALITY)) Some(metadata.getLong(CARDINALITY).toInt) else None + val values = + if (metadata.contains(VALUES)) Some(metadata.getStringArray(VALUES)) else None + new NominalAttribute(name, index, isOrdinal, cardinality, values) + } +} + +/** + * A binary attribute. + * @param name optional name + * @param index optional index + * @param values optional values. If set, its size must be 2. + */ +class BinaryAttribute private[ml] ( + override val name: Option[String] = None, + override val index: Option[Int] = None, + val values: Option[Array[String]] = None) + extends Attribute { + + override def attrType: AttributeType = AttributeType.Binary + + override def isNumeric: Boolean = true + + override def isNominal: Boolean = true + + override def withName(name: String): BinaryAttribute = copy(name = Some(name)) + override def withoutName: BinaryAttribute = copy(name = None) + + override def withIndex(index: Int): BinaryAttribute = copy(index = Some(index)) + override def withoutIndex: BinaryAttribute = copy(index = None) + + /** + * Copy with new values. + * @param negative name for negative + * @param positive name for positive + */ + def withValues(negative: String, positive: String): BinaryAttribute = + copy(values = Some(Array(negative, positive))) + + /** Copy without the values. */ + def withoutValues: BinaryAttribute = copy(values = None) + + /** Creates a copy of this attribute with optional changes. 
*/ + private def copy( + name: Option[String] = name, + index: Option[Int] = index, + values: Option[Array[String]] = values): BinaryAttribute = { + new BinaryAttribute(name, index, values) + } + + private[attribute] override def toMetadata(withType: Boolean): Metadata = { + import org.apache.spark.ml.attribute.AttributeKeys._ + val bldr = new MetadataBuilder + if (withType) bldr.putString(TYPE, attrType.name) + name.foreach(bldr.putString(NAME, _)) + index.foreach(bldr.putLong(INDEX, _)) + values.foreach(v => bldr.putStringArray(VALUES, v)) + bldr.build() + } + + override def equals(other: Any): Boolean = { + other match { + case o: BinaryAttribute => + (name == o.name) && + (index == o.index) && + (values.map(_.toSeq) == o.values.map(_.toSeq)) + case _ => + false + } + } + + override def hashCode: Int = { + var sum = 17 + sum = 37 * sum + name.hashCode + sum = 37 * sum + index.hashCode + sum = 37 * sum + values.map(_.toSeq).hashCode + sum + } +} + +/** Factory methods for binary attributes. */ +object BinaryAttribute extends AttributeFactory { + + /** The default binary attribute. 
*/ + final val defaultAttr: BinaryAttribute = new BinaryAttribute + + private[attribute] override def fromMetadata(metadata: Metadata): BinaryAttribute = { + import org.apache.spark.ml.attribute.AttributeKeys._ + val name = if (metadata.contains(NAME)) Some(metadata.getString(NAME)) else None + val index = if (metadata.contains(INDEX)) Some(metadata.getLong(INDEX).toInt) else None + val values = + if (metadata.contains(VALUES)) Some(metadata.getStringArray(VALUES)) else None + new BinaryAttribute(name, index, values) + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/package.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/package.scala new file mode 100644 index 0000000000000..7ac21d7d563f2 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/package.scala @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml + +import org.apache.spark.sql.DataFrame +import org.apache.spark.ml.attribute.{Attribute, AttributeGroup} + +/** + * ==ML attributes== + * + * The ML pipeline API uses [[DataFrame]]s as ML datasets. + * Each dataset consists of typed columns, e.g., string, double, vector, etc. 
+ * However, knowing only the column type may not be sufficient to handle the data properly. + * For instance, a double column with values 0.0, 1.0, 2.0, ... may represent some label indices, + * which cannot be treated as numeric values in ML algorithms, and, for another instance, we may + * want to know the names and types of features stored in a vector column. + * ML attributes are used to provide additional information to describe columns in a dataset. + * + * ===ML columns=== + * + * A column with ML attributes attached is called an ML column. + * The data in ML columns are stored as double values, i.e., an ML column is either a scalar column + * of double values or a vector column. + * Columns of other types must be encoded into ML columns using transformers. + * We use [[Attribute]] to describe a scalar ML column, and [[AttributeGroup]] to describe a vector + * ML column. + * ML attributes are stored in the metadata field of the column schema. + */ +package object attribute diff --git a/mllib/src/test/java/org/apache/spark/ml/attribute/JavaAttributeSuite.java b/mllib/src/test/java/org/apache/spark/ml/attribute/JavaAttributeSuite.java new file mode 100644 index 0000000000000..b74bbed231434 --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/attribute/JavaAttributeSuite.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.attribute; + +import org.junit.Test; +import org.junit.Assert; + +public class JavaAttributeSuite { + + @Test + public void testAttributeType() { + AttributeType numericType = AttributeType.Numeric(); + AttributeType nominalType = AttributeType.Nominal(); + AttributeType binaryType = AttributeType.Binary(); + Assert.assertEquals(numericType, NumericAttribute.defaultAttr().attrType()); + Assert.assertEquals(nominalType, NominalAttribute.defaultAttr().attrType()); + Assert.assertEquals(binaryType, BinaryAttribute.defaultAttr().attrType()); + } + + @Test + public void testNumericAttribute() { + NumericAttribute attr = NumericAttribute.defaultAttr() + .withName("age").withIndex(0).withMin(0.0).withMax(1.0).withStd(0.5).withSparsity(0.4); + Assert.assertEquals(attr.withoutIndex(), Attribute.fromStructField(attr.toStructField())); + } + + @Test + public void testNominalAttribute() { + NominalAttribute attr = NominalAttribute.defaultAttr() + .withName("size").withIndex(1).withValues("small", "medium", "large"); + Assert.assertEquals(attr.withoutIndex(), Attribute.fromStructField(attr.toStructField())); + } + + @Test + public void testBinaryAttribute() { + BinaryAttribute attr = BinaryAttribute.defaultAttr() + .withName("clicked").withIndex(2).withValues("no", "yes"); + Assert.assertEquals(attr.withoutIndex(), Attribute.fromStructField(attr.toStructField())); + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeGroupSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeGroupSuite.scala new file mode 100644 index 0000000000000..4400f32d3187e --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeGroupSuite.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.attribute + +import org.scalatest.FunSuite + +class AttributeGroupSuite extends FunSuite { + + test("attribute group") { + val attrs = Array( + NumericAttribute.defaultAttr, + NominalAttribute.defaultAttr, + BinaryAttribute.defaultAttr.withIndex(0), + NumericAttribute.defaultAttr.withName("age").withSparsity(0.8), + NominalAttribute.defaultAttr.withName("size").withValues("small", "medium", "large"), + BinaryAttribute.defaultAttr.withName("clicked").withValues("no", "yes"), + NumericAttribute.defaultAttr, + NumericAttribute.defaultAttr) + val group = new AttributeGroup("user", attrs) + assert(group.size === 8) + assert(group.name === "user") + assert(group(0) === NumericAttribute.defaultAttr.withIndex(0)) + assert(group(2) === BinaryAttribute.defaultAttr.withIndex(2)) + assert(group.indexOf("age") === 3) + assert(group.indexOf("size") === 4) + assert(group.indexOf("clicked") === 5) + assert(!group.hasAttr("abc")) + intercept[NoSuchElementException] { + group("abc") + } + assert(group === AttributeGroup.fromMetadata(group.toMetadata, group.name)) + assert(group === AttributeGroup.fromStructField(group.toStructField())) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala new file mode 100644 index 0000000000000..2552d1dba0403 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.attribute + +import org.scalatest.FunSuite + +import org.apache.spark.sql.types.{DoubleType, MetadataBuilder, Metadata} + +class AttributeSuite extends FunSuite { + + test("default numeric attribute") { + val attr: NumericAttribute = NumericAttribute.defaultAttr + val metadata = Metadata.fromJson("{}") + val metadataWithType = Metadata.fromJson("""{"type":"numeric"}""") + assert(attr.attrType === AttributeType.Numeric) + assert(attr.isNumeric) + assert(!attr.isNominal) + assert(attr.name.isEmpty) + assert(attr.index.isEmpty) + assert(attr.min.isEmpty) + assert(attr.max.isEmpty) + assert(attr.std.isEmpty) + assert(attr.sparsity.isEmpty) + assert(attr.toMetadata() === metadata) + assert(attr.toMetadata(withType = false) === metadata) + assert(attr.toMetadata(withType = true) === metadataWithType) + assert(attr === Attribute.fromMetadata(metadata)) + assert(attr === Attribute.fromMetadata(metadataWithType)) + intercept[NoSuchElementException] { + attr.toStructField() + } + } + + test("customized numeric attribute") { + val name = "age" + val index = 0 + val metadata = Metadata.fromJson("""{"name":"age","idx":0}""") + val metadataWithType = Metadata.fromJson("""{"type":"numeric","name":"age","idx":0}""") + val attr: NumericAttribute = NumericAttribute.defaultAttr + .withName(name) + .withIndex(index) + assert(attr.attrType == AttributeType.Numeric) + assert(attr.isNumeric) + assert(!attr.isNominal) + assert(attr.name === Some(name)) + assert(attr.index === Some(index)) + assert(attr.toMetadata() === metadata) + 
assert(attr.toMetadata(withType = false) === metadata) + assert(attr.toMetadata(withType = true) === metadataWithType) + assert(attr === Attribute.fromMetadata(metadata)) + assert(attr === Attribute.fromMetadata(metadataWithType)) + val field = attr.toStructField() + assert(field.dataType === DoubleType) + assert(!field.nullable) + assert(attr.withoutIndex === Attribute.fromStructField(field)) + val existingMetadata = new MetadataBuilder() + .putString("name", "test") + .build() + assert(attr.toStructField(existingMetadata).metadata.getString("name") === "test") + + val attr2 = + attr.withoutName.withoutIndex.withMin(0.0).withMax(1.0).withStd(0.5).withSparsity(0.3) + assert(attr2.name.isEmpty) + assert(attr2.index.isEmpty) + assert(attr2.min === Some(0.0)) + assert(attr2.max === Some(1.0)) + assert(attr2.std === Some(0.5)) + assert(attr2.sparsity === Some(0.3)) + assert(attr2 === Attribute.fromMetadata(attr2.toMetadata())) + } + + test("default nominal attribute") { + val attr: NominalAttribute = NominalAttribute.defaultAttr + val metadata = Metadata.fromJson("""{"type":"nominal"}""") + val metadataWithoutType = Metadata.fromJson("{}") + assert(attr.attrType === AttributeType.Nominal) + assert(!attr.isNumeric) + assert(attr.isNominal) + assert(attr.name.isEmpty) + assert(attr.index.isEmpty) + assert(attr.values.isEmpty) + assert(attr.cardinality.isEmpty) + assert(attr.isOrdinal.isEmpty) + assert(attr.toMetadata() === metadata) + assert(attr.toMetadata(withType = true) === metadata) + assert(attr.toMetadata(withType = false) === metadataWithoutType) + assert(attr === Attribute.fromMetadata(metadata)) + assert(attr === NominalAttribute.fromMetadata(metadataWithoutType)) + intercept[NoSuchElementException] { + attr.toStructField() + } + } + + test("customized nominal attribute") { + val name = "size" + val index = 1 + val values = Array("small", "medium", "large") + val metadata = Metadata.fromJson( + 
"""{"type":"nominal","name":"size","idx":1,"vals":["small","medium","large"]}""") + val metadataWithoutType = Metadata.fromJson( + """{"name":"size","idx":1,"vals":["small","medium","large"]}""") + val attr: NominalAttribute = NominalAttribute.defaultAttr + .withName(name) + .withIndex(index) + .withValues(values) + assert(attr.attrType === AttributeType.Nominal) + assert(!attr.isNumeric) + assert(attr.isNominal) + assert(attr.name === Some(name)) + assert(attr.index === Some(index)) + assert(attr.values === Some(values)) + assert(attr.indexOf("medium") === 1) + assert(attr.toMetadata() === metadata) + assert(attr.toMetadata(withType = true) === metadata) + assert(attr.toMetadata(withType = false) === metadataWithoutType) + assert(attr === Attribute.fromMetadata(metadata)) + assert(attr === NominalAttribute.fromMetadata(metadataWithoutType)) + assert(attr.withoutIndex === Attribute.fromStructField(attr.toStructField())) + + val attr2 = attr.withoutName.withoutIndex.withValues(attr.values.get :+ "x-large") + assert(attr2.name.isEmpty) + assert(attr2.index.isEmpty) + assert(attr2.values.get === Array("small", "medium", "large", "x-large")) + assert(attr2.indexOf("x-large") === 3) + assert(attr2 === Attribute.fromMetadata(attr2.toMetadata())) + assert(attr2 === NominalAttribute.fromMetadata(attr2.toMetadata(withType = false))) + } + + test("default binary attribute") { + val attr = BinaryAttribute.defaultAttr + val metadata = Metadata.fromJson("""{"type":"binary"}""") + val metadataWithoutType = Metadata.fromJson("{}") + assert(attr.attrType === AttributeType.Binary) + assert(attr.isNumeric) + assert(attr.isNominal) + assert(attr.name.isEmpty) + assert(attr.index.isEmpty) + assert(attr.values.isEmpty) + assert(attr.toMetadata() === metadata) + assert(attr.toMetadata(withType = true) === metadata) + assert(attr.toMetadata(withType = false) === metadataWithoutType) + assert(attr === Attribute.fromMetadata(metadata)) + assert(attr === 
BinaryAttribute.fromMetadata(metadataWithoutType)) + intercept[NoSuchElementException] { + attr.toStructField() + } + } + + test("customized binary attribute") { + val name = "clicked" + val index = 2 + val values = Array("no", "yes") + val metadata = Metadata.fromJson( + """{"type":"binary","name":"clicked","idx":2,"vals":["no","yes"]}""") + val metadataWithoutType = Metadata.fromJson( + """{"name":"clicked","idx":2,"vals":["no","yes"]}""") + val attr = BinaryAttribute.defaultAttr + .withName(name) + .withIndex(index) + .withValues(values(0), values(1)) + assert(attr.attrType === AttributeType.Binary) + assert(attr.isNumeric) + assert(attr.isNominal) + assert(attr.name === Some(name)) + assert(attr.index === Some(index)) + assert(attr.values.get === values) + assert(attr.toMetadata() === metadata) + assert(attr.toMetadata(withType = true) === metadata) + assert(attr.toMetadata(withType = false) === metadataWithoutType) + assert(attr === Attribute.fromMetadata(metadata)) + assert(attr === BinaryAttribute.fromMetadata(metadataWithoutType)) + assert(attr.withoutIndex === Attribute.fromStructField(attr.toStructField())) + } +} From b1aceefaf9738e7528c179b43bfc99c793160164 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Thu, 5 Mar 2015 19:44:13 -0800 Subject: [PATCH 4/7] more tests --- .../spark/ml/attribute/AttributeGroup.scala | 13 +++- .../spark/ml/attribute/attributes.scala | 78 ++++++++++++------- .../ml/attribute/AttributeGroupSuite.scala | 16 ++++ .../spark/ml/attribute/AttributeSuite.scala | 25 +++++- 4 files changed, 103 insertions(+), 29 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala index d3e616f7f6a7c..970e6ad5514d1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeGroup.scala @@ -36,6 +36,10 @@ class AttributeGroup private ( val 
numAttributes: Option[Int], attrs: Option[Array[Attribute]]) extends Serializable { + require(name.nonEmpty, "Cannot have an empty string for name.") + require(!(numAttributes.isDefined && attrs.isDefined), + "Cannot have both numAttributes and attrs defined.") + /** * Creates an attribute group without attribute info. * @param name name of the attribute group @@ -87,13 +91,20 @@ class AttributeGroup private ( /** Index of an attribute specified by name. */ def indexOf(attrName: String): Int = nameToIndex(attrName) - /** Gets an attribute by name. */ + /** Gets an attribute by its name. */ def apply(attrName: String): Attribute = { attributes.get(indexOf(attrName)) } + /** Gets an attribute by its name. */ + def getAttr(attrName: String): Attribute = this(attrName) + + /** Gets an attribute by its index. */ def apply(attrIndex: Int): Attribute = attributes.get(attrIndex) + /** Gets an attribute by its index. */ + def getAttr(attrIndex: Int): Attribute = this(attrIndex) + /** Converts to metadata without name. */ private[attribute] def toMetadata: Metadata = { import AttributeKeys._ diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala index 43a363de6cedf..07f6c6c256120 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala @@ -26,6 +26,13 @@ import org.apache.spark.sql.types.{DoubleType, Metadata, MetadataBuilder, Struct */ sealed abstract class Attribute extends Serializable { + name.foreach { n => + require(n.nonEmpty, "Cannot have an empty string for name.") + } + index.foreach { i => + require(i >= 0, s"Index cannot be negative but got $i") + } + /** Attribute type. 
*/ def attrType: AttributeType @@ -155,6 +162,13 @@ class NumericAttribute private[ml] ( val std: Option[Double] = None, val sparsity: Option[Double] = None) extends Attribute { + std.foreach { s => + require(s >= 0.0, s"Standard deviation cannot be negative but got $s.") + } + sparsity.foreach { s => + require(s >= 0.0 && s <= 1.0, s"Sparsity must be in [0, 1] but got $s.") + } + override def attrType: AttributeType = AttributeType.Numeric override def withName(name: String): NumericAttribute = copy(name = Some(name)) @@ -271,16 +285,23 @@ object NumericAttribute extends AttributeFactory { * @param name optional name * @param index optional index * @param isOrdinal whether this attribute is ordinal (optional) - * @param cardinality optional number of values - * @param values optional values + * @param numValues optional number of values. At most one of `numValues` and `values` can be + * defined. + * @param values optional values. At most one of `numValues` and `values` can be defined. */ class NominalAttribute private[ml] ( override val name: Option[String] = None, override val index: Option[Int] = None, val isOrdinal: Option[Boolean] = None, - val cardinality: Option[Int] = None, + val numValues: Option[Int] = None, val values: Option[Array[String]] = None) extends Attribute { + numValues.foreach { n => + require(n >= 0, s"numValues cannot be negative but got $n.") + } + require(!(numValues.isDefined && values.isDefined), + "Cannot have both numValues and values defined.") + override def attrType: AttributeType = AttributeType.Nominal override def isNumeric: Boolean = false @@ -299,15 +320,24 @@ class NominalAttribute private[ml] ( /** Tests whether this attribute contains a specific value. */ def hasValue(value: String): Boolean = valueToIndex.contains(value) - /** Copy with new values. */ + /** Gets a value given its index. 
*/ + def getValue(index: Int): String = values.get(index) + + override def withName(name: String): NominalAttribute = copy(name = Some(name)) + override def withoutName: NominalAttribute = copy(name = None) + + override def withIndex(index: Int): NominalAttribute = copy(index = Some(index)) + override def withoutIndex: NominalAttribute = copy(index = None) + + /** Copy with new values and empty `numValues`. */ def withValues(values: Array[String]): NominalAttribute = { - copy(cardinality = None, values = Some(values)) + copy(numValues = None, values = Some(values)) } - /** Copy with new vaues. */ + /** Copy with new values and empty `numValues`. */ @varargs def withValues(first: String, others: String*): NominalAttribute = { - copy(cardinality = None, values = Some((first +: others).toArray)) + copy(numValues = None, values = Some((first +: others).toArray)) } /** Copy without the values. */ @@ -315,34 +345,24 @@ class NominalAttribute private[ml] ( copy(values = None) } - /** Copy with a new cardinality. */ - def withCardinality(cardinality: Int): NominalAttribute = { - if (values.isDefined) { - throw new IllegalArgumentException("Cannot copy with cardinality if values are defined.") - } else { - copy(cardinality = Some(cardinality)) - } + /** Copy with a new `numValues` and empty `values`. */ + def withNumValues(numValues: Int): NominalAttribute = { + copy(numValues = Some(numValues), values = None) } - /** Copy without the cardinality. */ - def withoutCardinality: NominalAttribute = copy(cardinality = None) + /** Copy without the `numValues`. */ + def withoutNumValues: NominalAttribute = copy(numValues = None) /** Creates a copy of this attribute with optional changes. 
*/ private def copy( name: Option[String] = name, index: Option[Int] = index, isOrdinal: Option[Boolean] = isOrdinal, - cardinality: Option[Int] = cardinality, + numValues: Option[Int] = numValues, values: Option[Array[String]] = values): NominalAttribute = { - new NominalAttribute(name, index, isOrdinal, cardinality, values) + new NominalAttribute(name, index, isOrdinal, numValues, values) } - override def withName(name: String): NominalAttribute = copy(name = Some(name)) - override def withoutName: NominalAttribute = copy(name = None) - - override def withIndex(index: Int): NominalAttribute = copy(index = Some(index)) - override def withoutIndex: NominalAttribute = copy(index = None) - private[attribute] override def toMetadata(withType: Boolean): Metadata = { import org.apache.spark.ml.attribute.AttributeKeys._ val bldr = new MetadataBuilder() @@ -350,7 +370,7 @@ class NominalAttribute private[ml] ( name.foreach(bldr.putString(NAME, _)) index.foreach(bldr.putLong(INDEX, _)) isOrdinal.foreach(bldr.putBoolean(ORDINAL, _)) - cardinality.foreach(bldr.putLong(CARDINALITY, _)) + numValues.foreach(bldr.putLong(CARDINALITY, _)) values.foreach(v => bldr.putStringArray(VALUES, v)) bldr.build() } @@ -361,7 +381,7 @@ class NominalAttribute private[ml] ( (name == o.name) && (index == o.index) && (isOrdinal == o.isOrdinal) && - (cardinality == o.cardinality) && + (numValues == o.numValues) && (values.map(_.toSeq) == o.values.map(_.toSeq)) case _ => false @@ -373,7 +393,7 @@ class NominalAttribute private[ml] ( sum = 37 * sum + name.hashCode sum = 37 * sum + index.hashCode sum = 37 * sum + isOrdinal.hashCode - sum = 37 * sum + cardinality.hashCode + sum = 37 * sum + numValues.hashCode sum = 37 * sum + values.map(_.toSeq).hashCode sum } @@ -410,6 +430,10 @@ class BinaryAttribute private[ml] ( val values: Option[Array[String]] = None) extends Attribute { + values.foreach { v => + require(v.length == 2, s"Number of values must be 2 for a binary attribute but got ${v.toSeq}.") + } 
+ override def attrType: AttributeType = AttributeType.Binary override def isNumeric: Boolean = true diff --git a/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeGroupSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeGroupSuite.scala index 4400f32d3187e..3fb6e2ec46468 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeGroupSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeGroupSuite.scala @@ -46,4 +46,20 @@ class AttributeGroupSuite extends FunSuite { assert(group === AttributeGroup.fromMetadata(group.toMetadata, group.name)) assert(group === AttributeGroup.fromStructField(group.toStructField())) } + + test("attribute group without attributes") { + val group0 = new AttributeGroup("user", 10) + assert(group0.name === "user") + assert(group0.numAttributes === Some(10)) + assert(group0.size === 10) + assert(group0.attributes.isEmpty) + assert(group0 === AttributeGroup.fromMetadata(group0.toMetadata, group0.name)) + assert(group0 === AttributeGroup.fromStructField(group0.toStructField())) + + val group1 = new AttributeGroup("item") + assert(group1.name === "item") + assert(group1.numAttributes.isEmpty) + assert(group1.attributes.isEmpty) + assert(group1.size === -1) + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala index 2552d1dba0403..6ec35b03656f9 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/attribute/AttributeSuite.scala @@ -84,6 +84,15 @@ class AttributeSuite extends FunSuite { assert(attr2 === Attribute.fromMetadata(attr2.toMetadata())) } + test("bad numeric attributes") { + val attr = NumericAttribute.defaultAttr + intercept[IllegalArgumentException](attr.withName("")) + intercept[IllegalArgumentException](attr.withIndex(-1)) + 
intercept[IllegalArgumentException](attr.withStd(-0.1)) + intercept[IllegalArgumentException](attr.withSparsity(-0.5)) + intercept[IllegalArgumentException](attr.withSparsity(1.5)) + } + test("default nominal attribute") { val attr: NominalAttribute = NominalAttribute.defaultAttr val metadata = Metadata.fromJson("""{"type":"nominal"}""") @@ -94,7 +103,7 @@ class AttributeSuite extends FunSuite { assert(attr.name.isEmpty) assert(attr.index.isEmpty) assert(attr.values.isEmpty) - assert(attr.cardinality.isEmpty) + assert(attr.numValues.isEmpty) assert(attr.isOrdinal.isEmpty) assert(attr.toMetadata() === metadata) assert(attr.toMetadata(withType = true) === metadata) @@ -125,6 +134,7 @@ class AttributeSuite extends FunSuite { assert(attr.index === Some(index)) assert(attr.values === Some(values)) assert(attr.indexOf("medium") === 1) + assert(attr.getValue(1) === "medium") assert(attr.toMetadata() === metadata) assert(attr.toMetadata(withType = true) === metadata) assert(attr.toMetadata(withType = false) === metadataWithoutType) @@ -141,6 +151,13 @@ class AttributeSuite extends FunSuite { assert(attr2 === NominalAttribute.fromMetadata(attr2.toMetadata(withType = false))) } + test("bad nominal attributes") { + val attr = NominalAttribute.defaultAttr + intercept[IllegalArgumentException](attr.withName("")) + intercept[IllegalArgumentException](attr.withIndex(-1)) + intercept[IllegalArgumentException](attr.withNumValues(-1)) + } + test("default binary attribute") { val attr = BinaryAttribute.defaultAttr val metadata = Metadata.fromJson("""{"type":"binary"}""") @@ -186,4 +203,10 @@ class AttributeSuite extends FunSuite { assert(attr === BinaryAttribute.fromMetadata(metadataWithoutType)) assert(attr.withoutIndex === Attribute.fromStructField(attr.toStructField())) } + + test("bad binary attributes") { + val attr = BinaryAttribute.defaultAttr + intercept[IllegalArgumentException](attr.withName("")) + intercept[IllegalArgumentException](attr.withIndex(-1)) + } } From 
393ffdca897b27e4bb263bb538395a045dbedf26 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 6 Mar 2015 09:16:13 -0800 Subject: [PATCH 5/7] forgot to include Java attribute group tests --- .../ml/attribute/JavaAttributeGroupSuite.java | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 mllib/src/test/java/org/apache/spark/ml/attribute/JavaAttributeGroupSuite.java diff --git a/mllib/src/test/java/org/apache/spark/ml/attribute/JavaAttributeGroupSuite.java b/mllib/src/test/java/org/apache/spark/ml/attribute/JavaAttributeGroupSuite.java new file mode 100644 index 0000000000000..38eb58673ad53 --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/attribute/JavaAttributeGroupSuite.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.attribute; + +import org.junit.Assert; +import org.junit.Test; + +public class JavaAttributeGroupSuite { + + @Test + public void testAttributeGroup() { + Attribute[] attrs = new Attribute[]{ + NumericAttribute.defaultAttr(), + NominalAttribute.defaultAttr(), + BinaryAttribute.defaultAttr().withIndex(0), + NumericAttribute.defaultAttr().withName("age").withSparsity(0.8), + NominalAttribute.defaultAttr().withName("size").withValues("small", "medium", "large"), + BinaryAttribute.defaultAttr().withName("clicked").withValues("no", "yes"), + NumericAttribute.defaultAttr(), + NumericAttribute.defaultAttr() + }; + AttributeGroup group = new AttributeGroup("user", attrs); + Assert.assertEquals(8, group.size()); + Assert.assertEquals("user", group.name()); + Assert.assertEquals(NumericAttribute.defaultAttr().withIndex(0), group.getAttr(0)); + Assert.assertEquals(3, group.indexOf("age")); + Assert.assertFalse(group.hasAttr("abc")); + Assert.assertEquals(group, AttributeGroup.fromStructField(group.toStructField())); + } +} From 617be409e8a973a93c3ed6d9dcc1b345708e22e4 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Fri, 6 Mar 2015 14:49:03 -0800 Subject: [PATCH 6/7] remove final; rename cardinality to numValues --- .../spark/ml/attribute/AttributeKeys.scala | 26 +++++++++---------- .../spark/ml/attribute/AttributeType.scala | 6 ++--- .../spark/ml/attribute/attributes.scala | 8 +++--- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeKeys.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeKeys.scala index f513d40f30a73..f714f7becc7e6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeKeys.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeKeys.scala @@ -21,17 +21,17 @@ package org.apache.spark.ml.attribute * Keys used to store attributes. 
*/ private[attribute] object AttributeKeys { - final val ML_ATTR: String = "ml_attr" - final val TYPE: String = "type" - final val NAME: String = "name" - final val INDEX: String = "idx" - final val VALUES: String = "vals" - final val MIN: String = "min" - final val MAX: String = "max" - final val STD: String = "std" - final val SPARSITY: String = "sparsity" - final val ORDINAL: String = "ord" - final val CARDINALITY: String = "card" - final val ATTRIBUTES: String = "attrs" - final val NUM_ATTRIBUTES: String = "num_attrs" + val ML_ATTR: String = "ml_attr" + val TYPE: String = "type" + val NAME: String = "name" + val INDEX: String = "idx" + val MIN: String = "min" + val MAX: String = "max" + val STD: String = "std" + val SPARSITY: String = "sparsity" + val ORDINAL: String = "ord" + val VALUES: String = "vals" + val NUM_VALUES: String = "num_vals" + val ATTRIBUTES: String = "attrs" + val NUM_ATTRIBUTES: String = "num_attrs" } diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala index 1f414459b67b9..65e7e43d5a5b0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala @@ -26,19 +26,19 @@ sealed abstract class AttributeType(val name: String) object AttributeType { /** Numeric type. */ - final val Numeric: AttributeType = { + val Numeric: AttributeType = { case object Numeric extends AttributeType("numeric") Numeric } /** Nominal type. */ - final val Nominal: AttributeType = { + val Nominal: AttributeType = { case object Nominal extends AttributeType("nominal") Nominal } /** Binary type. 
*/ - final val Binary: AttributeType = { + val Binary: AttributeType = { case object Binary extends AttributeType("binary") Binary } diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala index 07f6c6c256120..00b7566aab434 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/attributes.scala @@ -370,7 +370,7 @@ class NominalAttribute private[ml] ( name.foreach(bldr.putString(NAME, _)) index.foreach(bldr.putLong(INDEX, _)) isOrdinal.foreach(bldr.putBoolean(ORDINAL, _)) - numValues.foreach(bldr.putLong(CARDINALITY, _)) + numValues.foreach(bldr.putLong(NUM_VALUES, _)) values.foreach(v => bldr.putStringArray(VALUES, v)) bldr.build() } @@ -410,11 +410,11 @@ object NominalAttribute extends AttributeFactory { val name = if (metadata.contains(NAME)) Some(metadata.getString(NAME)) else None val index = if (metadata.contains(INDEX)) Some(metadata.getLong(INDEX).toInt) else None val isOrdinal = if (metadata.contains(ORDINAL)) Some(metadata.getBoolean(ORDINAL)) else None - val cardinality = - if (metadata.contains(CARDINALITY)) Some(metadata.getLong(CARDINALITY).toInt) else None + val numValues = + if (metadata.contains(NUM_VALUES)) Some(metadata.getLong(NUM_VALUES).toInt) else None val values = if (metadata.contains(VALUES)) Some(metadata.getStringArray(VALUES)) else None - new NominalAttribute(name, index, isOrdinal, cardinality, values) + new NominalAttribute(name, index, isOrdinal, numValues, values) } } From 71d1bd042360376b7f8f75ba6be3471f443ce34e Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 11 Mar 2015 18:00:54 -0700 Subject: [PATCH 7/7] add JavaDoc for package ml.attribute --- .../spark/ml/attribute/package-info.java | 41 +++++++++++++++++++ project/SparkBuild.scala | 3 +- 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 
mllib/src/main/scala/org/apache/spark/ml/attribute/package-info.java diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/package-info.java b/mllib/src/main/scala/org/apache/spark/ml/attribute/package-info.java new file mode 100644 index 0000000000000..e3474f3c1d3ff --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/package-info.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The content here should be in sync with `package.scala`. + +/** + *

<h2>ML attributes</h2>

+ * + * The ML pipeline API uses {@link org.apache.spark.sql.DataFrame}s as ML datasets. + * Each dataset consists of typed columns, e.g., string, double, vector, etc. + * However, knowing only the column type may not be sufficient to handle the data properly. + * For instance, a double column with values 0.0, 1.0, 2.0, ... may represent some label indices, + * which cannot be treated as numeric values in ML algorithms, and, for another instance, we may + * want to know the names and types of features stored in a vector column. + * ML attributes are used to provide additional information to describe columns in a dataset. + * + *

<h3>ML columns</h3>

+ * + * A column with ML attributes attached is called an ML column. + * The data in ML columns are stored as double values, i.e., an ML column is either a scalar column + * of double values or a vector column. + * Columns of other types must be encoded into ML columns using transformers. + * We use {@link org.apache.spark.ml.attribute.Attribute} to describe a scalar ML column, and + * {@link org.apache.spark.ml.attribute.AttributeGroup} to describe a vector ML column. + * ML attributes are stored in the metadata field of the column schema. + */ +package org.apache.spark.ml.attribute; diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index e4b1b96527fbd..3cf8566592b64 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -402,7 +402,8 @@ object Unidoc { "mllib.tree.impurity", "mllib.tree.model", "mllib.util", "mllib.evaluation", "mllib.feature", "mllib.random", "mllib.stat.correlation", "mllib.stat.test", "mllib.tree.impl", "mllib.tree.loss", - "ml", "ml.classification", "ml.evaluation", "ml.feature", "ml.param", "ml.tuning" + "ml", "ml.attribute", "ml.classification", "ml.evaluation", "ml.feature", "ml.param", + "ml.tuning" ), "-group", "Spark SQL", packageList("sql.api.java", "sql.api.java.types", "sql.hive.api.java"), "-noqualifier", "java.lang"