Skip to content

Commit 4922722

Browse files
committed
[SPARK-5726] [MLLIB] Hadamard Vector Product Transformer
1 parent 068c315 commit 4922722

File tree

4 files changed

+263
-0
lines changed

4 files changed

+263
-0
lines changed

docs/mllib-feature-extraction.md

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,3 +477,76 @@ sc.stop();
477477
</div>
478478
</div>
479479

480+
## HadamardProduct
481+
482+
HadamardProduct scales individual vector samples by a provided weighting vector, component-wise. This represents the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_%28matrices%29) between the input vector, `v`, and the weighting vector, `w`, to yield a result vector.
483+
484+
`\[ \begin{pmatrix}
v_1 \\
\vdots \\
v_N
\end{pmatrix} \circ \begin{pmatrix}
w_1 \\
\vdots \\
w_N
\end{pmatrix}
= \begin{pmatrix}
v_1 w_1 \\
\vdots \\
v_N w_N
\end{pmatrix}
\]`
499+
500+
[`HadamardProduct`](api/scala/index.html#org.apache.spark.mllib.feature.HadamardProduct) has the following parameter in the constructor:
501+
502+
* `w` Vector, the scaling vector.
503+
504+
`HadamardProduct` implements [`VectorTransformer`](api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer) which can apply the weighting on a `Vector` to produce a transformed `Vector` or on an `RDD[Vector]` to produce a transformed `RDD[Vector]`.
505+
506+
### Example
507+
508+
The example below demonstrates how to load a simple vectors file, extract a set of vectors, and then weight each of those vectors with a weighting vector.
509+
510+
511+
<div class="codetabs">
512+
<div data-lang="scala">
513+
{% highlight scala %}
514+
import org.apache.spark.SparkContext._
515+
import org.apache.spark.mllib.feature.HadamardProduct
516+
import org.apache.spark.mllib.linalg.Vectors
517+
518+
//load and parse the data
519+
val data = sc.textFile("data/mllib/kmeans_data.txt")
520+
val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
521+
522+
val weightingVector = Vectors.dense(0.0, 1.0, 2.0)
523+
val scaler = new HadamardProduct(weightingVector)
524+
525+
// Same results:
526+
val weightedData = scaler.transform(parsedData)
527+
val weightedData2 = parsedData.map(x => scaler.transform(x))
528+
529+
{% endhighlight %}
530+
</div>
531+
532+
<div data-lang="python">
533+
{% highlight python %}
534+
from pyspark.mllib.linalg import Vectors
535+
from pyspark.mllib.feature import HadamardProduct
536+
537+
# Load and parse the data
538+
data = sc.textFile("data/mllib/kmeans_data.txt")
539+
parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.split(' ')]))
540+
541+
weightingVector = Vectors.dense(0.0, 1.0, 2.0)
542+
scaler = HadamardProduct(weightingVector)
543+
544+
# Same results:
545+
weightedData = scaler.transform(parsedData)
546+
weightedData2 = parsedData.map(lambda x: scaler.transform(x))
547+
548+
{% endhighlight %}
549+
</div>
550+
</div>
551+
552+
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.ml.feature
19+
20+
import org.apache.spark.annotation.AlphaComponent
21+
import org.apache.spark.ml.UnaryTransformer
22+
import org.apache.spark.ml.param.{Param, ParamMap}
23+
import org.apache.spark.mllib.feature.HadamardProduct
24+
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
25+
import org.apache.spark.sql.types.DataType
26+
27+
/**
 * :: AlphaComponent ::
 * Maps a vector to the Hadamard product of it and a reference vector.
 */
@AlphaComponent
class HadamardProductTF extends UnaryTransformer[Vector, Vector, HadamardProductTF] {

  /** The vector each input vector is multiplied with, component-wise. */
  val scalingVec: Param[Vector] = new Param(this, "scalingVector", "vector for hadamard product")

  /** Sets the scaling vector; returns this instance to allow call chaining. */
  def setScalingVec(value: Vector): this.type = set(scalingVec, value)

  /** Gets the currently configured scaling vector. */
  def getScalingVec: Vector = get(scalingVec)

  /**
   * Builds the transform function as a closure over an mllib HadamardProduct
   * configured with the scaling vector resolved from `paramMap`.
   */
  override protected def createTransformFunc(paramMap: ParamMap): Vector => Vector = {
    val hadScaler = new HadamardProduct(paramMap(scalingVec))
    hadScaler.transform
  }

  /** The output column holds vectors, same as the input column. */
  override protected def outputDataType: DataType = new VectorUDT()
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.mllib.feature
19+
20+
import org.apache.spark.annotation.Experimental
21+
import org.apache.spark.mllib.linalg._
22+
23+
/**
 * :: Experimental ::
 * Component-wise scaling of dense vectors by a provided vector's components.
 *
 * @param scalingVector The values used to scale the reference vector's individual components.
 */
@Experimental
class HadamardProduct(val scalingVector: Vector) extends VectorTransformer {

  /**
   * Does the hadamard product transformation.
   *
   * @param vector vector to be transformed.
   * @return transformed vector, of the same type (dense/sparse) as the input.
   */
  override def transform(vector: Vector): Vector = {
    require(vector.size == scalingVector.size,
      s"vector size ${vector.size} must match scaling vector size ${scalingVector.size}")
    vector match {
      case dv: DenseVector =>
        // Clone so the caller's vector is left untouched, then scale in place.
        val values: Array[Double] = dv.values.clone()
        val dim = values.length
        var i = 0
        while (i < dim) {
          values(i) *= scalingVector(i)
          i += 1
        }
        Vectors.dense(values)
      case SparseVector(size, indices, vs) =>
        // Only explicitly stored entries need scaling; implicit zeros stay zero.
        val values = vs.clone()
        val dim = values.length
        var i = 0
        while (i < dim) {
          values(i) *= scalingVector(indices(i))
          i += 1
        }
        Vectors.sparse(size, indices, values)
      case v => throw new IllegalArgumentException("Does not support vector type " + v.getClass)
    }
  }
}
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.mllib.feature
19+
20+
import org.apache.spark.mllib.linalg.{SparseVector, DenseVector, Vector, Vectors}
21+
import org.apache.spark.mllib.util.MLlibTestSparkContext
22+
import org.scalatest.FunSuite
23+
import org.apache.spark.mllib.util.TestingUtils._
24+
25+
class HadamardProductSuite extends FunSuite with MLlibTestSparkContext {

  // 4-dimensional dense samples, scaled by the 4-dimensional `scalingVector` below.
  val denseData = Array(
    Vectors.dense(1.0, 1.0, 0.0, 0.0),
    Vectors.dense(1.0, 2.0, -3.0, 0.0),
    Vectors.dense(1.0, 3.0, 0.0, 0.0),
    Vectors.dense(1.0, 4.0, 1.9, -9.0),
    Vectors.dense(1.0, 5.0, 0.0, 0.0)
  )

  // 3-dimensional sparse samples, scaled by a 3-dimensional vector in the sparse test.
  val sparseData = Array(
    Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
    Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))),
    Vectors.sparse(3, Seq((1, -5.1))),
    Vectors.sparse(3, Seq((0, 3.8), (2, 1.9))),
    Vectors.sparse(3, Seq((0, 1.7), (1, -0.6))),
    Vectors.sparse(3, Seq((1, 1.9)))
  )

  val scalingVector = Vectors.dense(2.0, 0.5, 0.0, 0.25)

  test("hadamard product should properly apply vector to dense data set") {
    val scaler = new HadamardProduct(scalingVector)
    val scaledData = scaler.transform(sc.makeRDD(denseData))
    val scaledVecs = scaledData.collect()

    // Verify the fourth sample (1.0, 4.0, 1.9, -9.0) component by component.
    val fourthVec = scaledVecs(3).toArray
    assert(fourthVec(0) === 2.0, "product by 2.0 should have been applied")
    assert(fourthVec(1) === 2.0, "product by 0.5 should have been applied")
    assert(fourthVec(2) === 0.0, "product by 0.0 should have been applied")
    assert(fourthVec(3) === -2.25, "product by 0.25 should have been applied")
  }

  test("hadamard product should properly apply vector to sparse data set") {
    val dataRDD = sc.parallelize(sparseData, 3)
    val scalingVec = Vectors.dense(1.0, 0.0, 0.5)
    val hadScaler = new HadamardProduct(scalingVec)

    // Transform both per-vector and through the RDD path; the two must agree.
    val data2 = sparseData.map(hadScaler.transform)
    val data2Collected = hadScaler.transform(dataRDD).collect()

    assert((sparseData, data2, data2Collected).zipped.forall {
      case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after hadamard product")

    assert((data2, data2Collected).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5))

    // Spot-check exact values: zeros from the scaling vector must zero entries out.
    assert(data2(0) ~== Vectors.sparse(3, Seq((0, -2.0), (1, 0.0))) absTol 1E-5)
    assert(data2(1) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5)
  }
}

0 commit comments

Comments
 (0)