@@ -117,15 +117,22 @@ Word2Vec is implemented in [Word2Vec](api/scala/index.html#org.apache.spark.ml.f
117
117
{% highlight scala %}
118
118
import org.apache.spark.ml.feature.Word2Vec
119
119
120
+ // Input data: Each row is a bag of words from a sentence or document.
120
121
val documentDF = sqlContext.createDataFrame(Seq(
121
122
"Hi I heard about Spark".split(" "),
122
123
"I wish Java could use case classes".split(" "),
123
124
"Logistic regression models are neat".split(" ")
124
- )).map(Tuple1.apply).toDF("text")
125
+ ).map(Tuple1.apply)).toDF("text")
125
126
126
- val word2Vec = new Word2Vec.setInputCol("text").setOutputCol("result").setVectorSize(3)
127
+ // Learn a mapping from words to Vectors.
128
+ val word2Vec = new Word2Vec()
129
+ .setInputCol("text")
130
+ .setOutputCol("result")
131
+ .setVectorSize(3)
132
+ .setMinCount(0)
127
133
val model = word2Vec.fit(documentDF)
128
- val result = model.transform(documentDF).select("result").take(3).foreach(println)
134
+ val result = model.transform(documentDF)
135
+ result.select("result").take(3).foreach(println)
129
136
{% endhighlight %}
130
137
</div>
131
138
@@ -143,24 +150,26 @@ import org.apache.spark.sql.types.*;
143
150
144
151
JavaSparkContext jsc = ...
145
152
SQLContext sqlContext = ...
153
+
154
+ // Input data: Each row is a bag of words from a sentence or document.
146
155
JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
147
156
RowFactory.create(Lists.newArrayList("Hi I heard about Spark".split(" "))),
148
157
RowFactory.create(Lists.newArrayList("I wish Java could use case classes".split(" "))),
149
158
RowFactory.create(Lists.newArrayList("Logistic regression models are neat".split(" ")))
150
159
));
151
160
StructType schema = new StructType(new StructField[] {
152
- new StructField("text", new ArrayType(StringType$.MODULE$, true), false, Metadata.empty())
161
+ new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
153
162
});
154
163
DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);
155
164
165
+ // Learn a mapping from words to Vectors.
156
166
Word2Vec word2Vec = new Word2Vec()
157
167
.setInputCol("text")
158
168
.setOutputCol("result")
159
169
.setVectorSize(3)
160
170
.setMinCount(0);
161
171
Word2VecModel model = word2Vec.fit(documentDF);
162
172
DataFrame result = model.transform(documentDF);
163
-
164
173
for (Row r: result.select("result").take(3)) {
165
174
System.out.println(r);
166
175
}
@@ -171,12 +180,14 @@ for (Row r: result.select("result").take(3)) {
171
180
{% highlight python %}
172
181
from pyspark.ml.feature import Word2Vec
173
182
183
+ # Input data: Each row is a bag of words from a sentence or document.
174
184
documentDF = sqlContext.createDataFrame([
175
185
("Hi I heard about Spark".split(" "), ),
176
186
("I wish Java could use case classes".split(" "), ),
177
187
("Logistic regression models are neat".split(" "), )
178
188
], ["text"])
179
- word2Vec = Word2Vec(vectorSize = 3, minCount = 0, inputCol = "text", outputCol = "result")
189
+ # Learn a mapping from words to Vectors.
190
+ word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
180
191
model = word2Vec.fit(documentDF)
181
192
result = model.transform(documentDF)
182
193
for feature in result.select("result").take(3):
0 commit comments