Skip to content

Commit 3bb2177

Browse files
kashifmengxr
authored and committed
[SPARK-8872] [MLLIB] added verification results from R for FPGrowthSuite
Author: Kashif Rasul <[email protected]> Closes #7269 from kashif/SPARK-8872 and squashes the following commits: 2d5457f [Kashif Rasul] added R code for FP Int type 3de6808 [Kashif Rasul] added verification results from R for FPGrowthSuite
1 parent 8a9d9cc commit 3bb2177

File tree

1 file changed

+114
-0
lines changed

1 file changed

+114
-0
lines changed

mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala

Lines changed: 114 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,22 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
3939
.setMinSupport(0.9)
4040
.setNumPartitions(1)
4141
.run(rdd)
42+
43+
/* Verify results using the `R` code:
44+
transactions = as(sapply(
45+
list("r z h k p",
46+
"z y x w v u t s",
47+
"s x o n r",
48+
"x z y m t s q e",
49+
"z",
50+
"x z y r q t p"),
51+
FUN=function(x) strsplit(x," ",fixed=TRUE)),
52+
"transactions")
53+
> eclat(transactions, parameter = list(support = 0.9))
54+
...
55+
eclat - zero frequent items
56+
set of 0 itemsets
57+
*/
4258
assert(model6.freqItemsets.count() === 0)
4359

4460
val model3 = fpg
@@ -48,6 +64,33 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
4864
val freqItemsets3 = model3.freqItemsets.collect().map { itemset =>
4965
(itemset.items.toSet, itemset.freq)
5066
}
67+
68+
/* Verify results using the `R` code:
69+
fp = eclat(transactions, parameter = list(support = 0.5))
70+
fpDF = as(sort(fp), "data.frame")
71+
fpDF$support = fpDF$support * length(transactions)
72+
names(fpDF)[names(fpDF) == "support"] = "freq"
73+
> fpDF
74+
items freq
75+
13 {z} 5
76+
14 {x} 4
77+
1 {s,x} 3
78+
2 {t,x,y,z} 3
79+
3 {t,y,z} 3
80+
4 {t,x,y} 3
81+
5 {x,y,z} 3
82+
6 {y,z} 3
83+
7 {x,y} 3
84+
8 {t,y} 3
85+
9 {t,x,z} 3
86+
10 {t,z} 3
87+
11 {t,x} 3
88+
12 {x,z} 3
89+
15 {t} 3
90+
16 {y} 3
91+
17 {s} 3
92+
18 {r} 3
93+
*/
5194
val expected = Set(
5295
(Set("s"), 3L), (Set("z"), 5L), (Set("x"), 4L), (Set("t"), 3L), (Set("y"), 3L),
5396
(Set("r"), 3L),
@@ -62,12 +105,30 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
62105
.setMinSupport(0.3)
63106
.setNumPartitions(4)
64107
.run(rdd)
108+
109+
/* Verify results using the `R` code:
110+
fp = eclat(transactions, parameter = list(support = 0.3))
111+
fpDF = as(fp, "data.frame")
112+
fpDF$support = fpDF$support * length(transactions)
113+
names(fpDF)[names(fpDF) == "support"] = "freq"
114+
> nrow(fpDF)
115+
[1] 54
116+
*/
65117
assert(model2.freqItemsets.count() === 54)
66118

67119
val model1 = fpg
68120
.setMinSupport(0.1)
69121
.setNumPartitions(8)
70122
.run(rdd)
123+
124+
/* Verify results using the `R` code:
125+
fp = eclat(transactions, parameter = list(support = 0.1))
126+
fpDF = as(fp, "data.frame")
127+
fpDF$support = fpDF$support * length(transactions)
128+
names(fpDF)[names(fpDF) == "support"] = "freq"
129+
> nrow(fpDF)
130+
[1] 625
131+
*/
71132
assert(model1.freqItemsets.count() === 625)
72133
}
73134

@@ -89,6 +150,23 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
89150
.setMinSupport(0.9)
90151
.setNumPartitions(1)
91152
.run(rdd)
153+
154+
/* Verify results using the `R` code:
155+
transactions = as(sapply(
156+
list("1 2 3",
157+
"1 2 3 4",
158+
"5 4 3 2 1",
159+
"6 5 4 3 2 1",
160+
"2 4",
161+
"1 3",
162+
"1 7"),
163+
FUN=function(x) strsplit(x," ",fixed=TRUE)),
164+
"transactions")
165+
> eclat(transactions, parameter = list(support = 0.9))
166+
...
167+
eclat - zero frequent items
168+
set of 0 itemsets
169+
*/
92170
assert(model6.freqItemsets.count() === 0)
93171

94172
val model3 = fpg
@@ -100,6 +178,24 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
100178
val freqItemsets3 = model3.freqItemsets.collect().map { itemset =>
101179
(itemset.items.toSet, itemset.freq)
102180
}
181+
182+
/* Verify results using the `R` code:
183+
fp = eclat(transactions, parameter = list(support = 0.5))
184+
fpDF = as(sort(fp), "data.frame")
185+
fpDF$support = fpDF$support * length(transactions)
186+
names(fpDF)[names(fpDF) == "support"] = "freq"
187+
> fpDF
188+
items freq
189+
6 {1} 6
190+
3 {1,3} 5
191+
7 {2} 5
192+
8 {3} 5
193+
1 {2,4} 4
194+
2 {1,2,3} 4
195+
4 {2,3} 4
196+
5 {1,2} 4
197+
9 {4} 4
198+
*/
103199
val expected = Set(
104200
(Set(1), 6L), (Set(2), 5L), (Set(3), 5L), (Set(4), 4L),
105201
(Set(1, 2), 4L), (Set(1, 3), 5L), (Set(2, 3), 4L),
@@ -110,12 +206,30 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext {
110206
.setMinSupport(0.3)
111207
.setNumPartitions(4)
112208
.run(rdd)
209+
210+
/* Verify results using the `R` code:
211+
fp = eclat(transactions, parameter = list(support = 0.3))
212+
fpDF = as(fp, "data.frame")
213+
fpDF$support = fpDF$support * length(transactions)
214+
names(fpDF)[names(fpDF) == "support"] = "freq"
215+
> nrow(fpDF)
216+
[1] 15
217+
*/
113218
assert(model2.freqItemsets.count() === 15)
114219

115220
val model1 = fpg
116221
.setMinSupport(0.1)
117222
.setNumPartitions(8)
118223
.run(rdd)
224+
225+
/* Verify results using the `R` code:
226+
fp = eclat(transactions, parameter = list(support = 0.1))
227+
fpDF = as(fp, "data.frame")
228+
fpDF$support = fpDF$support * length(transactions)
229+
names(fpDF)[names(fpDF) == "support"] = "freq"
230+
> nrow(fpDF)
231+
[1] 65
232+
*/
119233
assert(model1.freqItemsets.count() === 65)
120234
}
121235
}

0 commit comments

Comments (0)