Skip to content

Commit 2e0f357

Browse files
brkyvzrxin
authored and committed
[SPARK-7242] added python api for freqItems in DataFrames
Adds the Python API for DataFrame's freqItems, and addresses your comments from the previous PR. rxin Author: Burak Yavuz <[email protected]> Closes #5859 from brkyvz/df-freq-py2 and squashes the following commits: f9aa9ce [Burak Yavuz] addressed comments v0.1 4b25056 [Burak Yavuz] added python api for freqItems
1 parent b79aeb9 commit 2e0f357

File tree

3 files changed

+38
-3
lines changed

3 files changed

+38
-3
lines changed

python/pyspark/sql/dataframe.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -889,6 +889,26 @@ def cov(self, col1, col2):
889889
raise ValueError("col2 should be a string.")
890890
return self._jdf.stat().cov(col1, col2)
891891

892+
def freqItems(self, cols, support=None):
    """
    Find frequent items for columns, possibly with false positives, using the
    frequent element count algorithm proposed by Karp, Schenker, and Papadimitriou
    (http://dx.doi.org/10.1145/762471.762473).
    :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.

    :param cols: Names of the columns to calculate frequent items for as a list or tuple of
        strings.
    :param support: The frequency with which to consider an item 'frequent'. Default is 1%.
        The support must be greater than 1e-4.
    :return: A :class:`DataFrame` wrapping the result of the underlying ``freqItems`` call.
    :raises ValueError: If ``cols`` is neither a list nor a tuple.
    """
    # Tuples are accepted for convenience but normalized to a list before conversion.
    if isinstance(cols, tuple):
        cols = list(cols)
    if not isinstance(cols, list):
        raise ValueError("cols must be a list or tuple of column names as strings.")
    # Substitute the default only when the caller omitted `support`. A truthiness
    # check would silently turn an explicit 0 / 0.0 into 1% instead of passing the
    # caller's value through unchanged.
    if support is None:
        support = 0.01
    return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx)
911+
892912
@ignore_unicode_prefix
893913
def withColumn(self, colName, col):
894914
"""Returns a new :class:`DataFrame` by adding a column.
@@ -1344,6 +1364,11 @@ def cov(self, col1, col2):
13441364

13451365
cov.__doc__ = DataFrame.cov.__doc__
13461366

1367+
def freqItems(self, cols, support=None):
    # Thin alias: hand both arguments, untouched, to the wrapped DataFrame.
    wrapped = self.df
    return wrapped.freqItems(cols, support)


# Share the DataFrame method's docstring so both entry points are documented identically.
freqItems.__doc__ = DataFrame.freqItems.__doc__
1371+
13471372

13481373
def _test():
13491374
import doctest

python/pyspark/sql/tests.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,13 @@ def test_column_select(self):
375375
self.assertEqual(self.testData, df.select(df.key, df.value).collect())
376376
self.assertEqual([Row(value='1')], df.where(df.key == 1).select(df.value).collect())
377377

378+
def test_freqItems(self):
    """Check that freqItems reports the dominant value of each column.

    Every even-indexed row is ``Row(a=1, b=-2.0)``, so each of those values
    occupies at least half of the 100 rows and is expected to be reported at a
    support threshold of 0.4.
    """
    vals = [Row(a=1, b=-2.0) if i % 2 == 0 else Row(a=i, b=i * 1.0) for i in range(100)]
    df = self.sc.parallelize(vals).toDF()
    # Only the first collected row is inspected; its positions 0 and 1 are read
    # as the frequent items for columns "a" and "b" respectively.
    items = df.stat.freqItems(("a", "b"), 0.4).collect()[0]
    # assertIn reports both the missing value and the container on failure,
    # unlike assertTrue(x in y), which only reports "False is not true".
    self.assertIn(1, items[0])
    self.assertIn(-2.0, items[1])
384+
378385
def test_aggregator(self):
379386
df = self.df
380387
g = df.groupBy()

sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,10 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
4343
}
4444

4545
/**
46-
* Runs `freqItems` with a default `support` of 1%.
46+
* Finding frequent items for columns, possibly with false positives. Using the
47+
* frequent element count algorithm described in
48+
* [[http://dx.doi.org/10.1145/762471.762473 proposed by Karp, Schenker, and Papadimitriou]].
49+
* Uses a default `support` of 1%.
4750
*
4851
* @param cols the names of the columns to search frequent items in.
4952
* @return A Local DataFrame with the Array of frequent items for each column.
@@ -55,14 +58,14 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
5558
/**
5659
* Python friendly implementation for `freqItems`
5760
*/
58-
def freqItems(cols: List[String], support: Double): DataFrame = {
61+
def freqItems(cols: Seq[String], support: Double): DataFrame = {
5962
FrequentItems.singlePassFreqItems(df, cols, support)
6063
}
6164

6265
/**
6366
* Python friendly implementation for `freqItems` with a default `support` of 1%.
6467
*/
65-
def freqItems(cols: List[String]): DataFrame = {
68+
def freqItems(cols: Seq[String]): DataFrame = {
6669
FrequentItems.singlePassFreqItems(df, cols, 0.01)
6770
}
6871

0 commit comments

Comments
 (0)