Skip to content

Commit 5097d54

Browse files
author
Davies Liu
committed
add Hypothesis test Python API
1 parent b671ce0 commit 5097d54

File tree

5 files changed

+221
-4
lines changed

5 files changed

+221
-4
lines changed

docs/mllib-statistics.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,46 @@ for (ChiSqTestResult result : featureTestResults) {
380380
{% endhighlight %}
381381
</div>
382382

383+
<div data-lang="python" markdown="1">
384+
[`Statistics`](api/python/index.html#pyspark.mllib.stat.Statistics) provides methods to
385+
run Pearson's chi-squared tests. The following example demonstrates how to run and interpret
386+
hypothesis tests.
387+
388+
{% highlight python %}
389+
from pyspark import SparkContext
390+
from pyspark.mllib.linalg import Vectors, Matrices
391+
from pyspark.mllib.regression import LabeledPoint
392+
from pyspark.mllib.stat import Statistics
393+
394+
sc = SparkContext()
395+
396+
vec = Vectors.dense(...) # a vector composed of the frequencies of events
397+
398+
# compute the goodness of fit. If a second vector to test against is not supplied as a parameter,
399+
# the test runs against a uniform distribution.
400+
goodnessOfFitTestResult = Statistics.chiSqTest(vec)
401+
print goodnessOfFitTestResult # summary of the test including the p-value, degrees of freedom,
402+
# test statistic, the method used, and the null hypothesis.
403+
404+
mat = Matrices.dense(...) # a contingency matrix
405+
406+
# conduct Pearson's independence test on the input contingency matrix
407+
independenceTestResult = Statistics.chiSqTest(mat)
408+
print independenceTestResult # summary of the test including the p-value, degrees of freedom...
409+
410+
obs = sc.parallelize(...) # LabeledPoint(label, feature).
411+
412+
# The contingency table is constructed from an RDD of LabeledPoint and used to conduct
413+
# the independence test. Returns an array containing the ChiSqTestResult for every feature
414+
# against the label.
415+
featureTestResults = Statistics.chiSqTest(obs)
416+
417+
for i, result in enumerate(featureTestResults):
418+
print "Column %d:" % (i + 1)
419+
print result
420+
{% endhighlight %}
421+
</div>
422+
383423
</div>
384424

385425
## Random data generation

mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ import org.apache.spark.mllib.tree.impurity._
4343
import org.apache.spark.mllib.tree.model.DecisionTreeModel
4444
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
4545
import org.apache.spark.mllib.stat.correlation.CorrelationNames
46+
import org.apache.spark.mllib.stat.test.ChiSqTestResult
4647
import org.apache.spark.mllib.util.MLUtils
4748
import org.apache.spark.rdd.RDD
4849
import org.apache.spark.storage.StorageLevel
@@ -454,6 +455,33 @@ class PythonMLLibAPI extends Serializable {
454455
Statistics.corr(x.rdd, y.rdd, getCorrNameOrDefault(method))
455456
}
456457

458+
/**
459+
* Java stub for mllib Statistics.chiSqTest()
460+
*/
461+
def chiSqTest(observed: Vector, expected: Vector): ChiSqTestResult = {
462+
if (expected == null) {
463+
Statistics.chiSqTest(observed)
464+
} else {
465+
Statistics.chiSqTest(observed, expected)
466+
}
467+
}
468+
469+
/**
470+
* Java stub for mllib Statistics.chiSqTest(observed: Matrix)
471+
* @param observed contingency matrix of observed counts
472+
* @return result of the independence test as a ChiSqTestResult
473+
*/
474+
def chiSqTest(observed: Matrix): ChiSqTestResult = {
475+
Statistics.chiSqTest(observed)
476+
}
477+
478+
/**
479+
* Java stub for mllib Statistics.chiSqTest(RDD[LabeledPoint])
480+
*/
481+
def chiSqTest(data: JavaRDD[LabeledPoint]): Array[ChiSqTestResult] = {
482+
Statistics.chiSqTest(data.rdd)
483+
}
484+
457485
// used by the corr methods to retrieve the name of the correlation method passed in via pyspark
458486
private def getCorrNameOrDefault(method: String) = {
459487
if (method == null) CorrelationNames.defaultCorrName else method

python/pyspark/mllib/common.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,13 @@ def _java2py(sc, r):
9898
jrdd = sc._jvm.SerDe.javaToPython(r)
9999
return RDD(jrdd, sc, AutoBatchedSerializer(PickleSerializer()))
100100

101-
elif isinstance(r, (JavaArray, JavaList)) or clsName in _picklable_classes:
101+
if clsName in _picklable_classes:
102102
r = sc._jvm.SerDe.dumps(r)
103+
elif isinstance(r, (JavaArray, JavaList)):
104+
try:
105+
r = sc._jvm.SerDe.dumps(r)
106+
except Py4JJavaError:
107+
pass # not picklable
103108

104109
if isinstance(r, bytearray):
105110
r = PickleSerializer().loads(str(r))

python/pyspark/mllib/linalg.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
IntegerType, ByteType, Row
3434

3535

36-
__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors']
36+
__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', 'DenseMatrix', 'Matrices']
3737

3838

3939
if sys.version_info[:2] == (2, 7):
@@ -578,6 +578,8 @@ class DenseMatrix(Matrix):
578578
def __init__(self, numRows, numCols, values):
579579
Matrix.__init__(self, numRows, numCols)
580580
assert len(values) == numRows * numCols
581+
if not isinstance(values, array.array):
582+
values = array.array('d', values)
581583
self.values = values
582584

583585
def __reduce__(self):
@@ -596,6 +598,15 @@ def toArray(self):
596598
return np.reshape(self.values, (self.numRows, self.numCols), order='F')
597599

598600

601+
class Matrices(object):
602+
@staticmethod
603+
def dense(numRows, numCols, values):
604+
"""
605+
Create a DenseMatrix
606+
"""
607+
return DenseMatrix(numRows, numCols, values)
608+
609+
599610
def _test():
600611
import doctest
601612
(failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS)

python/pyspark/mllib/stat.py

Lines changed: 135 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,12 @@
1919
Python package for statistical functions in MLlib.
2020
"""
2121

22+
from pyspark import RDD
2223
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
23-
from pyspark.mllib.linalg import _convert_to_vector
24+
from pyspark.mllib.linalg import Matrix, _convert_to_vector
2425

2526

26-
__all__ = ['MultivariateStatisticalSummary', 'Statistics']
27+
__all__ = ['MultivariateStatisticalSummary', 'ChiSqTestResult', 'Statistics']
2728

2829

2930
class MultivariateStatisticalSummary(JavaModelWrapper):
@@ -51,6 +52,54 @@ def min(self):
5152
return self.call("min").toArray()
5253

5354

55+
class ChiSqTestResult(JavaModelWrapper):
56+
"""
57+
:: Experimental ::
58+
59+
Object containing the test results for the chi-squared hypothesis test.
60+
"""
61+
@property
62+
def method(self):
63+
"""
64+
Name of the test method
65+
"""
66+
return self._java_model.method()
67+
68+
@property
69+
def pValue(self):
70+
"""
71+
The probability of obtaining a test statistic result at least as
72+
extreme as the one that was actually observed, assuming that the
73+
null hypothesis is true.
74+
"""
75+
return self._java_model.pValue()
76+
77+
@property
78+
def degreesOfFreedom(self):
79+
"""
80+
Returns the degree(s) of freedom of the hypothesis test.
81+
Return type should be Number(e.g. Int, Double) or tuples of Numbers.
82+
"""
83+
return self._java_model.degreesOfFreedom()
84+
85+
@property
86+
def statistic(self):
87+
"""
88+
Test statistic.
89+
"""
90+
return self._java_model.statistic()
91+
92+
@property
93+
def nullHypothesis(self):
94+
"""
95+
Null hypothesis of the test.
96+
"""
97+
return self._java_model.nullHypothesis()
98+
99+
def __str__(self):
100+
return self._java_model.toString()
101+
102+
54103
class Statistics(object):
55104

56105
@staticmethod
@@ -135,6 +184,90 @@ def corr(x, y=None, method=None):
135184
else:
136185
return callMLlibFunc("corr", x.map(float), y.map(float), method)
137186

187+
@staticmethod
188+
def chiSqTest(observed, expected=None):
189+
"""
190+
:: Experimental ::
191+
192+
If `observed` is Vector, conduct Pearson's chi-squared goodness
193+
of fit test of the observed data against the expected distribution,
194+
or against the uniform distribution (by default), with each category
195+
having an expected frequency of `1 / len(observed)`.
196+
(Note: `observed` cannot contain negative values)
197+
198+
If `observed` is matrix, conduct Pearson's independence test on the
199+
input contingency matrix, which cannot contain negative entries or
200+
columns or rows that sum up to 0.
201+
202+
If `observed` is an RDD of LabeledPoint, conduct Pearson's independence
203+
test for every feature against the label across the input RDD.
204+
For each feature, the (feature, label) pairs are converted into a
205+
contingency matrix for which the chi-squared statistic is computed.
206+
All label and feature values must be categorical.
207+
208+
:param observed: it could be a vector containing the observed categorical
209+
counts/relative frequencies, or the contingency matrix
210+
(containing either counts or relative frequencies),
211+
or an RDD of LabeledPoint containing the labeled dataset
212+
with categorical features. Real-valued features will be
213+
treated as categorical for each distinct value.
214+
:param expected: Vector containing the expected categorical counts/relative
215+
frequencies. `expected` is rescaled if the `expected` sum
216+
differs from the `observed` sum.
217+
:return: ChiSqTestResult object containing the test statistic, degrees
218+
of freedom, p-value, the method used, and the null hypothesis.
219+
220+
>>> from pyspark.mllib.linalg import Vectors, Matrices
221+
>>> observed = Vectors.dense([4, 6, 5])
222+
>>> pearson = Statistics.chiSqTest(observed)
223+
>>> pearson.statistic
224+
0.4
225+
>>> pearson.degreesOfFreedom
226+
2
227+
>>> round(pearson.pValue, 4)
228+
0.8187
229+
>>> pearson.method
230+
u'pearson'
231+
>>> pearson.nullHypothesis
232+
u'observed follows the same distribution as expected.'
233+
234+
>>> observed = Vectors.dense([21, 38, 43, 80])
235+
>>> expected = Vectors.dense([3, 5, 7, 20])
236+
>>> pearson = Statistics.chiSqTest(observed, expected)
237+
>>> round(pearson.pValue, 4)
238+
0.0027
239+
240+
>>> data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
241+
>>> chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))
242+
>>> round(chi.statistic)
243+
22.0
244+
245+
>>> from pyspark.mllib.regression import LabeledPoint
246+
>>> data = [LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
247+
... LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
248+
... LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
249+
... LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
250+
... LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
251+
... LabeledPoint(1.0, Vectors.dense([3.5, 40.0])),]
252+
>>> rdd = sc.parallelize(data, 4)
253+
>>> chi = Statistics.chiSqTest(rdd)
254+
>>> chi[0].statistic
255+
0.75
256+
>>> chi[1].statistic
257+
1.5
258+
"""
259+
if isinstance(observed, RDD):
260+
jmodels = callMLlibFunc("chiSqTest", observed)
261+
return [ChiSqTestResult(m) for m in jmodels]
262+
263+
if isinstance(observed, Matrix):
264+
jmodel = callMLlibFunc("chiSqTest", observed)
265+
else:
266+
if expected and len(expected) != len(observed):
267+
raise ValueError("`expected` should have same length with `observed`")
268+
jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected)
269+
return ChiSqTestResult(jmodel)
270+
138271

139272
def _test():
140273
import doctest

0 commit comments

Comments
 (0)