|
19 | 19 | Python package for statistical functions in MLlib.
|
20 | 20 | """
|
21 | 21 |
|
| 22 | +from pyspark import RDD |
22 | 23 | from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
|
23 |
| -from pyspark.mllib.linalg import _convert_to_vector |
| 24 | +from pyspark.mllib.linalg import Matrix, _convert_to_vector |
24 | 25 |
|
25 | 26 |
|
26 |
| -__all__ = ['MultivariateStatisticalSummary', 'Statistics'] |
| 27 | +__all__ = ['MultivariateStatisticalSummary', 'ChiSqTestResult', 'Statistics'] |
27 | 28 |
|
28 | 29 |
|
29 | 30 | class MultivariateStatisticalSummary(JavaModelWrapper):
|
@@ -51,6 +52,54 @@ def min(self):
|
51 | 52 | return self.call("min").toArray()
|
52 | 53 |
|
53 | 54 |
|
class ChiSqTestResult(JavaModelWrapper):
    """
    :: Experimental ::

    Read-only view of the outcome of a chi-squared hypothesis test.
    Every accessor delegates to the wrapped Java result object.
    """

    @property
    def method(self):
        """
        Name of the test method that was applied.
        """
        java_result = self._java_model
        return java_result.method()

    @property
    def pValue(self):
        """
        Probability, under the assumption that the null hypothesis is
        true, of seeing a test statistic at least as extreme as the one
        actually observed.
        """
        java_result = self._java_model
        return java_result.pValue()

    @property
    def degreesOfFreedom(self):
        """
        Degree(s) of freedom of the hypothesis test.
        Return type should be Number(e.g. Int, Double) or tuples of Numbers.
        """
        java_result = self._java_model
        return java_result.degreesOfFreedom()

    @property
    def statistic(self):
        """
        Value of the test statistic.
        """
        java_result = self._java_model
        return java_result.statistic()

    @property
    def nullHypothesis(self):
        """
        Description of the null hypothesis of the test.
        """
        java_result = self._java_model
        return java_result.nullHypothesis()

    def __str__(self):
        # Reuse the string form produced by the wrapped Java object.
        java_result = self._java_model
        return java_result.toString()
| 101 | + |
| 102 | + |
54 | 103 | class Statistics(object):
|
55 | 104 |
|
56 | 105 | @staticmethod
|
@@ -135,6 +184,90 @@ def corr(x, y=None, method=None):
|
135 | 184 | else:
|
136 | 185 | return callMLlibFunc("corr", x.map(float), y.map(float), method)
|
137 | 186 |
|
| 187 | + @staticmethod |
| 188 | + def chiSqTest(observed, expected=None): |
| 189 | + """ |
| 190 | + :: Experimental :: |
| 191 | +
|
| 192 | + If `observed` is Vector, conduct Pearson's chi-squared goodness |
| 193 | + of fit test of the observed data against the expected distribution, |
| 194 | + or againt the uniform distribution (by default), with each category |
| 195 | + having an expected frequency of `1 / len(observed)`. |
| 196 | + (Note: `observed` cannot contain negative values) |
| 197 | +
|
| 198 | + If `observed` is matrix, conduct Pearson's independence test on the |
| 199 | + input contingency matrix, which cannot contain negative entries or |
| 200 | + columns or rows that sum up to 0. |
| 201 | +
|
| 202 | + If `observed` is an RDD of LabeledPoint, conduct Pearson's independence |
| 203 | + test for every feature against the label across the input RDD. |
| 204 | + For each feature, the (feature, label) pairs are converted into a |
| 205 | + contingency matrix for which the chi-squared statistic is computed. |
| 206 | + All label and feature values must be categorical. |
| 207 | +
|
| 208 | + :param observed: it could be a vector containing the observed categorical |
| 209 | + counts/relative frequencies, or the contingency matrix |
| 210 | + (containing either counts or relative frequencies), |
| 211 | + or an RDD of LabeledPoint containing the labeled dataset |
| 212 | + with categorical features. Real-valued features will be |
| 213 | + treated as categorical for each distinct value. |
| 214 | + :param expected: Vector containing the expected categorical counts/relative |
| 215 | + frequencies. `expected` is rescaled if the `expected` sum |
| 216 | + differs from the `observed` sum. |
| 217 | + :return: ChiSquaredTest object containing the test statistic, degrees |
| 218 | + of freedom, p-value, the method used, and the null hypothesis. |
| 219 | +
|
| 220 | + >>> from pyspark.mllib.linalg import Vectors, Matrices |
| 221 | + >>> observed = Vectors.dense([4, 6, 5]) |
| 222 | + >>> pearson = Statistics.chiSqTest(observed) |
| 223 | + >>> pearson.statistic |
| 224 | + 0.4 |
| 225 | + >>> pearson.degreesOfFreedom |
| 226 | + 2 |
| 227 | + >>> round(pearson.pValue, 4) |
| 228 | + 0.8187 |
| 229 | + >>> pearson.method |
| 230 | + u'pearson' |
| 231 | + >>> pearson.nullHypothesis |
| 232 | + u'observed follows the same distribution as expected.' |
| 233 | +
|
| 234 | + >>> observed = Vectors.dense([21, 38, 43, 80]) |
| 235 | + >>> expected = Vectors.dense([3, 5, 7, 20]) |
| 236 | + >>> pearson = Statistics.chiSqTest(observed, expected) |
| 237 | + >>> round(pearson.pValue, 4) |
| 238 | + 0.0027 |
| 239 | +
|
| 240 | + >>> data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0] |
| 241 | + >>> chi = Statistics.chiSqTest(Matrices.dense(3, 4, data)) |
| 242 | + >>> round(chi.statistic) |
| 243 | + 22.0 |
| 244 | +
|
| 245 | + >>> from pyspark.mllib.regression import LabeledPoint |
| 246 | + >>> data = [LabeledPoint(0.0, Vectors.dense([0.5, 10.0])), |
| 247 | + ... LabeledPoint(0.0, Vectors.dense([1.5, 20.0])), |
| 248 | + ... LabeledPoint(1.0, Vectors.dense([1.5, 30.0])), |
| 249 | + ... LabeledPoint(0.0, Vectors.dense([3.5, 30.0])), |
| 250 | + ... LabeledPoint(0.0, Vectors.dense([3.5, 40.0])), |
| 251 | + ... LabeledPoint(1.0, Vectors.dense([3.5, 40.0])),] |
| 252 | + >>> rdd = sc.parallelize(data, 4) |
| 253 | + >>> chi = Statistics.chiSqTest(rdd) |
| 254 | + >>> chi[0].statistic |
| 255 | + 0.75 |
| 256 | + >>> chi[1].statistic |
| 257 | + 1.5 |
| 258 | + """ |
| 259 | + if isinstance(observed, RDD): |
| 260 | + jmodels = callMLlibFunc("chiSqTest", observed) |
| 261 | + return [ChiSqTestResult(m) for m in jmodels] |
| 262 | + |
| 263 | + if isinstance(observed, Matrix): |
| 264 | + jmodel = callMLlibFunc("chiSqTest", observed) |
| 265 | + else: |
| 266 | + if expected and len(expected) != len(observed): |
| 267 | + raise ValueError("`expected` should have same length with `observed`") |
| 268 | + jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected) |
| 269 | + return ChiSqTestResult(jmodel) |
| 270 | + |
138 | 271 |
|
139 | 272 | def _test():
|
140 | 273 | import doctest
|
|
0 commit comments