|
18 | 18 | import numpy as np
|
19 | 19 | from numpy import array
|
20 | 20 |
|
| 21 | +from pyspark import RDD |
21 | 22 | from pyspark.mllib.common import callMLlibFunc, _py2java, _java2py, inherit_doc
|
22 |
| -from pyspark.mllib.linalg import SparseVector, _convert_to_vector |
| 23 | +from pyspark.mllib.linalg import SparseVector, Vectors, _convert_to_vector |
23 | 24 | from pyspark.mllib.util import Saveable, Loader
|
24 | 25 |
|
25 | 26 | __all__ = ['LabeledPoint', 'LinearModel',
|
@@ -396,6 +397,78 @@ def train(rdd, i):
|
396 | 397 | return _regression_train_wrapper(train, RidgeRegressionModel, data, initialWeights)
|
397 | 398 |
|
398 | 399 |
|
class IsotonicRegressionModel(Saveable, Loader):

    """Regression model for isotonic regression.

    :param boundaries: Sequence of boundary values used as the x-coordinates
                       for interpolation; stored as a float numpy array.
                       (presumably sorted in increasing order, as np.interp
                       requires -- confirm against the trainer)
    :param predictions: Sequence of predictions associated with the
                        boundaries at the same index; stored as a float
                        numpy array.
    :param isotonic: Whether this is isotonic or antitonic; stored as a bool.

    >>> data = [(2, 1, 1), (1, 1, 1), (4, 2, 1), (2, 2, 1), (6, 3, 1), (5, 3, 1)]
    >>> irm = IsotonicRegression.train(sc.parallelize(data))
    >>> irm.predict(1.5)
    2.0
    >>> irm.predict(2.5)
    4.5
    >>> irm.predict(4)
    6.0
    >>> irm.predict(sc.parallelize([1.5, 2.5, 4])).collect()
    [2.0, 4.5, 6.0]
    >>> import tempfile
    >>> path = tempfile.mkdtemp()
    >>> irm.save(sc, path)
    >>> sameModel = IsotonicRegressionModel.load(sc, path)
    >>> sameModel.predict(1.5)
    2.0
    >>> sameModel.predict(2.5)
    4.5
    >>> sameModel.predict(4)
    6.0
    >>> from shutil import rmtree
    >>> try:
    ...     rmtree(path)
    ... except OSError:
    ...     pass
    """

    def __init__(self, boundaries, predictions, isotonic):
        # Coerce up front so predict() and save() work no matter whether the
        # caller handed in lists, tuples or numpy arrays.
        self.boundaries = np.asarray(boundaries, dtype=float)
        self.predictions = np.asarray(predictions, dtype=float)
        self.isotonic = bool(isotonic)

    def predict(self, x):
        """
        Predict the label for a single feature value or an RDD of them.

        :param x: A feature value, or an RDD of feature values.
        :return: The value interpolated with np.interp over
                 (boundaries, predictions); an RDD of such values when
                 `x` is an RDD.
        """
        if isinstance(x, RDD):
            # Apply the scalar prediction to every element of the RDD.
            return x.map(lambda v: self.predict(v))
        return np.interp(x, self.boundaries, self.predictions)

    def save(self, sc, path):
        """
        Save this model to the given path through the JVM model class.

        :param sc: SparkContext used to reach the JVM.
        :param path: Destination path for the saved model.
        """
        java_boundaries = _py2java(sc, self.boundaries.tolist())
        java_predictions = _py2java(sc, self.predictions.tolist())
        java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel(
            java_boundaries, java_predictions, self.isotonic)
        java_model.save(sc._jsc.sc(), path)

    @classmethod
    def load(cls, sc, path):
        """
        Load a model previously written with :meth:`save`.

        :param sc: SparkContext used to reach the JVM.
        :param path: Path the model was saved to.
        :return: A new model instance built from the stored state.
        """
        java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel.load(
            sc._jsc.sc(), path)
        py_boundaries = _java2py(sc, java_model.boundaries())
        py_predictions = _java2py(sc, java_model.predictions())
        # `isotonic` is exposed on the JVM object as a method; it must be
        # called, otherwise a (always truthy) Py4J member object is stored
        # instead of the actual boolean flag.
        return cls(np.array(py_boundaries), np.array(py_predictions),
                   java_model.isotonic())
| 455 | + |
| 456 | + |
class IsotonicRegression(object):
    """
    Run IsotonicRegression algorithm to obtain isotonic regression model.
    """

    @classmethod
    def train(cls, data, isotonic=True):
        """
        Train an isotonic regression model on the given data.

        :param data: RDD of data points; each element is converted with
                     _convert_to_vector before training. (presumably
                     (label, feature, weight) tuples, as in the model's
                     doctest -- confirm against the JVM trainer)
        :param isotonic: Whether this is isotonic or antitonic.
                         (default: True)
        :return: An IsotonicRegressionModel built from the boundaries and
                 predictions returned by the JVM trainer.
        """
        # Normalise once so the JVM call and the returned model agree on a
        # real bool; a truthy non-bool would otherwise be stored on the model.
        isotonic = bool(isotonic)
        boundaries, predictions = callMLlibFunc(
            "trainIsotonicRegressionModel", data.map(_convert_to_vector), isotonic)
        return IsotonicRegressionModel(np.array(boundaries), np.array(predictions), isotonic)
| 470 | + |
| 471 | + |
399 | 472 | def _test():
|
400 | 473 | import doctest
|
401 | 474 | from pyspark import SparkContext
|
|
0 commit comments