     "StringType", "BinaryType", "BooleanType", "DateType", "TimestampType", "DecimalType",
     "DoubleType", "FloatType", "ByteType", "IntegerType", "LongType",
     "ShortType", "ArrayType", "MapType", "StructField", "StructType",
-    "SQLContext", "HiveContext", "DataFrame", "GroupedDataFrame", "Column", "Row",
+    "SQLContext", "HiveContext", "DataFrame", "GroupedDataFrame", "Column", "Row", "Dsl",
     "SchemaRDD"]
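Note: with `Dsl` added to `__all__`, the helpers it defines become part of the module's public surface. A minimal usage sketch (assuming this file is importable as `pyspark.sql` and a DataFrame `df` with an `age` column already exists):

    from pyspark.sql import Dsl
    # Dsl.max is one of the functions generated from the DSLS table later in this patch
    df.select(Dsl.max(df.age)).collect()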
@@ -2121,6 +2121,8 @@ def sort(self, *cols):
 
         >>> df.sort(df.age.desc()).collect()
         [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')]
+        >>> df.sortBy(df.age.desc()).collect()
+        [Row(age=5, name=u'Bob'), Row(age=2, name=u'Alice')]
         """
         if not cols:
             raise ValueError("should sort by at least one column")
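Note: the new doctest shows `sortBy` producing the same result as `sort`; presumably it is defined as a plain alias of `sort` elsewhere in the file, along the lines of this hypothetical one-liner:

    sortBy = sort  # hypothetical alias; the doctest above implies identical behavior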
@@ -2427,34 +2429,34 @@ def _scalaMethod(name):
     return ''.join(SCALA_METHOD_MAPPINGS.get(c, c) for c in name)
 
 
-def _unary_op(name):
+def _unary_op(name, doc="unary operator"):
     """ Create a method for given unary operator """
     def _(self):
         jc = getattr(self._jc, _scalaMethod(name))()
         return Column(jc, self.sql_ctx)
+    _.__doc__ = doc
     return _
 
 
-def _bin_op(name):
+def _bin_op(name, doc="binary operator"):
     """ Create a method for given binary operator
-
-    Keyword arguments:
-    pass_literal_through -- whether to pass literal value directly through to the JVM.
     """
     def _(self, other):
         jc = other._jc if isinstance(other, Column) else other
         njc = getattr(self._jc, _scalaMethod(name))(jc)
         return Column(njc, self.sql_ctx)
+    _.__doc__ = doc
     return _
 
 
-def _reverse_op(name):
+def _reverse_op(name, doc="binary operator"):
     """ Create a method for binary operator (this object is on right side)
     """
     def _(self, other):
         jother = _create_column_from_literal(other)
         jc = getattr(jother, _scalaMethod(name))(self._jc)
         return Column(jc, self.sql_ctx)
+    _.__doc__ = doc
     return _
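Note: the point of threading `doc` through these factories is that every generated operator method now carries a real docstring, so `help()` and documentation tools see it. A self-contained sketch of the same closure pattern (all names here are illustrative, not from the patch):

    def _make_op(method, doc="binary operator"):
        def _(self, other):
            return getattr(self.value, method)(other.value)  # dispatch by attribute name
        _.__doc__ = doc  # attach the description to the generated method
        return _

    class Num(object):
        def __init__(self, value):
            self.value = value
        __add__ = _make_op("__add__", "Return the element-wise sum.")

    print(Num.__add__.__doc__)  # prints: Return the element-wise sum.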
@@ -2491,8 +2493,6 @@ def __init__(self, jc, sql_ctx=None):
     __rdiv__ = _reverse_op("/")
     __rmod__ = _reverse_op("%")
     __abs__ = _unary_op("abs")
-    abs = _unary_op("abs")
-    sqrt = _unary_op("sqrt")
 
     # logical operators
     __eq__ = _bin_op("===")
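Note: dropping the named `abs` and `sqrt` attributes removes duplication; `abs(col)` keeps working through `__abs__`, and the function-style forms move to the `Dsl` class below. A hedged sketch (assuming a numeric `age` column):

    abs(df.age - 3)   # still supported via Column.__abs__
    Dsl.sqrt(df.age)  # sqrt now lives on Dsl, generated from the DSLS table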
@@ -2501,36 +2501,25 @@ def __init__(self, jc, sql_ctx=None):
     __le__ = _bin_op("<=")
     __ge__ = _bin_op(">=")
     __gt__ = _bin_op(">")
-    # `and`, `or`, `not` cannot be overloaded in Python
-    And = _bin_op('&&')
-    Or = _bin_op('||')
-    Not = _unary_op('unary_!')
-
-    # bitwise operators
-    __and__ = _bin_op("&")
-    __or__ = _bin_op("|")
-    __invert__ = _unary_op("unary_~")
-    __xor__ = _bin_op("^")
-    # __lshift__ = _bin_op("<<")
-    # __rshift__ = _bin_op(">>")
-    __rand__ = _bin_op("&")
-    __ror__ = _bin_op("|")
-    __rxor__ = _bin_op("^")
-    # __rlshift__ = _reverse_op("<<")
-    # __rrshift__ = _reverse_op(">>")
+
+    # `and`, `or`, `not` cannot be overloaded in Python,
+    # so use bitwise operators as boolean operators
+    __and__ = _bin_op('&&')
+    __or__ = _bin_op('||')
+    __invert__ = _unary_op('unary_!')
+    __rand__ = _bin_op("&&")
+    __ror__ = _bin_op("||")
 
     # container operators
     __contains__ = _bin_op("contains")
     __getitem__ = _bin_op("getItem")
-    # __getattr__ = _bin_op("getField")
+    getField = _bin_op("getField", "An expression that gets a field by name in a StructField.")
 
     # string methods
     rlike = _bin_op("rlike")
     like = _bin_op("like")
     startswith = _bin_op("startsWith")
     endswith = _bin_op("endsWith")
-    upper = _unary_op("upper")
-    lower = _unary_op("lower")
 
     def substr(self, startPos, length):
         """
@@ -2558,12 +2547,20 @@ def substr(self, startPos, length):
     asc = _unary_op("asc")
     desc = _unary_op("desc")
 
-    isNull = _unary_op("isNull")
-    isNotNull = _unary_op("isNotNull")
+    isNull = _unary_op("isNull", "True if the current expression is null.")
+    isNotNull = _unary_op("isNotNull", "True if the current expression is not null.")
 
     # `as` is keyword
-    def As(self, alias):
+    def alias(self, alias):
+        """Return an alias for this column
+
+        >>> df.age.As("age2").collect()
+        [Row(age2=2), Row(age2=5)]
+        >>> df.age.alias("age2").collect()
+        [Row(age2=2), Row(age2=5)]
+        """
         return Column(getattr(self._jc, "as")(alias), self.sql_ctx)
+    As = alias
 
     def cast(self, dataType):
         """ Convert the column into type `dataType`
@@ -2580,27 +2577,44 @@ def cast(self, dataType):
         return Column(self._jc.cast(jdt), self.sql_ctx)
 
 
-def _aggregate_func(name):
+def _aggregate_func(name, doc=""):
     """ Create a function for aggregator by name"""
     def _(col):
         sc = SparkContext._active_spark_context
         jc = getattr(sc._jvm.Dsl, name)(_to_java_column(col))
         return Column(jc)
-
+    _.__name__ = name
+    _.__doc__ = doc
     return staticmethod(_)
 
 
 class Dsl(object):
     """
     A collection of builtin aggregators
     """
-    AGGS = [
-        'lit', 'col', 'column', 'upper', 'lower', 'sqrt', 'abs',
-        'min', 'max', 'first', 'last', 'count', 'avg', 'mean', 'sum', 'sumDistinct',
-    ]
-    for _name in AGGS:
-        locals()[_name] = _aggregate_func(_name)
-    del _name
+    DSLS = {
+        'lit': 'Creates a [[Column]] of literal value.',
+        'col': 'Returns a [[Column]] based on the given column name.',
+        'column': 'Returns a [[Column]] based on the given column name.',
+        'upper': 'Converts a string expression to upper case.',
+        'lower': 'Converts a string expression to lower case.',
+        'sqrt': 'Computes the square root of the specified float value.',
+        'abs': 'Computes the absolute value.',
+
+        'max': 'Aggregate function: returns the maximum value of the expression in a group.',
+        'min': 'Aggregate function: returns the minimum value of the expression in a group.',
+        'first': 'Aggregate function: returns the first value in a group.',
+        'last': 'Aggregate function: returns the last value in a group.',
+        'count': 'Aggregate function: returns the number of items in a group.',
+        'sum': 'Aggregate function: returns the sum of all values in the expression.',
+        'avg': 'Aggregate function: returns the average of the values in a group.',
+        'mean': 'Aggregate function: returns the average of the values in a group.',
+        'sumDistinct': 'Aggregate function: returns the sum of distinct values in the expression.',
+    }
+
+    for _name, _doc in DSLS.items():
+        locals()[_name] = _aggregate_func(_name, _doc)
+    del _name, _doc
 
     @staticmethod
     def countDistinct(col, *cols):
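Note: because the `for` loop runs inside the class body, `locals()` there is the class namespace, so each `DSLS` entry becomes a static method whose docstring is the table's description. A self-contained sketch of the same pattern (illustrative names, not from the patch):

    def _make_func(name, doc):
        def _(x):
            return '%s(%s)' % (name, x)  # stand-in for the JVM call
        _.__name__ = name
        _.__doc__ = doc
        return staticmethod(_)

    class Funcs(object):
        TABLE = {'upper': 'Upper-cases a string.'}
        for _name, _doc in TABLE.items():
            locals()[_name] = _make_func(_name, _doc)
        del _name, _doc

    print(Funcs.upper('a'))     # prints: upper(a)
    print(Funcs.upper.__doc__)  # prints: Upper-cases a string.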