
Commit 330e984

Author: Marcelo Vanzin
Merge branch 'master' into SPARK-2750
2 parents: 123d958 + 63eee86

File tree: 362 files changed, +8851 -7450 lines


.gitignore

Lines changed: 0 additions & 1 deletion
@@ -60,7 +60,6 @@ dev/create-release/*final
 spark-*-bin-*.tgz
 unit-tests.log
 /lib/
-ec2/lib/
 rat-results.txt
 scalastyle.txt
 scalastyle-output.xml

LICENSE

Lines changed: 1 addition & 1 deletion
@@ -264,7 +264,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
 (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
 (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
-(The New BSD License) Py4J (net.sf.py4j:py4j:0.9 - http://py4j.sourceforge.net/)
+(The New BSD License) Py4J (net.sf.py4j:py4j:0.9.1 - http://py4j.sourceforge.net/)
 (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
 (BSD licence) sbt and sbt-launch-lib.bash
 (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)

R/pkg/NAMESPACE

Lines changed: 4 additions & 1 deletion
@@ -94,7 +94,8 @@ exportMethods("arrange",
               "withColumnRenamed",
               "write.df",
               "write.json",
-              "write.parquet")
+              "write.parquet",
+              "write.text")

 exportClasses("Column")

@@ -129,6 +130,7 @@ exportMethods("%in%",
               "count",
               "countDistinct",
               "crc32",
+              "hash",
               "cume_dist",
               "date_add",
               "date_format",
@@ -274,6 +276,7 @@ export("as.DataFrame",
        "parquetFile",
        "read.df",
        "read.parquet",
+       "read.text",
        "sql",
        "table",
        "tableNames",

R/pkg/R/DataFrame.R

Lines changed: 28 additions & 0 deletions
@@ -664,6 +664,34 @@ setMethod("saveAsParquetFile",
             write.parquet(x, path)
           })

+#' write.text
+#'
+#' Saves the content of the DataFrame in a text file at the specified path.
+#' The DataFrame must have only one column of string type with the name "value".
+#' Each row becomes a new line in the output file.
+#'
+#' @param x A SparkSQL DataFrame
+#' @param path The directory where the file is saved
+#'
+#' @family DataFrame functions
+#' @rdname write.text
+#' @name write.text
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlContext <- sparkRSQL.init(sc)
+#' path <- "path/to/file.txt"
+#' df <- read.text(sqlContext, path)
+#' write.text(df, "/tmp/sparkr-tmp/")
+#'}
+setMethod("write.text",
+          signature(x = "DataFrame", path = "character"),
+          function(x, path) {
+            write <- callJMethod(x@sdf, "write")
+            invisible(callJMethod(write, "text", path))
+          })
+
 #' Distinct
 #'
 #' Return a new DataFrame containing the distinct rows in this DataFrame.
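
For context, a minimal usage sketch of the new writer, assuming a SparkR session created with sparkR.init()/sparkRSQL.init() (as in the roxygen example above); the output directory and sample data are illustrative. The single string column named "value" is the documented requirement of write.text.

# Hedged sketch: assumes a running SparkR session; paths and data are illustrative
sc <- sparkR.init()
sqlContext <- sparkRSQL.init(sc)

# write.text expects exactly one string column named "value"
df <- createDataFrame(sqlContext,
                      data.frame(value = c("first line", "second line"),
                                 stringsAsFactors = FALSE))
outDir <- tempfile(pattern = "sparkr-write-text-")   # hypothetical output directory
write.text(df, outDir)                               # one output line per DataFrame row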

R/pkg/R/SQLContext.R

Lines changed: 26 additions & 0 deletions
@@ -295,6 +295,32 @@ parquetFile <- function(sqlContext, ...) {
   read.parquet(sqlContext, unlist(list(...)))
 }

+#' Create a DataFrame from a text file.
+#'
+#' Loads a text file and returns a DataFrame with a single string column named "value".
+#' Each line in the text file is a new row in the resulting DataFrame.
+#'
+#' @param sqlContext SQLContext to use
+#' @param path Path of file to read. A vector of multiple paths is allowed.
+#' @return DataFrame
+#' @rdname read.text
+#' @name read.text
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlContext <- sparkRSQL.init(sc)
+#' path <- "path/to/file.txt"
+#' df <- read.text(sqlContext, path)
+#' }
+read.text <- function(sqlContext, path) {
+  # Allow the user to have a more flexible definition of the text file path
+  paths <- as.list(suppressWarnings(normalizePath(path)))
+  read <- callJMethod(sqlContext, "read")
+  sdf <- callJMethod(read, "text", paths)
+  dataFrame(sdf)
+}
+
 #' SQL Query
 #'
 #' Executes a SQL query using Spark, returning the result as a DataFrame.
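
A short hedged sketch of the reader side, assuming the same sqlContext and illustrative file paths: read.text yields a DataFrame with a single "value" column, one row per input line, and per the @param documentation above a vector of paths is accepted.

# Hedged sketch: paths are illustrative
df <- read.text(sqlContext, "path/to/file.txt")
colnames(df)        # "value"
count(df)           # number of lines in the file

# several files at once; their lines end up in one DataFrame
df2 <- read.text(sqlContext, c("path/to/a.txt", "path/to/b.txt"))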

R/pkg/R/functions.R

Lines changed: 20 additions & 0 deletions
@@ -340,6 +340,26 @@ setMethod("crc32",
             column(jc)
           })

+#' hash
+#'
+#' Calculates the hash code of given columns, and returns the result as an int column.
+#'
+#' @rdname hash
+#' @name hash
+#' @family misc_funcs
+#' @export
+#' @examples \dontrun{hash(df$c)}
+setMethod("hash",
+          signature(x = "Column"),
+          function(x, ...) {
+            jcols <- lapply(list(x, ...), function (x) {
+              stopifnot(class(x) == "Column")
+              x@jc
+            })
+            jc <- callJStatic("org.apache.spark.sql.functions", "hash", jcols)
+            column(jc)
+          })
+
 #' dayofmonth
 #'
 #' Extracts the day of the month as an integer from a given date/timestamp/string.
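
As a hedged usage sketch (the DataFrame and its column names are illustrative, not part of this commit): hash accepts one or more Columns, matching the generic hash(x, ...), and returns an integer column computed by org.apache.spark.sql.functions.hash.

# Hedged sketch: df, a and b are illustrative
df <- createDataFrame(sqlContext,
                      data.frame(a = c("x", "y"), b = c(1, 2),
                                 stringsAsFactors = FALSE))
withHash <- withColumn(df, "h", hash(df$a, df$b))   # hash over multiple columns
head(select(withHash, "h"))                         # integer hash values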

R/pkg/R/generics.R

Lines changed: 8 additions & 0 deletions
@@ -549,6 +549,10 @@ setGeneric("write.parquet", function(x, path) { standardGeneric("write.parquet")
 #' @export
 setGeneric("saveAsParquetFile", function(x, path) { standardGeneric("saveAsParquetFile") })

+#' @rdname write.text
+#' @export
+setGeneric("write.text", function(x, path) { standardGeneric("write.text") })
+
 #' @rdname schema
 #' @export
 setGeneric("schema", function(x) { standardGeneric("schema") })

@@ -732,6 +736,10 @@ setGeneric("countDistinct", function(x, ...) { standardGeneric("countDistinct")
 #' @export
 setGeneric("crc32", function(x) { standardGeneric("crc32") })

+#' @rdname hash
+#' @export
+setGeneric("hash", function(x, ...) { standardGeneric("hash") })
+
 #' @rdname cume_dist
 #' @export
 setGeneric("cume_dist", function(x) { standardGeneric("cume_dist") })

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 22 additions & 1 deletion
@@ -922,7 +922,7 @@ test_that("column functions", {
   c <- column("a")
   c1 <- abs(c) + acos(c) + approxCountDistinct(c) + ascii(c) + asin(c) + atan(c)
   c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c)
-  c3 <- cosh(c) + count(c) + crc32(c) + exp(c)
+  c3 <- cosh(c) + count(c) + crc32(c) + hash(c) + exp(c)
   c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c)
   c5 <- hour(c) + initcap(c) + last(c) + last_day(c) + length(c)
   c6 <- log(c) + (c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + md5(c)
@@ -1497,6 +1497,27 @@ test_that("read/write Parquet files", {
   unlink(parquetPath4)
 })

+test_that("read/write text files", {
+  # Test write.df and read.df
+  df <- read.df(sqlContext, jsonPath, "text")
+  expect_is(df, "DataFrame")
+  expect_equal(colnames(df), c("value"))
+  expect_equal(count(df), 3)
+  textPath <- tempfile(pattern = "textPath", fileext = ".txt")
+  write.df(df, textPath, "text", mode="overwrite")
+
+  # Test write.text and read.text
+  textPath2 <- tempfile(pattern = "textPath2", fileext = ".txt")
+  write.text(df, textPath2)
+  df2 <- read.text(sqlContext, c(textPath, textPath2))
+  expect_is(df2, "DataFrame")
+  expect_equal(colnames(df2), c("value"))
+  expect_equal(count(df2), count(df) * 2)
+
+  unlink(textPath)
+  unlink(textPath2)
+})
+
 test_that("describe() and summarize() on a DataFrame", {
   df <- read.json(sqlContext, jsonPath)
   stats <- describe(df, "age")
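
A hedged aside on the first half of that test: "text" is also exposed through the generic data-source API, so read.df(sqlContext, path, "text") returns the raw lines of any file (here a JSON file) as a single "value" column, and write.df(df, path, "text", ...) writes them back out. The snippet below restates that with illustrative paths.

# Hedged sketch mirroring the test above; paths are illustrative
df <- read.df(sqlContext, "path/to/people.json", "text")   # raw lines, not parsed JSON
colnames(df)                                               # "value"
write.df(df, tempfile(pattern = "text-out-"), "text", mode = "overwrite")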

bin/pyspark

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ export PYSPARK_PYTHON

 # Add the PySpark classes to the Python path:
 export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH"
-export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9-src.zip:$PYTHONPATH"
+export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9.1-src.zip:$PYTHONPATH"

 # Load the PySpark shell.py script when ./pyspark is used interactively:
 export OLD_PYTHONSTARTUP="$PYTHONSTARTUP"

bin/pyspark2.cmd

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" (
 )

 set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH%
-set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.9-src.zip;%PYTHONPATH%
+set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.9.1-src.zip;%PYTHONPATH%

 set OLD_PYTHONSTARTUP=%PYTHONSTARTUP%
 set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py
