
Commit 330e984

Author: Marcelo Vanzin
Merge branch 'master' into SPARK-2750
2 parents: 123d958 + 63eee86

File tree: 362 files changed, +8851 -7450 lines


.gitignore

Lines changed: 0 additions & 1 deletion
@@ -60,7 +60,6 @@ dev/create-release/*final
 spark-*-bin-*.tgz
 unit-tests.log
 /lib/
-ec2/lib/
 rat-results.txt
 scalastyle.txt
 scalastyle-output.xml

LICENSE

Lines changed: 1 addition & 1 deletion
@@ -264,7 +264,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
 (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
 (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
 (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
-(The New BSD License) Py4J (net.sf.py4j:py4j:0.9 - http://py4j.sourceforge.net/)
+(The New BSD License) Py4J (net.sf.py4j:py4j:0.9.1 - http://py4j.sourceforge.net/)
 (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
 (BSD licence) sbt and sbt-launch-lib.bash
 (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)

R/pkg/NAMESPACE

Lines changed: 4 additions & 1 deletion
@@ -94,7 +94,8 @@ exportMethods("arrange",
               "withColumnRenamed",
               "write.df",
               "write.json",
-              "write.parquet")
+              "write.parquet",
+              "write.text")

 exportClasses("Column")

@@ -129,6 +130,7 @@ exportMethods("%in%",
               "count",
               "countDistinct",
               "crc32",
+              "hash",
               "cume_dist",
               "date_add",
               "date_format",
@@ -274,6 +276,7 @@ export("as.DataFrame",
        "parquetFile",
        "read.df",
        "read.parquet",
+       "read.text",
        "sql",
        "table",
        "tableNames",

R/pkg/R/DataFrame.R

Lines changed: 28 additions & 0 deletions
@@ -664,6 +664,34 @@ setMethod("saveAsParquetFile",
             write.parquet(x, path)
           })

+#' write.text
+#'
+#' Saves the content of the DataFrame in a text file at the specified path.
+#' The DataFrame must have only one column of string type with the name "value".
+#' Each row becomes a new line in the output file.
+#'
+#' @param x A SparkSQL DataFrame
+#' @param path The directory where the file is saved
+#'
+#' @family DataFrame functions
+#' @rdname write.text
+#' @name write.text
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlContext <- sparkRSQL.init(sc)
+#' path <- "path/to/file.txt"
+#' df <- read.text(sqlContext, path)
+#' write.text(df, "/tmp/sparkr-tmp/")
+#'}
+setMethod("write.text",
+          signature(x = "DataFrame", path = "character"),
+          function(x, path) {
+            write <- callJMethod(x@sdf, "write")
+            invisible(callJMethod(write, "text", path))
+          })
+
 #' Distinct
 #'
 #' Return a new DataFrame containing the distinct rows in this DataFrame.
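
For context, a minimal usage sketch of the new writer, assuming a SparkR session created with sparkR.init()/sparkRSQL.init() (as in the roxygen example above); the output directory and sample data are illustrative. The single string column named "value" is the documented requirement of write.text.

# Hedged sketch: assumes a running SparkR session; paths and data are illustrative
sc <- sparkR.init()
sqlContext <- sparkRSQL.init(sc)

# write.text expects exactly one string column named "value"
df <- createDataFrame(sqlContext,
                      data.frame(value = c("first line", "second line"),
                                 stringsAsFactors = FALSE))
outDir <- tempfile(pattern = "sparkr-write-text-")   # hypothetical output directory
write.text(df, outDir)                               # one output line per DataFrame row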

R/pkg/R/SQLContext.R

Lines changed: 26 additions & 0 deletions
@@ -295,6 +295,32 @@ parquetFile <- function(sqlContext, ...) {
   read.parquet(sqlContext, unlist(list(...)))
 }

+#' Create a DataFrame from a text file.
+#'
+#' Loads a text file and returns a DataFrame with a single string column named "value".
+#' Each line in the text file is a new row in the resulting DataFrame.
+#'
+#' @param sqlContext SQLContext to use
+#' @param path Path of file to read. A vector of multiple paths is allowed.
+#' @return DataFrame
+#' @rdname read.text
+#' @name read.text
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlContext <- sparkRSQL.init(sc)
+#' path <- "path/to/file.txt"
+#' df <- read.text(sqlContext, path)
+#' }
+read.text <- function(sqlContext, path) {
+  # Allow the user to have a more flexible definition of the text file path
+  paths <- as.list(suppressWarnings(normalizePath(path)))
+  read <- callJMethod(sqlContext, "read")
+  sdf <- callJMethod(read, "text", paths)
+  dataFrame(sdf)
+}
+
 #' SQL Query
 #'
 #' Executes a SQL query using Spark, returning the result as a DataFrame.
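
A short hedged sketch of the reader side, assuming the same sqlContext and illustrative file paths: read.text yields a DataFrame with a single "value" column, one row per input line, and per the @param documentation above a vector of paths is accepted.

# Hedged sketch: paths are illustrative
df <- read.text(sqlContext, "path/to/file.txt")
colnames(df)        # "value"
count(df)           # number of lines in the file

# several files at once; their lines end up in one DataFrame
df2 <- read.text(sqlContext, c("path/to/a.txt", "path/to/b.txt"))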

R/pkg/R/functions.R

Lines changed: 20 additions & 0 deletions
@@ -340,6 +340,26 @@ setMethod("crc32",
             column(jc)
           })

+#' hash
+#'
+#' Calculates the hash code of given columns, and returns the result as an int column.
+#'
+#' @rdname hash
+#' @name hash
+#' @family misc_funcs
+#' @export
+#' @examples \dontrun{hash(df$c)}
+setMethod("hash",
+          signature(x = "Column"),
+          function(x, ...) {
+            jcols <- lapply(list(x, ...), function (x) {
+              stopifnot(class(x) == "Column")
+              x@jc
+            })
+            jc <- callJStatic("org.apache.spark.sql.functions", "hash", jcols)
+            column(jc)
+          })
+
 #' dayofmonth
 #'
 #' Extracts the day of the month as an integer from a given date/timestamp/string.
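
As a hedged usage sketch (the DataFrame and its column names are illustrative, not part of this commit): hash accepts one or more Columns, matching the generic hash(x, ...), and returns an integer column computed by org.apache.spark.sql.functions.hash.

# Hedged sketch: df, a and b are illustrative
df <- createDataFrame(sqlContext,
                      data.frame(a = c("x", "y"), b = c(1, 2),
                                 stringsAsFactors = FALSE))
withHash <- withColumn(df, "h", hash(df$a, df$b))   # hash over multiple columns
head(select(withHash, "h"))                         # integer hash values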

R/pkg/R/generics.R

Lines changed: 8 additions & 0 deletions
@@ -549,6 +549,10 @@ setGeneric("write.parquet", function(x, path) { standardGeneric("write.parquet")
 #' @export
 setGeneric("saveAsParquetFile", function(x, path) { standardGeneric("saveAsParquetFile") })

+#' @rdname write.text
+#' @export
+setGeneric("write.text", function(x, path) { standardGeneric("write.text") })
+
 #' @rdname schema
 #' @export
 setGeneric("schema", function(x) { standardGeneric("schema") })

@@ -732,6 +736,10 @@ setGeneric("countDistinct", function(x, ...) { standardGeneric("countDistinct")
 #' @export
 setGeneric("crc32", function(x) { standardGeneric("crc32") })

+#' @rdname hash
+#' @export
+setGeneric("hash", function(x, ...) { standardGeneric("hash") })
+
 #' @rdname cume_dist
 #' @export
 setGeneric("cume_dist", function(x) { standardGeneric("cume_dist") })

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 22 additions & 1 deletion
@@ -922,7 +922,7 @@ test_that("column functions", {
   c <- column("a")
   c1 <- abs(c) + acos(c) + approxCountDistinct(c) + ascii(c) + asin(c) + atan(c)
   c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c)
-  c3 <- cosh(c) + count(c) + crc32(c) + exp(c)
+  c3 <- cosh(c) + count(c) + crc32(c) + hash(c) + exp(c)
   c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c)
   c5 <- hour(c) + initcap(c) + last(c) + last_day(c) + length(c)
   c6 <- log(c) + (c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + md5(c)
@@ -1497,6 +1497,27 @@ test_that("read/write Parquet files", {
   unlink(parquetPath4)
 })

+test_that("read/write text files", {
+  # Test write.df and read.df
+  df <- read.df(sqlContext, jsonPath, "text")
+  expect_is(df, "DataFrame")
+  expect_equal(colnames(df), c("value"))
+  expect_equal(count(df), 3)
+  textPath <- tempfile(pattern = "textPath", fileext = ".txt")
+  write.df(df, textPath, "text", mode="overwrite")
+
+  # Test write.text and read.text
+  textPath2 <- tempfile(pattern = "textPath2", fileext = ".txt")
+  write.text(df, textPath2)
+  df2 <- read.text(sqlContext, c(textPath, textPath2))
+  expect_is(df2, "DataFrame")
+  expect_equal(colnames(df2), c("value"))
+  expect_equal(count(df2), count(df) * 2)
+
+  unlink(textPath)
+  unlink(textPath2)
+})
+
 test_that("describe() and summarize() on a DataFrame", {
   df <- read.json(sqlContext, jsonPath)
   stats <- describe(df, "age")
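
A hedged aside on the first half of that test: "text" is also exposed through the generic data-source API, so read.df(sqlContext, path, "text") returns the raw lines of any file (here a JSON file) as a single "value" column, and write.df(df, path, "text", ...) writes them back out. The snippet below restates that with illustrative paths.

# Hedged sketch mirroring the test above; paths are illustrative
df <- read.df(sqlContext, "path/to/people.json", "text")   # raw lines, not parsed JSON
colnames(df)                                               # "value"
write.df(df, tempfile(pattern = "text-out-"), "text", mode = "overwrite")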

bin/pyspark

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ export PYSPARK_PYTHON

 # Add the PySpark classes to the Python path:
 export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH"
-export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9-src.zip:$PYTHONPATH"
+export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9.1-src.zip:$PYTHONPATH"

 # Load the PySpark shell.py script when ./pyspark is used interactively:
 export OLD_PYTHONSTARTUP="$PYTHONSTARTUP"

bin/pyspark2.cmd

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" (
 )

 set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH%
-set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.9-src.zip;%PYTHONPATH%
+set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.9.1-src.zip;%PYTHONPATH%

 set OLD_PYTHONSTARTUP=%PYTHONSTARTUP%
 set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py
