Commit 7bc7d28

Merge pull request apache#17 from apache/master

merge latest spark

2 parents 0ba5f42 + 778f3ca commit 7bc7d28

1,019 files changed: 18,312 additions, 12,201 deletions


.rat-excludes

Lines changed: 3 additions & 0 deletions
@@ -80,5 +80,8 @@ local-1425081759269/*
 local-1426533911241/*
 local-1426633911242/*
 local-1430917381534/*
+local-1430917381535_1
+local-1430917381535_2
 DESCRIPTION
 NAMESPACE
+test_support/*

LICENSE

Lines changed: 46 additions & 0 deletions
@@ -853,6 +853,52 @@ and

 Vis.js may be distributed under either license.

+========================================================================
+For dagre-d3 (core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js):
+========================================================================
+Copyright (c) 2013 Chris Pettitt
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+========================================================================
+For graphlib-dot (core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js):
+========================================================================
+Copyright (c) 2012-2013 Chris Pettitt
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
 ========================================================================
 BSD-style licenses
 ========================================================================

R/create-docs.sh

Lines changed: 4 additions & 4 deletions
@@ -23,14 +23,14 @@
 # After running this script the html docs can be found in
 # $SPARK_HOME/R/pkg/html

+set -o pipefail
+set -e
+
 # Figure out where the script is
 export FWDIR="$(cd "`dirname "$0"`"; pwd)"
 pushd $FWDIR

-# Generate Rd file
-Rscript -e 'library(devtools); devtools::document(pkg="./pkg", roclets=c("rd"))'
-
-# Install the package
+# Install the package (this will also generate the Rd files)
 ./install-dev.sh

 # Now create HTML files

R/install-dev.sh

Lines changed: 10 additions & 1 deletion
@@ -26,11 +26,20 @@
 # NOTE(shivaram): Right now we use $SPARK_HOME/R/lib to be the installation directory
 # to load the SparkR package on the worker nodes.

+set -o pipefail
+set -e

 FWDIR="$(cd `dirname $0`; pwd)"
 LIB_DIR="$FWDIR/lib"

 mkdir -p $LIB_DIR

-# Install R
+pushd $FWDIR
+
+# Generate Rd files if devtools is installed
+Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
+
+# Install SparkR to $LIB_DIR
 R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/
+
+popd
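
For readability, here is the R expression that install-dev.sh now passes to
Rscript, expanded. It generates the Rd files only when devtools is available,
so the install still succeeds on a machine without devtools:

# Expanded form of the Rscript one-liner above
if ("devtools" %in% rownames(installed.packages())) {
  library(devtools)
  devtools::document(pkg = "./pkg", roclets = c("rd"))
}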

R/pkg/NAMESPACE

Lines changed: 2 additions & 0 deletions
@@ -19,9 +19,11 @@ exportMethods("arrange",
               "count",
               "describe",
               "distinct",
+              "dropna",
               "dtypes",
               "except",
               "explain",
+              "fillna",
               "filter",
               "first",
               "group_by",

R/pkg/R/DataFrame.R

Lines changed: 129 additions & 6 deletions
@@ -1314,9 +1314,8 @@ setMethod("except",
 #' write.df(df, "myfile", "parquet", "overwrite")
 #' }
 setMethod("write.df",
-          signature(df = "DataFrame", path = 'character', source = 'character',
-                    mode = 'character'),
-          function(df, path = NULL, source = NULL, mode = "append", ...){
+          signature(df = "DataFrame", path = 'character'),
+          function(df, path, source = NULL, mode = "append", ...){
            if (is.null(source)) {
              sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
              source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
@@ -1338,9 +1337,8 @@ setMethod("write.df",
 #' @aliases saveDF
 #' @export
 setMethod("saveDF",
-          signature(df = "DataFrame", path = 'character', source = 'character',
-                    mode = 'character'),
-          function(df, path = NULL, source = NULL, mode = "append", ...){
+          signature(df = "DataFrame", path = 'character'),
+          function(df, path, source = NULL, mode = "append", ...){
            write.df(df, path, source, mode, ...)
          })

@@ -1431,3 +1429,128 @@ setMethod("describe",
           sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
           dataFrame(sdf)
         })
+
+#' dropna
+#'
+#' Returns a new DataFrame omitting rows with null values.
+#'
+#' @param x A SparkSQL DataFrame.
+#' @param how "any" or "all".
+#'            if "any", drop a row if it contains any nulls.
+#'            if "all", drop a row only if all its values are null.
+#'            if minNonNulls is specified, how is ignored.
+#' @param minNonNulls If specified, drop rows that have less than
+#'                    minNonNulls non-null values.
+#'                    This overwrites the how parameter.
+#' @param cols Optional list of column names to consider.
+#' @return A DataFrame
+#'
+#' @rdname nafunctions
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlCtx <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlCtx, path)
+#' dropna(df)
+#' }
+setMethod("dropna",
+          signature(x = "DataFrame"),
+          function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) {
+            how <- match.arg(how)
+            if (is.null(cols)) {
+              cols <- columns(x)
+            }
+            if (is.null(minNonNulls)) {
+              minNonNulls <- if (how == "any") { length(cols) } else { 1 }
+            }
+
+            naFunctions <- callJMethod(x@sdf, "na")
+            sdf <- callJMethod(naFunctions, "drop",
+                               as.integer(minNonNulls), listToSeq(as.list(cols)))
+            dataFrame(sdf)
+          })
+
+#' @aliases dropna
+#' @export
+setMethod("na.omit",
+          signature(x = "DataFrame"),
+          function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) {
+            dropna(x, how, minNonNulls, cols)
+          })
+
+#' fillna
+#'
+#' Replace null values.
+#'
+#' @param x A SparkSQL DataFrame.
+#' @param value Value to replace null values with.
+#'              Should be an integer, numeric, character or named list.
+#'              If the value is a named list, then cols is ignored and
+#'              value must be a mapping from column name (character) to
+#'              replacement value. The replacement value must be an
+#'              integer, numeric or character.
+#' @param cols optional list of column names to consider.
+#'             Columns specified in cols that do not have matching data
+#'             type are ignored. For example, if value is a character, and
+#'             subset contains a non-character column, then the non-character
+#'             column is simply ignored.
+#' @return A DataFrame
+#'
+#' @rdname nafunctions
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlCtx <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlCtx, path)
+#' fillna(df, 1)
+#' fillna(df, list("age" = 20, "name" = "unknown"))
+#' }
setMethod("fillna",
+          signature(x = "DataFrame"),
+          function(x, value, cols = NULL) {
+            if (!(class(value) %in% c("integer", "numeric", "character", "list"))) {
+              stop("value should be an integer, numeric, character or named list.")
+            }
+
+            if (class(value) == "list") {
+              # Check column names in the named list
+              colNames <- names(value)
+              if (length(colNames) == 0 || !all(colNames != "")) {
+                stop("value should be a named list with each name being a column name.")
+              }
+
+              # Convert the named list to an environment to be passed to JVM
+              valueMap <- new.env()
+              for (col in colNames) {
+                # Check each item in the named list is of valid type
+                v <- value[[col]]
+                if (!(class(v) %in% c("integer", "numeric", "character"))) {
+                  stop("Each item in value should be an integer, numeric or character.")
+                }
+                valueMap[[col]] <- v
+              }
+
+              # When value is a named list, caller is expected not to pass in cols
+              if (!is.null(cols)) {
+                warning("When value is a named list, cols is ignored!")
+                cols <- NULL
+              }
+
+              value <- valueMap
+            } else if (is.integer(value)) {
+              # Cast an integer to a numeric
+              value <- as.numeric(value)
+            }
+
+            naFunctions <- callJMethod(x@sdf, "na")
+            sdf <- if (length(cols) == 0) {
+              callJMethod(naFunctions, "fill", value)
+            } else {
+              callJMethod(naFunctions, "fill", value, listToSeq(as.list(cols)))
+            }
+            dataFrame(sdf)
+          })
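
A minimal usage sketch of the new NA-handling methods, assuming a running
SparkR session and a JSON file with "age" and "name" columns as in the roxygen
examples above (the path and column names are illustrative):

sc <- sparkR.init()
sqlCtx <- sparkRSQL.init(sc)
df <- jsonFile(sqlCtx, "path/to/file.json")

dropna(df)                    # drop rows containing any null
dropna(df, how = "all")       # drop rows only if every value is null
dropna(df, minNonNulls = 2)   # keep rows with at least 2 non-nulls; overrides how
na.omit(df)                   # alias for dropna

fillna(df, 1)                                     # fill numeric columns with 1
fillna(df, list("age" = 20, "name" = "unknown"))  # per-column fills; cols is ignored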

R/pkg/R/SQLContext.R

Lines changed: 15 additions & 4 deletions
@@ -452,20 +452,31 @@ dropTempTable <- function(sqlContext, tableName) {
 #' df <- read.df(sqlContext, "path/to/file.json", source = "json")
 #' }

-read.df <- function(sqlContext, path = NULL, source = NULL, ...) {
+read.df <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) {
   options <- varargsToEnv(...)
   if (!is.null(path)) {
     options[['path']] <- path
   }
-  sdf <- callJMethod(sqlContext, "load", source, options)
+  if (is.null(source)) {
+    sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
+    source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
+                          "org.apache.spark.sql.parquet")
+  }
+  if (!is.null(schema)) {
+    stopifnot(class(schema) == "structType")
+    sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sqlContext, source,
+                       schema$jobj, options)
+  } else {
+    sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "loadDF", sqlContext, source, options)
+  }
   dataFrame(sdf)
 }

 #' @aliases loadDF
 #' @export

-loadDF <- function(sqlContext, path = NULL, source = NULL, ...) {
-  read.df(sqlContext, path, source, ...)
+loadDF <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) {
+  read.df(sqlContext, path, source, schema, ...)
 }

 #' Create an external table
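
A short sketch of the new schema argument to read.df, assuming the session
from the earlier example; structType and structField are SparkR's schema
constructors, and the field names are illustrative:

# Supplying an explicit schema skips schema inference
schema <- structType(structField("name", "string"), structField("age", "integer"))
df <- read.df(sqlCtx, "path/to/file.json", source = "json", schema = schema)

# With source omitted, read.df now falls back to spark.sql.sources.default
# (parquet unless configured otherwise)
df2 <- read.df(sqlCtx, "path/to/data.parquet")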

R/pkg/R/generics.R

Lines changed: 20 additions & 2 deletions
@@ -396,6 +396,20 @@ setGeneric("columns", function(x) {standardGeneric("columns") })
 #' @export
 setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })

+#' @rdname nafunctions
+#' @export
+setGeneric("dropna",
+           function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) {
+             standardGeneric("dropna")
+           })
+
+#' @rdname nafunctions
+#' @export
+setGeneric("na.omit",
+           function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) {
+             standardGeneric("na.omit")
+           })
+
 #' @rdname schema
 #' @export
 setGeneric("dtypes", function(x) { standardGeneric("dtypes") })
@@ -408,6 +422,10 @@ setGeneric("explain", function(x, ...) { standardGeneric("explain") })
 #' @export
 setGeneric("except", function(x, y) { standardGeneric("except") })

+#' @rdname nafunctions
+#' @export
+setGeneric("fillna", function(x, value, cols = NULL) { standardGeneric("fillna") })
+
 #' @rdname filter
 #' @export
 setGeneric("filter", function(x, condition) { standardGeneric("filter") })
@@ -482,11 +500,11 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) {

 #' @rdname write.df
 #' @export
-setGeneric("write.df", function(df, path, source, mode, ...) { standardGeneric("write.df") })
+setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") })

 #' @rdname write.df
 #' @export
-setGeneric("saveDF", function(df, path, source, mode, ...) { standardGeneric("saveDF") })
+setGeneric("saveDF", function(df, path, ...) { standardGeneric("saveDF") })

 #' @rdname schema
 #' @export
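
With the relaxed write.df and saveDF generics above, source and mode are now
ordinary defaults in the DataFrame methods rather than required signature
slots, so both of the following calls are valid (the second is the form shown
in the roxygen example earlier):

write.df(df, "myfile")                          # default source, mode = "append"
write.df(df, "myfile", "parquet", "overwrite")  # explicit source and mode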

R/pkg/R/serialize.R

Lines changed: 17 additions & 1 deletion
@@ -37,6 +37,14 @@ writeObject <- function(con, object, writeType = TRUE) {
   # passing in vectors as arrays and instead require arrays to be passed
   # as lists.
   type <- class(object)[[1]] # class of POSIXlt is c("POSIXlt", "POSIXt")
+  # Checking types is needed here, since is.na only handles atomic vectors,
+  # lists and pairlists
+  if (type %in% c("integer", "character", "logical", "double", "numeric")) {
+    if (is.na(object)) {
+      object <- NULL
+      type <- "NULL"
+    }
+  }
   if (writeType) {
     writeType(con, type)
   }
@@ -160,6 +168,14 @@ writeList <- function(con, arr) {
   }
 }

+# Used to pass arrays where the elements can be of different types
+writeGenericList <- function(con, list) {
+  writeInt(con, length(list))
+  for (elem in list) {
+    writeObject(con, elem)
+  }
+}
+
 # Used to pass in hash maps required on Java side.
 writeEnv <- function(con, env) {
   len <- length(env)
@@ -168,7 +184,7 @@ writeEnv <- function(con, env) {
   if (len > 0) {
     writeList(con, as.list(ls(env)))
     vals <- lapply(ls(env), function(x) { env[[x]] })
-    writeList(con, as.list(vals))
+    writeGenericList(con, as.list(vals))
   }
 }