Commit c7964c9

Author: Sun Rui

Merge with upstream master.

2 parents 7feac38 + 0c6e071

10 files changed: +282 −81 lines changed

10 files changed

+282
-81
lines changed

README.md

Lines changed: 5 additions & 0 deletions

@@ -84,7 +84,12 @@ pass the variable `spark.executor.memory` to the SparkContext constructor.
     sc <- sparkR.init(master="spark://<master>:7077",
                       sparkEnvir=list(spark.executor.memory="1g"))

+Finally, to stop the cluster run

+    sparkR.stop()
+
+sparkR.stop() can be invoked to terminate a SparkContext created previously via sparkR.init(). Then you can call sparkR.init() again to create a new SparkContext that may have different configurations.
+
 ## Examples, Unit tests

 SparkR comes with several sample programs in the `examples` directory.
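As a quick illustration of the workflow the new paragraph describes (a minimal sketch; the master URL and memory setting below are placeholders, not values from this commit):

    sc <- sparkR.init(master="local")
    # ... run some jobs ...
    sparkR.stop()

    # A fresh SparkContext with a different configuration can now be created
    # without restarting R.
    sc <- sparkR.init(master="local",
                      sparkEnvir=list(spark.executor.memory="2g"))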

pkg/NAMESPACE

Lines changed: 3 additions & 0 deletions

@@ -6,6 +6,7 @@ exportMethods(
   "checkpoint",
   "cogroup",
   "collect",
+  "collectAsMap",
   "collectPartition",
   "combineByKey",
   "count",
@@ -40,6 +41,7 @@ exportMethods(
   "persist",
   "reduce",
   "reduceByKey",
+  "reduceByKeyLocally",
   "rightOuterJoin",
   "sampleRDD",
   "saveAsTextFile",
@@ -66,6 +68,7 @@ export(
   "setCheckpointDir"
 )
 export("sparkR.init")
+export("sparkR.stop")
 export("print.jobj")
 useDynLib(SparkR, stringHashCode)
 importFrom(methods, setGeneric, setMethod, setOldClass)
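The three new exports correspond to the additions in pkg/R/RDD.R and pkg/R/sparkR.R below. A brief usage sketch of the two new RDD methods, adapted from the roxygen examples in this commit; the printed results are indicative only and the ordering of pairs is not guaranteed:

    sc <- sparkR.init()
    rdd <- parallelize(sc, list(list(1, 2), list(3, 4), list(1, 6)), 2L)

    # collectAsMap() returns a named list; keys are coerced to character, and a
    # duplicate key keeps the last value collected.
    collectAsMap(rdd)             # e.g. list(`1` = 6, `3` = 4)

    # reduceByKeyLocally() merges values per key and hands an R list straight
    # back to the driver instead of producing another RDD.
    reduceByKeyLocally(rdd, "+")  # e.g. list(list(1, 8), list(3, 4))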

pkg/R/RDD.R

Lines changed: 116 additions & 54 deletions

@@ -358,6 +358,7 @@ setMethod("collect",
             convertJListToRList(collected, flatten)
           })

+
 #' @rdname collect-methods
 #' @export
 #' @description
@@ -382,6 +383,29 @@ setMethod("collectPartition",
             convertJListToRList(jList, flatten = TRUE)
           })

+#' @rdname collect-methods
+#' @export
+#' @description
+#' \code{collectAsMap} returns a named list as a map that contains all of the elements
+#' in a key-value pair RDD.
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd <- parallelize(sc, list(list(1, 2), list(3, 4)), 2L)
+#' collectAsMap(rdd) # list(`1` = 2, `3` = 4)
+#'}
+setGeneric("collectAsMap", function(rdd) { standardGeneric("collectAsMap") })
+
+#' @rdname collect-methods
+#' @aliases collectAsMap,RDD-method
+setMethod("collectAsMap",
+          signature(rdd = "RDD"),
+          function(rdd) {
+            pairList <- collect(rdd)
+            map <- new.env()
+            lapply(pairList, function(x) { assign(as.character(x[[1]]), x[[2]], envir = map) })
+            as.list(map)
+          })

 #' Look up elements of a key in an RDD
 #'
@@ -1388,26 +1412,32 @@ setMethod("groupByKey",
             groupVals <- function(part) {
               vals <- new.env()
               keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
+              appendList <- function(acc, x) {
+                addItemToAccumulator(acc, x)
+                acc
+              }
+              makeList <- function(x) {
+                acc <- initAccumulator()
+                addItemToAccumulator(acc, x)
+                acc
+              }
               # Each item in the partition is list of (K, V)
               lapply(part,
                      function(item) {
-                       hashVal <- as.character(hashCode(item[[1]]))
-                       if (exists(hashVal, vals)) {
-                         acc <- vals[[hashVal]]
-                         acc[[length(acc) + 1]] <- item[[2]]
-                         vals[[hashVal]] <- acc
-                       } else {
-                         vals[[hashVal]] <- list(item[[2]])
-                         keys[[hashVal]] <- item[[1]]
-                       }
+                       item$hash <- as.character(hashCode(item[[1]]))
+                       updateOrCreatePair(item, keys, vals, pred,
+                                          appendList, makeList)
                      })
+              # extract out data field
+              vals <- eapply(vals,
+                             function(x) {
+                               length(x$data) <- x$counter
+                               x$data
+                             })
               # Every key in the environment contains a list
               # Convert that to list(K, Seq[V])
-              grouped <- lapply(ls(vals),
-                                function(name) {
-                                  list(keys[[name]], vals[[name]])
-                                })
-              grouped
+              convertEnvsToList(keys, vals)
             }
             lapplyPartition(shuffled, groupVals)
           })
@@ -1448,28 +1478,78 @@ setMethod("reduceByKey",
             reduceVals <- function(part) {
               vals <- new.env()
               keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
               lapply(part,
                      function(item) {
-                       hashVal <- as.character(hashCode(item[[1]]))
-                       if (exists(hashVal, vals)) {
-                         vals[[hashVal]] <- do.call(
-                           combineFunc, list(vals[[hashVal]], item[[2]]))
-                       } else {
-                         vals[[hashVal]] <- item[[2]]
-                         keys[[hashVal]] <- item[[1]]
-                       }
+                       item$hash <- as.character(hashCode(item[[1]]))
+                       updateOrCreatePair(item, keys, vals, pred, combineFunc, identity)
                      })
-              combined <- lapply(ls(vals),
-                                 function(name) {
-                                   list(keys[[name]], vals[[name]])
-                                 })
-              combined
+              convertEnvsToList(keys, vals)
             }
             locallyReduced <- lapplyPartition(rdd, reduceVals)
             shuffled <- partitionBy(locallyReduced, numPartitions)
             lapplyPartition(shuffled, reduceVals)
           })

+#' Merge values by key locally
+#'
+#' This function operates on RDDs where every element is of the form list(K, V) or c(K, V),
+#' and merges the values for each key using an associative reduce function, but returns the
+#' results immediately to the driver as an R list.
+#'
+#' @param rdd The RDD to reduce by key. Should be an RDD where each element is
+#'             list(K, V) or c(K, V).
+#' @param combineFunc The associative reduce function to use.
+#' @return A list of elements of type list(K, V') where V' is the merged value for each key
+#' @rdname reduceByKeyLocally
+#' @seealso reduceByKey
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
+#' rdd <- parallelize(sc, pairs)
+#' reduced <- reduceByKeyLocally(rdd, "+")
+#' reduced # list(list(1, 6), list(1.1, 3))
+#'}
+setGeneric("reduceByKeyLocally",
+           function(rdd, combineFunc) {
+             standardGeneric("reduceByKeyLocally")
+           })
+
+#' @rdname reduceByKeyLocally
+#' @aliases reduceByKeyLocally,RDD,integer-method
+setMethod("reduceByKeyLocally",
+          signature(rdd = "RDD", combineFunc = "ANY"),
+          function(rdd, combineFunc) {
+            reducePart <- function(part) {
+              vals <- new.env()
+              keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
+              lapply(part,
+                     function(item) {
+                       item$hash <- as.character(hashCode(item[[1]]))
+                       updateOrCreatePair(item, keys, vals, pred, combineFunc, identity)
+                     })
+              list(list(keys, vals)) # return hash to avoid re-compute in merge
+            }
+            mergeParts <- function(accum, x) {
+              pred <- function(item) {
+                exists(item$hash, accum[[1]])
+              }
+              lapply(ls(x[[1]]),
+                     function(name) {
+                       item <- list(x[[1]][[name]], x[[2]][[name]])
+                       item$hash <- name
+                       updateOrCreatePair(item, accum[[1]], accum[[2]], pred, combineFunc, identity)
+                     })
+              accum
+            }
+            reduced <- mapPartitions(rdd, reducePart)
+            merged <- reduce(reduced, mergeParts)
+            convertEnvsToList(merged[[1]], merged[[2]])
+          })
+
 #' Combine values by key
 #'
 #' Generic function to combine the elements for each key using a custom set of
@@ -1519,46 +1599,28 @@ setMethod("combineByKey",
             combineLocally <- function(part) {
               combiners <- new.env()
               keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
               lapply(part,
                      function(item) {
-                       k <- as.character(item[[1]])
-                       if (!exists(k, keys)) {
-                         combiners[[k]] <- do.call(createCombiner,
-                                                   list(item[[2]]))
-                         keys[[k]] <- item[[1]]
-                       } else {
-                         combiners[[k]] <- do.call(mergeValue,
-                                                   list(combiners[[k]],
-                                                        item[[2]]))
-                       }
-                     })
-              lapply(ls(keys), function(k) {
-                list(keys[[k]], combiners[[k]])
+                       item$hash <- as.character(item[[1]])
+                       updateOrCreatePair(item, keys, combiners, pred, mergeValue, createCombiner)
                      })
+              convertEnvsToList(keys, combiners)
             }
             locallyCombined <- lapplyPartition(rdd, combineLocally)
             shuffled <- partitionBy(locallyCombined, numPartitions)
             mergeAfterShuffle <- function(part) {
               combiners <- new.env()
               keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
               lapply(part,
                      function(item) {
-                       k <- as.character(item[[1]])
-                       if (!exists(k, combiners)) {
-                         combiners[[k]] <- item[[2]]
-                         keys[[k]] <- item[[1]]
-                       } else {
-                         combiners[[k]] <- do.call(mergeCombiners,
-                                                   list(combiners[[k]],
-                                                        item[[2]]))
-                       }
-                     })
-              lapply(ls(keys), function(k) {
-                list(keys[[k]], combiners[[k]])
+                       item$hash <- as.character(item[[1]])
+                       updateOrCreatePair(item, keys, combiners, pred, mergeCombiners, identity)
                      })
+              convertEnvsToList(keys, combiners)
             }
-            combined <-lapplyPartition(shuffled, mergeAfterShuffle)
-            combined
+            lapplyPartition(shuffled, mergeAfterShuffle)
           })

 ############ Binary Functions #############
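The refactored bodies of groupByKey, reduceByKey, reduceByKeyLocally, and combineByKey above all delegate the per-key bookkeeping to updateOrCreatePair, convertEnvsToList, initAccumulator, and addItemToAccumulator, none of which appear in this excerpt (they presumably live in one of the other changed files, such as a shared utilities file). The sketch below is only a plausible reconstruction inferred from the call sites above, not the commit's actual definitions:

    # Hypothetical sketches, inferred from how RDD.R calls these helpers.

    # Insert or update a (key, value) pair held in two parallel environments.
    # pred(item) reports whether item$hash is already present; updateFn merges a
    # new value into an existing one, and createFn wraps the first value for a key.
    updateOrCreatePair <- function(item, keys, vals, pred, updateFn, createFn) {
      hashVal <- item$hash
      if (pred(item)) {
        assign(hashVal, do.call(updateFn, list(get(hashVal, envir = vals), item[[2]])),
               envir = vals)
      } else {
        assign(hashVal, do.call(createFn, list(item[[2]])), envir = vals)
        assign(hashVal, item[[1]], envir = keys)
      }
    }

    # Turn the parallel keys/vals environments back into a list of list(K, V) pairs.
    convertEnvsToList <- function(keys, vals) {
      lapply(ls(keys),
             function(name) {
               list(get(name, envir = keys), get(name, envir = vals))
             })
    }

    # Accumulator used by groupByKey: a pre-grown list plus a counter, so that
    # appending a value is amortized O(1) instead of copying the list each time.
    initAccumulator <- function() {
      acc <- new.env()
      acc$counter <- 0
      acc$size <- 1
      acc$data <- vector("list", 1)
      acc
    }

    addItemToAccumulator <- function(acc, item) {
      if (acc$counter == acc$size) {
        acc$size <- acc$size * 2
        length(acc$data) <- acc$size
      }
      acc$counter <- acc$counter + 1
      acc$data[[acc$counter]] <- item
    }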

pkg/R/context.R

Lines changed: 10 additions & 5 deletions

@@ -85,10 +85,16 @@ parallelize <- function(sc, coll, numSlices = 1) {
   # TODO: bound/safeguard numSlices
   # TODO: unit tests for if the split works for all primitives
   # TODO: support matrix, data frame, etc
-  if (!is.list(coll)) {
-    if (!is.vector(coll)) {
-      message(paste("context.R: parallelize() currently only supports lists and vectors.",
-                    "Calling as.list() to coerce coll into a list."))
+  if ((!is.list(coll) && !is.vector(coll)) || is.data.frame(coll)) {
+    if (is.data.frame(coll)) {
+      message(paste("context.R: A data frame is parallelized by columns."))
+    } else {
+      if (is.matrix(coll)) {
+        message(paste("context.R: A matrix is parallelized by elements."))
+      } else {
+        message(paste("context.R: parallelize() currently only supports lists and vectors.",
+                      "Calling as.list() to coerce coll into a list."))
+      }
     }
     coll <- as.list(coll)
   }
@@ -109,7 +115,6 @@ parallelize <- function(sc, coll, numSlices = 1) {
   RDD(jrdd, TRUE)
 }

-
 #' Include this specified package on all workers
 #'
 #' This function can be used to include a package on all workers before the
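With the broadened check above, parallelize() now accepts data frames and matrices and coerces them with as.list(): a data frame becomes one element per column, a matrix one element per cell in column-major order. A small sketch of the resulting behavior (assuming a local SparkR session; the collected shapes are indicative):

    sc <- sparkR.init(master="local")

    # One RDD element per column; prints "A data frame is parallelized by columns."
    df <- data.frame(a = 1:2, b = c(3, 4))
    collect(parallelize(sc, df, 2L))   # roughly list(1:2, c(3, 4))

    # One RDD element per matrix cell; prints "A matrix is parallelized by elements."
    m <- matrix(1:4, nrow = 2)
    collect(parallelize(sc, m, 2L))    # roughly list(1, 2, 3, 4)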

pkg/R/sparkR.R

Lines changed: 28 additions & 19 deletions

@@ -23,30 +23,39 @@ connExists <- function(env) {

 # Stop the Spark context.
 # Also terminates the backend this R session is connected to
-sparkR.stop <- function(env) {
-  cat("Stopping SparkR\n")
+sparkR.stop <- function(env = .sparkREnv) {

   if (!connExists(env)) {
     # When the workspace is saved in R, the connections are closed
     # *before* the finalizer is run. In these cases, we reconnect
     # to the backend, so we can shut it down.
-    connectBackend("localhost", .sparkREnv$sparkRBackendPort)
-  }
-
-  if (exists(".sparkRjsc", envir = env)) {
-    sc <- get(".sparkRjsc", envir = env)
-    callJMethod(sc, "stop")
+    tryCatch({
+      connectBackend("localhost", .sparkREnv$sparkRBackendPort)
+    }, error = function(err) {
+      cat("Error in Connection: Use sparkR.init() to restart SparkR\n")
+    }, warning = function(war) {
+      cat("No Connection Found: Use sparkR.init() to restart SparkR\n")
+    })
+  }
+
+  if (exists(".sparkRCon", envir = env)) {
+    cat("Stopping SparkR\n")
+    if (exists(".sparkRjsc", envir = env)) {
+      sc <- get(".sparkRjsc", envir = env)
+      callJMethod(sc, "stop")
+      rm(".sparkRjsc", envir = env)
+    }
+
+    callJStatic("SparkRHandler", "stopBackend")
+    # Also close the connection and remove it from our env
+    conn <- get(".sparkRCon", env)
+    close(conn)
+    rm(".sparkRCon", envir = env)
+    # Finally, sleep for 1 sec to let backend finish exiting.
+    # Without this we get port conflicts in RStudio when we try to 'Restart R'.
+    Sys.sleep(1)
   }
-
-  callJStatic("SparkRHandler", "stopBackend")
-  # Also close the connection and remove it from our env
-  conn <- get(".sparkRCon", env)
-  close(conn)
-  rm(".sparkRCon", envir = env)
-
-  # Finally, sleep for 1 sec to let backend finish exiting.
-  # Without this we get port conflicts in RStudio when we try to 'Restart R'.
-  Sys.sleep(1)
+
 }

 #' Initialize a new Spark Context.
@@ -84,7 +93,7 @@ sparkR.init <- function(
   sparkRBackendPort = 12345) {

   if (exists(".sparkRjsc", envir = .sparkREnv)) {
-    cat("Re-using existing Spark Context. Please restart R to create a new Spark Context\n")
+    cat("Re-using existing Spark Context. Please stop SparkR with sparkR.stop() or restart R to create a new Spark Context\n")
     return(get(".sparkRjsc", envir = .sparkREnv))
   }

pkg/R/sparkRClient.R

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44
# if one doesn't already exist
55
connectBackend <- function(hostname, port, timeout = 6000) {
66
if (exists(".sparkRcon", envir = .sparkREnv)) {
7-
cat("SparkRBackend client connection already exists\n")
8-
return(get(".sparkRcon", envir = .sparkREnv))
7+
if (isOpen(env[[".sparkRCon"]])) {
8+
cat("SparkRBackend client connection already exists\n")
9+
return(get(".sparkRcon", envir = .sparkREnv))
10+
}
911
}
1012

1113
con <- socketConnection(host = hostname, port = port, server = FALSE,
