From 7ac3e6917028958c67992047eed5736cacb77f0e Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Wed, 26 Dec 2018 17:52:49 -0500 Subject: [PATCH 01/13] Added ignite and kafka dataset constructors --- R-package/R/ignite_dataset.R | 56 ++++++++++++++++++++++++++++++++++++ R-package/R/kafka_dataset.R | 27 +++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 R-package/R/ignite_dataset.R create mode 100644 R-package/R/kafka_dataset.R diff --git a/R-package/R/ignite_dataset.R b/R-package/R/ignite_dataset.R new file mode 100644 index 000000000..5b7ffa3a5 --- /dev/null +++ b/R-package/R/ignite_dataset.R @@ -0,0 +1,56 @@ +#' Create a IgniteDataset. +#' +#' Apache Ignite is a memory-centric distributed database, caching, and +#' processing platform for transactional, analytical, and streaming workloads, +#' delivering in-memory speeds at petabyte scale. This contrib package +#' contains an integration between Apache Ignite and TensorFlow. The +#' integration is based on tf.data from TensorFlow side and Binary Client +#' Protocol from Apache Ignite side. It allows to use Apache Ignite as a +#' datasource for neural network training, inference and all other +#' computations supported by TensorFlow. Ignite Dataset is based on Apache +#' Ignite Binary Client Protocol. +#' +#' @param cache_name Cache name to be used as datasource. +#' @param host Apache Ignite Thin Client host to be connected. +#' @param port Apache Ignite Thin Client port to be connected. +#' @param local Local flag that defines to query only local data. +#' @param part Number of partitions to be queried. +#' @param page_size Apache Ignite Thin Client page size. +#' @param username Apache Ignite Thin Client authentication username. +#' @param password Apache Ignite Thin Client authentication password. +#' @param certfile File in PEM format containing the certificate as well as any +#' number of CA certificates needed to establish the certificate's +#' authenticity. +#' @param keyfile File containing the private key (otherwise the private key +#' will be taken from certfile as well). +#' @param cert_password Password to be used if the private key is encrypted and +#' a password is necessary. +#' +#' @export +ignite_dataset <- function( + cache_name, + host = "localhost", + port = 10800, + local = FALSE, + part = -1, + page_size = 100, + username = NULL, + password = NULL, + certfile = NULL, + keyfile = NULL, + cert_password = NULL) { + dataset <- tfio_lib$ignite$IgniteDataset( + cache_name = cache_name, + host = host, + port = port, + local = local, + part = part, + page_size = page_size, + username = username, + password = password, + certfile = certfile, + keyfile = keyfile, + cert_password = cert_password + ) + as_tf_dataset(dataset) +} diff --git a/R-package/R/kafka_dataset.R b/R-package/R/kafka_dataset.R new file mode 100644 index 000000000..914faa121 --- /dev/null +++ b/R-package/R/kafka_dataset.R @@ -0,0 +1,27 @@ +#' Creates a Kafka dataset that consumes Kafka messages. +#' +#' @param topics A `tf.string` tensor containing one or more subscriptions, in +#' the format of `[topic:partition:offset:length]`, by default length is -1 +#' for unlimited. +#' @param servers A list of bootstrap servers. +#' @param group The consumer group id. +#' @param eof If True, the kafka reader will stop on EOF. +#' @param timeout The timeout value for the Kafka Consumer to wait (in +#' millisecond). 
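+#'
+#' @examples
+#' \dontrun{
+#' # A minimal sketch, not part of the original patch: it assumes a Kafka
+#' # broker reachable at localhost:9092 with a topic named "test"
+#' # (partition 0, starting at offset 0, length -1 for unlimited).
+#' dataset <- kafka_dataset(
+#'   topics = "test:0:0:-1",
+#'   servers = "localhost:9092"
+#' )
+#' }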
+#' +#' @export +kafka_dataset <- function( + topics, + servers = "localhost", + group = "", + eof = FALSE, + timeout = 1000) { + dataset <- tfio_lib$kafka$KafkaDataset( + topics = topics, + servers = servers, + group = group, + eof = eof, + timeout = timeout + ) + as_tf_dataset(dataset) +} From 091d4163fd89a5de610850bae81105ef5db833d3 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Wed, 26 Dec 2018 17:58:50 -0500 Subject: [PATCH 02/13] Added hadoop and kinesis dataset wrappers --- R-package/R/hadoop_dataset.R | 14 ++++++++++++++ R-package/R/ignite_dataset.R | 2 +- R-package/R/kafka_dataset.R | 2 +- R-package/R/kinesis_dataset.R | 28 ++++++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 R-package/R/hadoop_dataset.R create mode 100644 R-package/R/kinesis_dataset.R diff --git a/R-package/R/hadoop_dataset.R b/R-package/R/hadoop_dataset.R new file mode 100644 index 000000000..8386fbdbc --- /dev/null +++ b/R-package/R/hadoop_dataset.R @@ -0,0 +1,14 @@ +#' Create a `SequenceFileDataset`. +#' +#' This function allows a user to read data from a hadoop sequence +#' file. A sequence file consists of (key value) pairs sequentially. At +#' the moment, `org.apache.hadoop.io.Text` is the only serialization type +#' being supported, and there is no compression support. +#' +#' @param filenames A `tf.string` tensor containing one or more filenames. +#' +#' @export +sequence_file_dataset <- function(filenames) { + dataset <- tfio_lib$hadoop$SequenceFileDataset(filenames = filenames) + as_tf_dataset(dataset) +} diff --git a/R-package/R/ignite_dataset.R b/R-package/R/ignite_dataset.R index 5b7ffa3a5..bcf71630a 100644 --- a/R-package/R/ignite_dataset.R +++ b/R-package/R/ignite_dataset.R @@ -1,4 +1,4 @@ -#' Create a IgniteDataset. +#' Create a `IgniteDataset`. #' #' Apache Ignite is a memory-centric distributed database, caching, and #' processing platform for transactional, analytical, and streaming workloads, diff --git a/R-package/R/kafka_dataset.R b/R-package/R/kafka_dataset.R index 914faa121..98c30b8db 100644 --- a/R-package/R/kafka_dataset.R +++ b/R-package/R/kafka_dataset.R @@ -1,4 +1,4 @@ -#' Creates a Kafka dataset that consumes Kafka messages. +#' Creates a `KafkaDataset`. #' #' @param topics A `tf.string` tensor containing one or more subscriptions, in #' the format of `[topic:partition:offset:length]`, by default length is -1 diff --git a/R-package/R/kinesis_dataset.R b/R-package/R/kinesis_dataset.R new file mode 100644 index 000000000..1cf297ddf --- /dev/null +++ b/R-package/R/kinesis_dataset.R @@ -0,0 +1,28 @@ +#' Creates a `KinesisDataset`. +#' +#' Kinesis is a managed service provided by AWS for data streaming. +#' This dataset reads messages from Kinesis with each message presented +#' as a `tf.string`. +#' +#' @param stream A `tf.string` tensor containing the name of the stream. +#' @param shard A `tf.string` tensor containing the id of the shard. +#' @param read_indefinitely If `True`, the Kinesis dataset will keep retry again +#' on `EOF` after the `interval` period. If `False`, then the dataset will +#' stop on `EOF`. The default value is `True`. +#' @param interval The interval for the Kinesis Client to wait before it tries +#' to get records again (in millisecond). 
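+#'
+#' @examples
+#' \dontrun{
+#' # A minimal sketch, not part of the original patch: it assumes AWS
+#' # credentials are configured in the environment and that a Kinesis
+#' # stream with the (hypothetical) name "example-stream" exists.
+#' dataset <- kinesis_dataset(
+#'   stream = "example-stream",
+#'   shard = "shardId-000000000000",
+#'   read_indefinitely = FALSE
+#' )
+#' }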
+#' +#' @export +kinesis_dataset <- function( + stream, + shard = "", + read_indefinitely = TRUE, + interval = 100000) { + dataset <- tfio_lib$kinesis$KinesisDataset( + stream = stream, + shard = shard, + read_indefinitely = read_indefinitely, + interval = interval + ) + as_tf_dataset(dataset) +} From 69852d756d7372dea2e472f87052516dc717814c Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Wed, 26 Dec 2018 18:00:11 -0500 Subject: [PATCH 03/13] Explicitly cast some args to integers --- R-package/R/ignite_dataset.R | 6 +++--- R-package/R/kafka_dataset.R | 2 +- R-package/R/kinesis_dataset.R | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R-package/R/ignite_dataset.R b/R-package/R/ignite_dataset.R index bcf71630a..6ee08e10b 100644 --- a/R-package/R/ignite_dataset.R +++ b/R-package/R/ignite_dataset.R @@ -42,10 +42,10 @@ ignite_dataset <- function( dataset <- tfio_lib$ignite$IgniteDataset( cache_name = cache_name, host = host, - port = port, + port = as.integer(port), local = local, - part = part, - page_size = page_size, + part = as.integer(part), + page_size = as.integer(page_size), username = username, password = password, certfile = certfile, diff --git a/R-package/R/kafka_dataset.R b/R-package/R/kafka_dataset.R index 98c30b8db..09a4b734c 100644 --- a/R-package/R/kafka_dataset.R +++ b/R-package/R/kafka_dataset.R @@ -21,7 +21,7 @@ kafka_dataset <- function( servers = servers, group = group, eof = eof, - timeout = timeout + timeout = as.integer(timeout) ) as_tf_dataset(dataset) } diff --git a/R-package/R/kinesis_dataset.R b/R-package/R/kinesis_dataset.R index 1cf297ddf..86b71f5e4 100644 --- a/R-package/R/kinesis_dataset.R +++ b/R-package/R/kinesis_dataset.R @@ -22,7 +22,7 @@ kinesis_dataset <- function( stream = stream, shard = shard, read_indefinitely = read_indefinitely, - interval = interval + interval = as.integer(interval) ) as_tf_dataset(dataset) } From a84fd9df2079a296aa8525de69099eac7f0671b8 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Wed, 26 Dec 2018 18:15:06 -0500 Subject: [PATCH 04/13] Use forge package for type checking and casting --- R-package/DESCRIPTION | 5 ++- R-package/NAMESPACE | 7 ++++ R-package/R/dataset_utils.R | 6 ++-- R-package/R/ignite_dataset.R | 18 +++++----- R-package/R/kafka_dataset.R | 4 +-- R-package/R/kinesis_dataset.R | 4 +-- R-package/R/package.R | 23 +----------- R-package/R/reexports.R | 4 +++ R-package/man/ignite_dataset.Rd | 49 ++++++++++++++++++++++++++ R-package/man/kafka_dataset.Rd | 26 ++++++++++++++ R-package/man/kinesis_dataset.Rd | 26 ++++++++++++++ R-package/man/reexports.Rd | 3 +- R-package/man/sequence_file_dataset.Rd | 17 +++++++++ 13 files changed, 150 insertions(+), 42 deletions(-) create mode 100644 R-package/man/ignite_dataset.Rd create mode 100644 R-package/man/kafka_dataset.Rd create mode 100644 R-package/man/kinesis_dataset.Rd create mode 100644 R-package/man/sequence_file_dataset.Rd diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index d6a78cb3b..e80872668 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -24,6 +24,7 @@ Imports: reticulate (>= 1.10), tensorflow (>= 1.9), tfdatasets (>= 1.9), + forge, magrittr, rlang, tidyselect, @@ -32,7 +33,5 @@ Roxygen: list(markdown = TRUE) RoxygenNote: 6.1.0 Suggests: testthat, - knitr, - tfestimators, - keras + knitr VignetteBuilder: knitr diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index af3467069..6b42d2eb1 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -1,9 +1,15 @@ # Generated by roxygen2: do not edit by hand 
export("%>%") +export(ignite_dataset) export(install_tensorflow) +export(kafka_dataset) +export(kinesis_dataset) +export(sequence_file_dataset) export(tf) export(tf_config) +export(tf_version) +import(forge) import(rlang) import(tfdatasets) import(tidyselect) @@ -16,3 +22,4 @@ importFrom(reticulate,tuple) importFrom(tensorflow,install_tensorflow) importFrom(tensorflow,tf) importFrom(tensorflow,tf_config) +importFrom(tensorflow,tf_version) diff --git a/R-package/R/dataset_utils.R b/R-package/R/dataset_utils.R index c8418a83e..d39953b5c 100644 --- a/R-package/R/dataset_utils.R +++ b/R-package/R/dataset_utils.R @@ -1,13 +1,13 @@ as_tf_dataset <- function (dataset) { - if (!is_dataset(dataset)) + if (!is_dataset(dataset)) stop("Provided dataset is not a TensorFlow Dataset") - if (!inherits(dataset, "tf_dataset")) + if (!inherits(dataset, "tf_dataset")) class(dataset) <- c("tf_dataset", class(dataset)) dataset } is_dataset <- function (x) { - inherits(x, "tensorflow.python.data.ops.dataset_ops.Dataset") || is_tfio_dataset(X) + inherits(x, "tensorflow.python.data.ops.dataset_ops.Dataset") || is_tfio_dataset(x) } is_tfio_dataset <- function(x) { diff --git a/R-package/R/ignite_dataset.R b/R-package/R/ignite_dataset.R index 6ee08e10b..3288c6fe8 100644 --- a/R-package/R/ignite_dataset.R +++ b/R-package/R/ignite_dataset.R @@ -42,15 +42,15 @@ ignite_dataset <- function( dataset <- tfio_lib$ignite$IgniteDataset( cache_name = cache_name, host = host, - port = as.integer(port), - local = local, - part = as.integer(part), - page_size = as.integer(page_size), - username = username, - password = password, - certfile = certfile, - keyfile = keyfile, - cert_password = cert_password + port = cast_scalar_integer(port), + local = cast_logical(local), + part = cast_scalar_integer(part), + page_size = cast_scalar_integer(page_size), + username = cast_nullable_string(username), + password = cast_nullable_string(password), + certfile = cast_nullable_string(certfile), + keyfile = cast_nullable_string(keyfile), + cert_password = cast_nullable_string(cert_password) ) as_tf_dataset(dataset) } diff --git a/R-package/R/kafka_dataset.R b/R-package/R/kafka_dataset.R index 09a4b734c..895a3f767 100644 --- a/R-package/R/kafka_dataset.R +++ b/R-package/R/kafka_dataset.R @@ -20,8 +20,8 @@ kafka_dataset <- function( topics = topics, servers = servers, group = group, - eof = eof, - timeout = as.integer(timeout) + eof = cast_logical(eof), + timeout = cast_scalar_integer(timeout) ) as_tf_dataset(dataset) } diff --git a/R-package/R/kinesis_dataset.R b/R-package/R/kinesis_dataset.R index 86b71f5e4..967bf585c 100644 --- a/R-package/R/kinesis_dataset.R +++ b/R-package/R/kinesis_dataset.R @@ -21,8 +21,8 @@ kinesis_dataset <- function( dataset <- tfio_lib$kinesis$KinesisDataset( stream = stream, shard = shard, - read_indefinitely = read_indefinitely, - interval = as.integer(interval) + read_indefinitely = cast_logical(read_indefinitely), + interval = cast_scalar_integer(interval) ) as_tf_dataset(dataset) } diff --git a/R-package/R/package.R b/R-package/R/package.R index 8f93111f2..00c6a2857 100644 --- a/R-package/R/package.R +++ b/R-package/R/package.R @@ -12,6 +12,7 @@ NULL #' @import tidyselect #' @import rlang #' @import tfdatasets +#' @import forge NULL tfio_lib <- NULL @@ -63,25 +64,3 @@ check_tensorflow_version <- function(displayed_warning) { .onDetach <- function(libpath) { } - -# Reusable function for registering a set of methods with S3 manually. 
The -# methods argument is a list of character vectors, each of which has the form -# c(package, genname, class). -registerMethods <- function(methods) { - lapply(methods, function(method) { - pkg <- method[[1]] - generic <- method[[2]] - class <- method[[3]] - func <- get(paste(generic, class, sep = ".")) - if (pkg %in% loadedNamespaces()) { - registerS3method(generic, class, func, envir = asNamespace(pkg)) - } - setHook( - packageEvent(pkg, "onLoad"), - function(...) { - registerS3method(generic, class, func, envir = asNamespace(pkg)) - } - ) - }) -} - diff --git a/R-package/R/reexports.R b/R-package/R/reexports.R index acd6b6096..73a61a512 100644 --- a/R-package/R/reexports.R +++ b/R-package/R/reexports.R @@ -21,3 +21,7 @@ tensorflow::install_tensorflow #' @importFrom tensorflow tf_config #' @export tensorflow::tf_config + +#' @importFrom tensorflow tf_version +#' @export +tensorflow::tf_version diff --git a/R-package/man/ignite_dataset.Rd b/R-package/man/ignite_dataset.Rd new file mode 100644 index 000000000..5b6f4234b --- /dev/null +++ b/R-package/man/ignite_dataset.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ignite_dataset.R +\name{ignite_dataset} +\alias{ignite_dataset} +\title{Create a \code{IgniteDataset}.} +\usage{ +ignite_dataset(cache_name, host = "localhost", port = 10800, + local = FALSE, part = -1, page_size = 100, username = NULL, + password = NULL, certfile = NULL, keyfile = NULL, + cert_password = NULL) +} +\arguments{ +\item{cache_name}{Cache name to be used as datasource.} + +\item{host}{Apache Ignite Thin Client host to be connected.} + +\item{port}{Apache Ignite Thin Client port to be connected.} + +\item{local}{Local flag that defines to query only local data.} + +\item{part}{Number of partitions to be queried.} + +\item{page_size}{Apache Ignite Thin Client page size.} + +\item{username}{Apache Ignite Thin Client authentication username.} + +\item{password}{Apache Ignite Thin Client authentication password.} + +\item{certfile}{File in PEM format containing the certificate as well as any +number of CA certificates needed to establish the certificate's +authenticity.} + +\item{keyfile}{File containing the private key (otherwise the private key +will be taken from certfile as well).} + +\item{cert_password}{Password to be used if the private key is encrypted and +a password is necessary.} +} +\description{ +Apache Ignite is a memory-centric distributed database, caching, and +processing platform for transactional, analytical, and streaming workloads, +delivering in-memory speeds at petabyte scale. This contrib package +contains an integration between Apache Ignite and TensorFlow. The +integration is based on tf.data from TensorFlow side and Binary Client +Protocol from Apache Ignite side. It allows to use Apache Ignite as a +datasource for neural network training, inference and all other +computations supported by TensorFlow. Ignite Dataset is based on Apache +Ignite Binary Client Protocol. 
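+}
+\examples{
+\dontrun{
+# A minimal sketch, not part of the original patch: it assumes an Apache
+# Ignite node is listening on the default thin-client port 10800 and
+# exposes a cache with the (hypothetical) name "SQL_PUBLIC_TEST_CACHE".
+dataset <- ignite_dataset("SQL_PUBLIC_TEST_CACHE")
+}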
+} diff --git a/R-package/man/kafka_dataset.Rd b/R-package/man/kafka_dataset.Rd new file mode 100644 index 000000000..8ff10110f --- /dev/null +++ b/R-package/man/kafka_dataset.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/kafka_dataset.R +\name{kafka_dataset} +\alias{kafka_dataset} +\title{Creates a \code{KafkaDataset}.} +\usage{ +kafka_dataset(topics, servers = "localhost", group = "", eof = FALSE, + timeout = 1000) +} +\arguments{ +\item{topics}{A \code{tf.string} tensor containing one or more subscriptions, in +the format of \code{[topic:partition:offset:length]}, by default length is -1 +for unlimited.} + +\item{servers}{A list of bootstrap servers.} + +\item{group}{The consumer group id.} + +\item{eof}{If True, the kafka reader will stop on EOF.} + +\item{timeout}{The timeout value for the Kafka Consumer to wait (in +millisecond).} +} +\description{ +Creates a \code{KafkaDataset}. +} diff --git a/R-package/man/kinesis_dataset.Rd b/R-package/man/kinesis_dataset.Rd new file mode 100644 index 000000000..986e2c119 --- /dev/null +++ b/R-package/man/kinesis_dataset.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/kinesis_dataset.R +\name{kinesis_dataset} +\alias{kinesis_dataset} +\title{Creates a \code{KinesisDataset}.} +\usage{ +kinesis_dataset(stream, shard = "", read_indefinitely = TRUE, + interval = 1e+05) +} +\arguments{ +\item{stream}{A \code{tf.string} tensor containing the name of the stream.} + +\item{shard}{A \code{tf.string} tensor containing the id of the shard.} + +\item{read_indefinitely}{If \code{True}, the Kinesis dataset will keep retry again +on \code{EOF} after the \code{interval} period. If \code{False}, then the dataset will +stop on \code{EOF}. The default value is \code{True}.} + +\item{interval}{The interval for the Kinesis Client to wait before it tries +to get records again (in millisecond).} +} +\description{ +Kinesis is a managed service provided by AWS for data streaming. +This dataset reads messages from Kinesis with each message presented +as a \code{tf.string}. +} diff --git a/R-package/man/reexports.Rd b/R-package/man/reexports.Rd index 8a1dd55e5..fe66c6a57 100644 --- a/R-package/man/reexports.Rd +++ b/R-package/man/reexports.Rd @@ -6,6 +6,7 @@ \alias{tf} \alias{install_tensorflow} \alias{tf_config} +\alias{tf_version} \title{Objects exported from other packages} \keyword{internal} \description{ @@ -13,6 +14,6 @@ These objects are imported from other packages. Follow the links below to see their documentation. 
\describe{ - \item{tensorflow}{\code{\link[tensorflow]{tf}}, \code{\link[tensorflow]{install_tensorflow}}, \code{\link[tensorflow]{tf_config}}} + \item{tensorflow}{\code{\link[tensorflow]{tf}}, \code{\link[tensorflow]{install_tensorflow}}, \code{\link[tensorflow]{tf_config}}, \code{\link[tensorflow]{tf_version}}} }} diff --git a/R-package/man/sequence_file_dataset.Rd b/R-package/man/sequence_file_dataset.Rd new file mode 100644 index 000000000..5da68ad3b --- /dev/null +++ b/R-package/man/sequence_file_dataset.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hadoop_dataset.R +\name{sequence_file_dataset} +\alias{sequence_file_dataset} +\title{Create a \code{SequenceFileDataset}.} +\usage{ +sequence_file_dataset(filenames) +} +\arguments{ +\item{filenames}{A \code{tf.string} tensor containing one or more filenames.} +} +\description{ +This function allows a user to read data from a hadoop sequence +file. A sequence file consists of (key value) pairs sequentially. At +the moment, \code{org.apache.hadoop.io.Text} is the only serialization type +being supported, and there is no compression support. +} From f38691a7b6b94929860941ab574a0dbaefd8bc63 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Wed, 26 Dec 2018 18:16:17 -0500 Subject: [PATCH 05/13] Added R CMD check output files to .gitignore --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index b72dc7014..643f50436 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,8 @@ __pycache__ *.pbxproj *.xcworkspacedata .ipynb_checkpoints + +# Auto-generated files by `R CMD check` +tfio.Rcheck/ +tfio_*.tar.gz + From 0c544b19d4db610ae1ff1448e17a2006f90c3353 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Wed, 26 Dec 2018 18:18:21 -0500 Subject: [PATCH 06/13] Added .Rproj.user to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 643f50436..3c6151425 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,5 @@ __pycache__ # Auto-generated files by `R CMD check` tfio.Rcheck/ tfio_*.tar.gz +.Rproj.user From c9fe78f26b964e2b1184cc29bc4c6af29488d52f Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Wed, 26 Dec 2018 18:21:45 -0500 Subject: [PATCH 07/13] Added scripts folder to Rbuildignore to avoid R CMD check warning --- R-package/.Rbuildignore | 1 + 1 file changed, 1 insertion(+) diff --git a/R-package/.Rbuildignore b/R-package/.Rbuildignore index fc28d4b80..31becb436 100644 --- a/R-package/.Rbuildignore +++ b/R-package/.Rbuildignore @@ -1,3 +1,4 @@ ^.*\.Rproj$ ^\.Rproj\.user$ ^man-roxygen/ +scripts From 1cf6d1f80b7028c1e356a58a76bc04d4e20c6284 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Wed, 26 Dec 2018 18:42:27 -0500 Subject: [PATCH 08/13] Re-exports reusable methods from tfdatasets package --- R-package/NAMESPACE | 42 +++++++++++++++++ R-package/R/reexports.R | 94 ++++++++++++++++++++++++++++++++++++++ R-package/man/reexports.Rd | 23 ++++++++++ 3 files changed, 159 insertions(+) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 6b42d2eb1..256a822c8 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -1,14 +1,35 @@ # Generated by roxygen2: do not edit by hand export("%>%") +export(dataset_batch) +export(dataset_cache) +export(dataset_concatenate) +export(dataset_filter) +export(dataset_flat_map) +export(dataset_interleave) +export(dataset_map) +export(dataset_map_and_batch) +export(dataset_padded_batch) +export(dataset_prefetch) +export(dataset_prefetch_to_device) 
+export(dataset_prepare) +export(dataset_repeat) +export(dataset_shard) +export(dataset_shuffle) +export(dataset_shuffle_and_repeat) +export(dataset_skip) +export(dataset_take) export(ignite_dataset) export(install_tensorflow) export(kafka_dataset) export(kinesis_dataset) +export(next_batch) export(sequence_file_dataset) export(tf) export(tf_config) export(tf_version) +export(until_out_of_range) +export(with_dataset) import(forge) import(rlang) import(tfdatasets) @@ -23,3 +44,24 @@ importFrom(tensorflow,install_tensorflow) importFrom(tensorflow,tf) importFrom(tensorflow,tf_config) importFrom(tensorflow,tf_version) +importFrom(tfdatasets,dataset_batch) +importFrom(tfdatasets,dataset_cache) +importFrom(tfdatasets,dataset_concatenate) +importFrom(tfdatasets,dataset_filter) +importFrom(tfdatasets,dataset_flat_map) +importFrom(tfdatasets,dataset_interleave) +importFrom(tfdatasets,dataset_map) +importFrom(tfdatasets,dataset_map_and_batch) +importFrom(tfdatasets,dataset_padded_batch) +importFrom(tfdatasets,dataset_prefetch) +importFrom(tfdatasets,dataset_prefetch_to_device) +importFrom(tfdatasets,dataset_prepare) +importFrom(tfdatasets,dataset_repeat) +importFrom(tfdatasets,dataset_shard) +importFrom(tfdatasets,dataset_shuffle) +importFrom(tfdatasets,dataset_shuffle_and_repeat) +importFrom(tfdatasets,dataset_skip) +importFrom(tfdatasets,dataset_take) +importFrom(tfdatasets,next_batch) +importFrom(tfdatasets,until_out_of_range) +importFrom(tfdatasets,with_dataset) diff --git a/R-package/R/reexports.R b/R-package/R/reexports.R index 73a61a512..4e5c8ff6c 100644 --- a/R-package/R/reexports.R +++ b/R-package/R/reexports.R @@ -25,3 +25,97 @@ tensorflow::tf_config #' @importFrom tensorflow tf_version #' @export tensorflow::tf_version + + +# Re-exports from tfdatasets dataset_iterators + +#' @importFrom tfdatasets next_batch +#' @export +tfdatasets::next_batch + +#' @importFrom tfdatasets with_dataset +#' @export +tfdatasets::with_dataset + +#' @importFrom tfdatasets until_out_of_range +#' @export +tfdatasets::until_out_of_range + + +# Re-exports from tfdatasets dataset_methods + +#' @importFrom tfdatasets dataset_repeat +#' @export +tfdatasets::dataset_repeat + +#' @importFrom tfdatasets dataset_shuffle +#' @export +tfdatasets::dataset_shuffle + +#' @importFrom tfdatasets dataset_shuffle_and_repeat +#' @export +tfdatasets::dataset_shuffle_and_repeat + +#' @importFrom tfdatasets dataset_batch +#' @export +tfdatasets::dataset_batch + +#' @importFrom tfdatasets dataset_cache +#' @export +tfdatasets::dataset_cache + +#' @importFrom tfdatasets dataset_concatenate +#' @export +tfdatasets::dataset_concatenate + +#' @importFrom tfdatasets dataset_take +#' @export +tfdatasets::dataset_take + +#' @importFrom tfdatasets dataset_map +#' @export +tfdatasets::dataset_map + +#' @importFrom tfdatasets dataset_map_and_batch +#' @export +tfdatasets::dataset_map_and_batch + +#' @importFrom tfdatasets dataset_flat_map +#' @export +tfdatasets::dataset_flat_map + +#' @importFrom tfdatasets dataset_prefetch +#' @export +tfdatasets::dataset_prefetch + +#' @importFrom tfdatasets dataset_prefetch_to_device +#' @export +tfdatasets::dataset_prefetch_to_device + +#' @importFrom tfdatasets dataset_filter +#' @export +tfdatasets::dataset_filter + +#' @importFrom tfdatasets dataset_skip +#' @export +tfdatasets::dataset_skip + +#' @importFrom tfdatasets dataset_interleave +#' @export +tfdatasets::dataset_interleave + +#' @importFrom tfdatasets dataset_prefetch +#' @export +tfdatasets::dataset_prefetch + +#' @importFrom tfdatasets 
dataset_shard +#' @export +tfdatasets::dataset_shard + +#' @importFrom tfdatasets dataset_padded_batch +#' @export +tfdatasets::dataset_padded_batch + +#' @importFrom tfdatasets dataset_prepare +#' @export +tfdatasets::dataset_prepare diff --git a/R-package/man/reexports.Rd b/R-package/man/reexports.Rd index fe66c6a57..5f5f5725c 100644 --- a/R-package/man/reexports.Rd +++ b/R-package/man/reexports.Rd @@ -7,6 +7,27 @@ \alias{install_tensorflow} \alias{tf_config} \alias{tf_version} +\alias{next_batch} +\alias{with_dataset} +\alias{until_out_of_range} +\alias{dataset_repeat} +\alias{dataset_shuffle} +\alias{dataset_shuffle_and_repeat} +\alias{dataset_batch} +\alias{dataset_cache} +\alias{dataset_concatenate} +\alias{dataset_take} +\alias{dataset_map} +\alias{dataset_map_and_batch} +\alias{dataset_flat_map} +\alias{dataset_prefetch} +\alias{dataset_prefetch_to_device} +\alias{dataset_filter} +\alias{dataset_skip} +\alias{dataset_interleave} +\alias{dataset_shard} +\alias{dataset_padded_batch} +\alias{dataset_prepare} \title{Objects exported from other packages} \keyword{internal} \description{ @@ -15,5 +36,7 @@ below to see their documentation. \describe{ \item{tensorflow}{\code{\link[tensorflow]{tf}}, \code{\link[tensorflow]{install_tensorflow}}, \code{\link[tensorflow]{tf_config}}, \code{\link[tensorflow]{tf_version}}} + + \item{tfdatasets}{\code{\link[tfdatasets]{next_batch}}, \code{\link[tfdatasets]{with_dataset}}, \code{\link[tfdatasets]{until_out_of_range}}, \code{\link[tfdatasets]{dataset_repeat}}, \code{\link[tfdatasets]{dataset_shuffle}}, \code{\link[tfdatasets]{dataset_shuffle_and_repeat}}, \code{\link[tfdatasets]{dataset_batch}}, \code{\link[tfdatasets]{dataset_cache}}, \code{\link[tfdatasets]{dataset_concatenate}}, \code{\link[tfdatasets]{dataset_take}}, \code{\link[tfdatasets]{dataset_map}}, \code{\link[tfdatasets]{dataset_map_and_batch}}, \code{\link[tfdatasets]{dataset_flat_map}}, \code{\link[tfdatasets]{dataset_prefetch}}, \code{\link[tfdatasets]{dataset_prefetch_to_device}}, \code{\link[tfdatasets]{dataset_filter}}, \code{\link[tfdatasets]{dataset_skip}}, \code{\link[tfdatasets]{dataset_interleave}}, \code{\link[tfdatasets]{dataset_prefetch}}, \code{\link[tfdatasets]{dataset_shard}}, \code{\link[tfdatasets]{dataset_padded_batch}}, \code{\link[tfdatasets]{dataset_prepare}}} }} From 23bbdd096a62b3bc6eeeb469626d558b44a27733 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Wed, 26 Dec 2018 18:48:33 -0500 Subject: [PATCH 09/13] Fix for is_tfio_dataset --- R-package/R/dataset_utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R-package/R/dataset_utils.R b/R-package/R/dataset_utils.R index d39953b5c..37bbef49c 100644 --- a/R-package/R/dataset_utils.R +++ b/R-package/R/dataset_utils.R @@ -11,5 +11,5 @@ is_dataset <- function (x) { } is_tfio_dataset <- function(x) { - "tensorflow_io" %in% class(x) + grepl("tensorflow_io", class(x)) } From f3c744263a508be93caaab03484ab20cdbe40687 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Sun, 6 Jan 2019 19:22:13 -0500 Subject: [PATCH 10/13] Add Dockerfile for R env --- R-package/scripts/Dockerfile | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 R-package/scripts/Dockerfile diff --git a/R-package/scripts/Dockerfile b/R-package/scripts/Dockerfile new file mode 100644 index 000000000..f7f0385be --- /dev/null +++ b/R-package/scripts/Dockerfile @@ -0,0 +1,18 @@ +FROM r-base +COPY . . 
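+# The repository checkout copied above makes R-package/ available to the
+# R CMD build step at the bottom of this file.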
+ +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + python-dev \ + python-setuptools \ + python-pip && \ + rm -rf /var/lib/apt/lists/* + +# Dependencies +RUN pip install tensorflow-io +RUN Rscript -e 'install.packages(c("Rcpp", "reticulate", "knitr", "tensorflow", "tfdatasets", "forge", "tidyselect"))' + +# tfio package installation +RUN R CMD build R-package/ +RUN R CMD INSTALL tfio_*.gz From 460e0e5a7c75d4b208525c3031fe047bf842acc8 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Sun, 6 Jan 2019 19:23:03 -0500 Subject: [PATCH 11/13] Fix 'attempt to apply non-function' issue --- R-package/R/package.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R-package/R/package.R b/R-package/R/package.R index 00c6a2857..69583bde4 100644 --- a/R-package/R/package.R +++ b/R-package/R/package.R @@ -36,8 +36,7 @@ tfio_lib <- NULL } ) - # TODO: This is commented out for now until we add the wrappers. - # tfio_lib <<- import("tensorflow_io", delay_load = delay_load) + tfio_lib <<- import("tensorflow_io", delay_load = delay_load) } From 612e0b6806faa2bcd7c60c439619239da0336a2a Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Sun, 6 Jan 2019 19:39:44 -0500 Subject: [PATCH 12/13] Added simple testcase for sequence_file_dataset() --- R-package/tests/testthat/test-datasets-ops.R | 5 +++-- R-package/tests/testthat/testdata/string.seq | Bin 0 -> 603 bytes 2 files changed, 3 insertions(+), 2 deletions(-) create mode 100755 R-package/tests/testthat/testdata/string.seq diff --git a/R-package/tests/testthat/test-datasets-ops.R b/R-package/tests/testthat/test-datasets-ops.R index f21796778..c3bdb0891 100644 --- a/R-package/tests/testthat/test-datasets-ops.R +++ b/R-package/tests/testthat/test-datasets-ops.R @@ -2,6 +2,7 @@ context("TensorFlow IO dataset ops") source("utils.R") -test_succeeds("All TensorFlow IO dataset ops work", { - print("Placeholder for now") +test_succeeds("sequence_file_dataset() works successfully", { + sequence_file_dataset("testdata/string.seq") %>% + dataset_repeat(2) }) diff --git a/R-package/tests/testthat/testdata/string.seq b/R-package/tests/testthat/testdata/string.seq new file mode 100755 index 0000000000000000000000000000000000000000..b7175338af3417a8858d66082ab5a616f87cb234 GIT binary patch literal 603 zcmZ|KI|{-85QJet5gQL6mR2H!%;sHM1T1XCM;|m8s|2ygWjukkAl^gpE;a^na2gjD zmVc{nnJ&76^0;@hD%%#hQ)IicteivXtn&Kw|I_+RGJ8Gmqep$dU9PjqaNNXxvz@l< zqPLmO*J*P|G)9x?F?texMqgsU7)T5mLx~Y%Br#@;B_@o?FT(vsxDqLXh(HlU1d1Rc uPy`WyB8UhSK}4VkA_7Ga5h#L)j3N_}QDh=AicCaCk%`DCG7%X?7Q6wu3t-Ox literal 0 HcmV?d00001 From 8db682904214712fb7a7e4d9dd31c01e24f2153e Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Sun, 6 Jan 2019 20:04:38 -0500 Subject: [PATCH 13/13] Add instruction and example in README.md --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index 1cb073033..d3c07fc05 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,22 @@ $ # In docker $ R -e "devtools::test()" ``` +Alternatively, we also provided a reference Dockerfile [here](R-package/scripts/Dockerfile) for you +so that you can use the R package directly for testing. You can build it via: +``` +docker build -t tfio-r-dev -f R-package/scripts/Dockerfile . 
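+# for example, then start an interactive R session in the resulting image:
+docker run -it --rm tfio-r-dev R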
+``` + +Inside the container, you can start your R session, instantiate a `SequenceFileDataset` +from an example [Hadoop SequenceFile](https://wiki.apache.org/hadoop/SequenceFile) +[string.seq](R-package/tests/testthat/testdata/string.seq), and then use any [transformation functions](https://tensorflow.rstudio.com/tools/tfdatasets/articles/introduction.html#transformations) provided by [tfdatasets package](https://tensorflow.rstudio.com/tools/tfdatasets/) on the dataset like the following: + +```{R} +library(tfio) +dataset <- sequence_file_dataset("R-package/tests/testthat/testdata/string.seq") %>% + dataset_repeat(2) +``` + ## License [Apache License 2.0](LICENSE)