Commit 82b33cf

Merge branch 'master' into spark-shell-docs-2
2 parents: 7ba025f + bab0584

61 files changed (+1073, -724 lines)


Makefile

Lines changed: 10 additions & 1 deletion
@@ -85,11 +85,20 @@ stop:
 	make stop-standalone
 	make stop-cluster
 
+restart:
+	make stop
+	make start
+
 test:
 	make start
 	# with --batch-mode maven doesn't print 'Progress: 125/150kB', the progress lines take up 90% of the log and causes
 	# Travis build to fail with 'The job exceeded the maximum log length, and has been terminated'
-	mvn --batch-mode -Dtest=${TEST} clean compile test
+	mvn clean test -B
+	make stop
+
+benchmark:
+	make start
+	mvn clean test -B -Pbenchmark
 	make stop
 
 deploy:

doc/configuration.md

Lines changed: 2 additions & 2 deletions
@@ -8,8 +8,8 @@ topology from the initial node, so there is no need to provide the rest of the c
 * `spark.redis.auth` - the initial node's AUTH password
 * `spark.redis.db` - optional DB number. Avoid using this, especially in cluster mode.
 * `spark.redis.timeout` - connection timeout in ms, 2000 ms by default
-* `spark.redis.max.pipeline.size` - the maximum number of commands per pipeline (used to batch commands). The default value is 10000.
-* `spark.redis.scan.count` - count option of SCAN command (used to iterate over keys). The default value is 10000.
+* `spark.redis.max.pipeline.size` - the maximum number of commands per pipeline (used to batch commands). The default value is 100.
+* `spark.redis.scan.count` - count option of SCAN command (used to iterate over keys). The default value is 100.
doc/dataframe.md

Lines changed: 98 additions & 32 deletions
@@ -28,12 +28,13 @@ object DataFrameExample {
   case class Person(name: String, age: Int)
 
   def main(args: Array[String]): Unit = {
-    val conf = new SparkConf().setAppName("redis-df")
-      .setMaster("local[*]")
-      .set("spark.redis.host", "localhost")
-      .set("spark.redis.port", "6379")
-
-    val spark = SparkSession.builder().config(conf).getOrCreate()
+    val spark = SparkSession
+      .builder()
+      .appName("redis-df")
+      .master("local[*]")
+      .config("spark.redis.host", "localhost")
+      .config("spark.redis.port", "6379")
+      .getOrCreate()
 
     val personSeq = Seq(Person("John", 30), Person("Peter", 45))
     val df = spark.createDataFrame(personSeq)
@@ -93,6 +94,25 @@ The keys in Redis:
 2) "person:Peter"
 ```
 
+The key column values are not persisted as fields of the Redis hashes:
+
+```bash
+127.0.0.1:6379> hgetall person:John
+1) "age"
+2) "30"
+```
+
+To load the keys back, you also need to specify
+the key column parameter when reading:
+
+```scala
+val df = spark.read
+  .format("org.apache.spark.sql.redis")
+  .option("table", "person")
+  .option("key.column", "name")
+  .load()
+```
+
 ### Save Modes
 
 Spark-redis supports all DataFrame [SaveMode](https://spark.apache.org/docs/latest/sql-programming-guide.html#save-modes)s: `Append`,
@@ -142,8 +162,8 @@ It also enables projection query optimization when only a small subset of column
 a limitation with Hash model - it doesn't support nested DataFrame schema. One option to overcome it is making your DataFrame schema flat.
 If it is not possible due to some constraints, you may consider using Binary persistence model.
 
-With the Binary persistence model the DataFrame row is serialized into a byte array and stored as a string in Redis. This implies that
-storage model is private to spark-redis library and data cannot be easily queried from non-Spark environments. Another drawback
+With the Binary persistence model the DataFrame row is serialized into a byte array and stored as a string in Redis (the default Java serialization is used).
+This implies that the storage model is private to the spark-redis library and data cannot be easily queried from non-Spark environments. Another drawback
 of Binary model is a larger memory footprint.
 
 To enable Binary model use `option("model", "binary")`, e.g.
@@ -171,17 +191,18 @@ There are two options how you can read a DataFrame:
 To read a previously saved DataFrame, specify the table name that was used for saving. Example:
 
 ```scala
-object DataFrameTests {
+object DataFrameExample {
 
   case class Person(name: String, age: Int)
 
   def main(args: Array[String]): Unit = {
-    val conf = new SparkConf().setAppName("redis-df")
-      .setMaster("local[*]")
-      .set("spark.redis.host", "localhost")
-      .set("spark.redis.port", "6379")
-
-    val spark = SparkSession.builder().config(conf).getOrCreate()
+    val spark = SparkSession
+      .builder()
+      .appName("redis-df")
+      .master("local[*]")
+      .config("spark.redis.host", "localhost")
+      .config("spark.redis.port", "6379")
+      .getOrCreate()
 
     val personSeq = Seq(Person("John", 30), Person("Peter", 45))
     val df = spark.createDataFrame(personSeq)
@@ -213,9 +234,11 @@ root
 +-----+---+
 | John| 30|
 |Peter| 45|
-+-----+---+
++-----+---+
 ```
 
+If the `key.column` option was used for writing, then it should also be used when reading the table back. See [Specifying Redis key](#specifying-redis-key) for details.
+
 To read with Spark SQL:
 
 ```scala
@@ -230,22 +253,63 @@ val loadedDf = spark.sql(s"SELECT * FROM person")
 
 To read Redis Hashes you have to provide a keys pattern with the `.option("keys.pattern", keysPattern)` option. The DataFrame schema should be explicitly specified or can be inferred from a random row.
 
-An example of explicit schema:
+```bash
+hset person:1 name John age 30
+hset person:2 name Peter age 45
+```
+
+An example of providing an explicit schema and specifying `key.column`:
 
 ```scala
-val df = spark.read
-  .format("org.apache.spark.sql.redis")
-  .schema(
-    StructType(Array(
-      StructField("name", StringType),
-      StructField("age", IntegerType))
-    )
-  )
-  .option("keys.pattern", "person:*")
-  .load()
+val df = spark.read
+  .format("org.apache.spark.sql.redis")
+  .schema(
+    StructType(Array(
+      StructField("id", IntegerType),
+      StructField("name", StringType),
+      StructField("age", IntegerType))
+    )
+  )
+  .option("keys.pattern", "person:*")
+  .option("key.column", "id")
+  .load()
+
+df.show()
 ```
 
-Another option is to let spark-redis automatically infer schema based on a random row. In this case all columns will have `String` type. Example:
+```bash
++---+-----+---+
+| id| name|age|
++---+-----+---+
+|  1| John| 30|
+|  2|Peter| 45|
++---+-----+---+
+```
+
+Spark-Redis tries to extract the key based on the key pattern:
+- if the pattern ends with `*` and it's the only wildcard, the trailing substring will be extracted
+- otherwise there is no extraction - the key is kept as is, e.g.
+
+```scala
+val df = // code omitted...
+  .option("keys.pattern", "p*:*")
+  .option("key.column", "id")
+  .load()
+df.show()
+```
+
+```bash
++-----+---+------------+
+| name|age|          id|
++-----+---+------------+
+| John| 30| person:John|
+|Peter| 45|person:Peter|
++-----+---+------------+
+```
+
+Another option is to let spark-redis automatically infer the schema based on a random row. In this case all columns will have `String` type.
+We also don't specify the `key.column` option in this example, so the column `_id` will be created.
+Example:
 
 ```scala
 val df = spark.read
@@ -262,21 +326,23 @@ The output is:
 root
  |-- name: string (nullable = true)
  |-- age: string (nullable = true)
+ |-- _id: string (nullable = true)
 ```
 
+
 ## DataFrame options
 
 | Name              | Description                                                                                 | Type                  | Default |
 | ----------------- | ------------------------------------------------------------------------------------------ | --------------------- | ------- |
 | model             | defines Redis model used to persist DataFrame, see [Persistence model](#persistence-model)  | `enum [binary, hash]` | `hash`  |
 | partitions.number | number of partitions (applies only when reading dataframe)                                  | `Int`                 | `3`     |
-| key.column        | specify unique column used as a Redis key, by default a key is auto-generated               | `String`              | -       |
+| key.column        | when writing - specifies the unique column used as a Redis key (by default a key is auto-generated). <br/> When reading - specifies the column name to store the hash key in | `String` | - |
 | ttl               | data time to live in `seconds`. Data doesn't expire if `ttl` is less than `1`               | `Int`                 | `0`     |
 | infer.schema      | infer schema from random row, all columns will have `String` type                           | `Boolean`             | `false` |
-| max.pipeline.size | maximum number of commands per pipeline (used to batch commands)                            | `Int`                 | 10000   |
-| scan.count        | count option of SCAN command (used to iterate over keys)                                    | `Int`                 | 10000   |
+| max.pipeline.size | maximum number of commands per pipeline (used to batch commands)                            | `Int`                 | 100     |
+| scan.count        | count option of SCAN command (used to iterate over keys)                                    | `Int`                 | 100     |
 
 
 ## Known limitations
 
-- Nested DataFrame fields are not currently supported with Hash model. Consider making DataFrame schema flat or using Binary persistence model.
+- Nested DataFrame fields are not currently supported with Hash model. Consider making DataFrame schema flat or using Binary persistence model.
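To round out the read examples above, the write side follows the same `key.column` contract; a minimal sketch assuming the `Person` case class from the docs and a local Redis:

```scala
import org.apache.spark.sql.{SaveMode, SparkSession}

case class Person(name: String, age: Int)

val spark = SparkSession
  .builder()
  .appName("redis-df")
  .master("local[*]")
  .config("spark.redis.host", "localhost")
  .config("spark.redis.port", "6379")
  .getOrCreate()

// "name" becomes part of the Redis key (person:John, person:Peter) and is
// not stored as a hash field, so key.column must be passed on read as well.
val df = spark.createDataFrame(Seq(Person("John", 30), Person("Peter", 45)))
df.write
  .format("org.apache.spark.sql.redis")
  .option("table", "person")
  .option("key.column", "name")
  .mode(SaveMode.Overwrite)
  .save()
```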

doc/java.md

Lines changed: 5 additions & 5 deletions
@@ -81,11 +81,11 @@ Dataset<Row> df = spark.createDataFrame(Arrays.asList(
         new Person("Peter", 40)), Person.class);
 
 df.write()
-    .format("org.apache.spark.sql.redis")
-    .option("table", "person")
-    .option("key.column", "name")
-    .mode(SaveMode.Overwrite)
-    .save();
+        .format("org.apache.spark.sql.redis")
+        .option("table", "person")
+        .option("key.column", "name")
+        .mode(SaveMode.Overwrite)
+        .save();
 ```
 
 ## Streaming

pom.xml

Lines changed: 18 additions & 1 deletion
@@ -51,6 +51,7 @@
         <scala.complete.version>${scala.major.version}.12</scala.complete.version>
         <jedis.version>2.9.0</jedis.version>
         <spark.version>2.3.1</spark.version>
+        <plugins.scalatest.version>1.0</plugins.scalatest.version>
     </properties>
 
     <distributionManagement>
@@ -220,12 +221,13 @@
             <plugin>
                 <groupId>org.scalatest</groupId>
                 <artifactId>scalatest-maven-plugin</artifactId>
-                <version>1.0</version>
+                <version>${plugins.scalatest.version}</version>
                 <configuration>
                     <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
                     <junitxml>.</junitxml>
                     <filereports>WDF TestSuite.txt</filereports>
                     <argLine>-XX:MaxPermSize=256m -Xmx2g</argLine>
+                    <tagsToExclude>com.redislabs.provider.redis.util.BenchmarkTest</tagsToExclude>
                 </configuration>
                 <executions>
                     <execution>
@@ -303,6 +305,21 @@
         </dependency>
     </dependencies>
     <profiles>
+        <profile>
+            <id>benchmark</id>
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.scalatest</groupId>
+                        <artifactId>scalatest-maven-plugin</artifactId>
+                        <version>${plugins.scalatest.version}</version>
+                        <configuration combine.self="override">
+                            <tagsToInclude>com.redislabs.provider.redis.util.BenchmarkTest</tagsToInclude>
+                        </configuration>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
         <profile>
             <id>release</id>
             <build>
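ScalaTest matches `tagsToInclude`/`tagsToExclude` against tag names, so the profile above implies a tag object named `com.redislabs.provider.redis.util.BenchmarkTest`; a sketch of how such a tag is typically defined and applied (the suite and test names here are illustrative, not from the commit):

```scala
package com.redislabs.provider.redis.util

import org.scalatest.{FunSuite, Tag}

// The tag name must match the value used in <tagsToInclude>/<tagsToExclude>.
object BenchmarkTest extends Tag("com.redislabs.provider.redis.util.BenchmarkTest")

class WriteBenchmarkSuite extends FunSuite {

  // Runs under `make benchmark` (-Pbenchmark) and is skipped by `make test`.
  test("bulk write throughput", BenchmarkTest) {
    // ... benchmark body ...
  }
}
```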

src/main/scala/com/redislabs/provider/redis/RedisConfig.scala

Lines changed: 2 additions & 2 deletions
@@ -86,11 +86,11 @@ case class ReadWriteConfig(scanCount: Int, maxPipelineSize: Int)
 object ReadWriteConfig {
   /** maximum number of commands per pipeline **/
   val MaxPipelineSizeConfKey = "spark.redis.max.pipeline.size"
-  val MaxPipelineSizeDefault = 10000
+  val MaxPipelineSizeDefault = 100
 
   /** count option of SCAN command **/
   val ScanCountConfKey = "spark.redis.scan.count"
-  val ScanCountDefault = 10000
+  val ScanCountDefault = 100
 
   val Default: ReadWriteConfig = ReadWriteConfig(ScanCountDefault, MaxPipelineSizeDefault)
src/main/scala/com/redislabs/provider/redis/rdd/RedisRDD.scala

Lines changed: 5 additions & 0 deletions
@@ -475,3 +475,8 @@ trait Keys {
     keys.zip(types).filter(x => x._2 == t).map(x => x._1)
   }
 }
+
+/**
+  * Key utilities to avoid serialization issues.
+  */
+object Keys extends Keys
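The companion-style object matters for Spark closures: calling a trait method requires an instance, and capturing that instance can drag a non-serializable enclosing class into the closure, whereas a top-level `object` is resolved on the executor without being shipped. A generic illustration of the idiom (names hypothetical):

```scala
trait KeyUtils {
  // Returns the part of the key before the first ':' separator.
  def keyPrefix(key: String): String = key.takeWhile(_ != ':')
}

// Serialization-safe call site: a closure referencing KeyUtils.keyPrefix
// captures no enclosing instance.
object KeyUtils extends KeyUtils

// e.g. rdd.map(KeyUtils.keyPrefix) instead of mixing KeyUtils into the RDD class.
```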
src/main/scala/com/redislabs/provider/redis/util/ConnectionUtils.scala

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+package com.redislabs.provider.redis.util
+
+import redis.clients.jedis.Jedis
+
+/**
+  * @author The Viet Nguyen
+  */
+object ConnectionUtils {
+
+  def withConnection[A](conn: Jedis)(body: Jedis => A): A = {
+    val res = body(conn)
+    conn.close()
+    res
+  }
+}
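`withConnection` is a small loan pattern: the caller borrows the `Jedis` handle and the helper closes it after the body returns (note there is no try/finally, so the connection leaks if the body throws). A usage sketch against the `person:John` hash from the DataFrame docs, assuming a local Redis:

```scala
import com.redislabs.provider.redis.util.ConnectionUtils.withConnection
import redis.clients.jedis.Jedis

// Borrow a connection, read one hash field, and let the helper close it.
val age: String = withConnection(new Jedis("localhost", 6379)) { jedis =>
  jedis.hget("person:John", "age")
}
```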

src/main/scala/org/apache/spark/sql/redis/BinaryRedisPersistence.scala

Lines changed: 3 additions & 2 deletions
@@ -25,13 +25,14 @@ class BinaryRedisPersistence extends RedisPersistence[Array[Byte]] {
   override def load(pipeline: Pipeline, key: String, requiredColumns: Seq[String]): Unit =
     pipeline.get(key.getBytes(UTF_8))
 
-  override def encodeRow(value: Row): Array[Byte] = {
+  override def encodeRow(keyName: String, value: Row): Array[Byte] = {
     val fields = value.schema.fields.map(_.name)
     val valuesArray = fields.map(f => value.getAs[Any](f))
     SerializationUtils.serialize(valuesArray)
   }
 
-  override def decodeRow(value: Array[Byte], schema: => StructType, inferSchema: Boolean): Row = {
+  override def decodeRow(keyMap: (String, String), value: Array[Byte], schema: StructType,
+                         requiredColumns: Seq[String]): Row = {
     val valuesArray: Array[Any] = SerializationUtils.deserialize(value)
     new GenericRowWithSchema(valuesArray, schema)
   }
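This encode/decode pair backs the Binary persistence model described in doc/dataframe.md: the whole row is serialized with Java serialization rather than mapped to hash fields. It is selected per write; a sketch assuming a `df` as in the earlier examples:

```scala
// Store rows as serialized byte arrays instead of Redis hashes.
df.write
  .format("org.apache.spark.sql.redis")
  .option("table", "person")
  .option("model", "binary")
  .save()
```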
