Commit 411673e

add avro tutorial
1 parent e2bb58f commit 411673e

File tree

3 files changed (+363, -0 lines)


docs/tutorials/avro.ipynb

Lines changed: 362 additions & 0 deletions
@@ -0,0 +1,362 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "Tce3stUlHN0L"
},
"source": [
"##### Copyright 2020 The TensorFlow IO Authors."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "tuOe1ymfHZPu"
},
"outputs": [],
"source": [
"#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qFdPvlXBOdUN"
},
"source": [
"# Avro Dataset API"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "MfBg1C5NB3X0"
},
"source": [
"<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
"  <td>\n",
"    <a target=\"_blank\" href=\"https://www.tensorflow.org/io/tutorials/avro\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
"  </td>\n",
"  <td>\n",
"    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/io/blob/master/docs/tutorials/avro.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
"  </td>\n",
"  <td>\n",
"    <a target=\"_blank\" href=\"https://github.com/tensorflow/io/blob/master/docs/tutorials/avro.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
"  </td>\n",
"  <td>\n",
"    <a href=\"https://storage.googleapis.com/tensorflow_docs/io/docs/tutorials/avro.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" />Download notebook</a>\n",
"  </td>\n",
"</table>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "xHxb-dlhMIzW"
},
"source": [
"## Overview\n",
"\n",
"The objective of the Avro Dataset API is to load Avro-formatted data natively into TensorFlow.\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "upgCc3gXybsA"
},
"source": [
"### Install required packages"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "uUDYyMZRfkX4"
},
"outputs": [],
"source": [
"!pip install tensorflow-io"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "J0ZKhA6s0Pjp"
},
"source": [
"## Usage"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IGnbXuVnSo8T"
},
"source": [
"Download a sample Avro file:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Tu01THzWcE-J"
},
"outputs": [],
"source": [
"!curl -OL https://github.com/tensorflow/io/raw/master/docs/tutorials/avro/mnist.avro\n",
"!ls -l mnist.avro"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IGnbXuVnSo8T"
},
"source": [
"Download the schema file corresponding to the sample Avro data:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Tu01THzWcE-J"
},
"outputs": [],
"source": [
"!curl -OL https://github.com/tensorflow/io/raw/master/docs/tutorials/avro/mnist.avsc\n",
"!ls -l mnist.avsc"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nS3eTBvjt-O5"
},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"import tensorflow_io as tfio\n",
"\n",
"features = {\n",
"    'features[*]': tfio.experimental.columnar.VarLenFeatureWithRank(dtype=tf.int32),\n",
"    'label': tf.io.FixedLenFeature(shape=[], dtype=tf.int32, default_value=-100),\n",
"    'dataType': tf.io.FixedLenFeature(shape=[], dtype=tf.string)\n",
"}\n",
"\n",
"schema = tf.io.gfile.GFile('mnist.avsc').read()\n",
"\n",
"dataset = tfio.experimental.columnar.make_avro_record_dataset(file_pattern=['mnist.avro'],\n",
"                                                              reader_schema=schema,\n",
"                                                              features=features,\n",
"                                                              shuffle=False,\n",
"                                                              batch_size=3,\n",
"                                                              num_epochs=1)\n",
"\n",
"for _ in dataset:\n",
"    print(_['features[*]'])\n",
"    print(_['label'])\n",
"    print(_['dataType'])\n",
"    print(\"--------------------\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "z9GCyPWNuOm7"
},
"source": [
"In the above example, the Avro file `mnist.avro` has 3 fields in each record: `features`, an array of ints; `label`, an int or null; and `dataType`, an enum. Here is what `mnist.avro` looks like:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nS3eTBvjt-O5"
},
"outputs": [],
"source": [
"record 1:\n",
"{\n",
"\"features\" : [0, 0, 0, 1, 4]\n",
"\"label\" : null\n",
"\"dataType\" : \"TRAINING\"\n",
"}\n",
"record 2:\n",
"{\n",
"\"features\" : [0, 0]\n",
"\"label\" : {\"int\" : 2}\n",
"\"dataType\" : \"TRAINING\"\n",
"}\n",
"record 3:\n",
"{\n",
"\"features\" : [0]\n",
"\"label\" : {\"int\" : 3}\n",
"\"dataType\" : \"VALIDATION\"\n",
"}\n",
"record 4:\n",
"{\n",
"\"features\" : [1]\n",
"\"label\" : {\"int\" : 4}\n",
"\"dataType\" : \"VALIDATION\"\n",
"}\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "z9GCyPWNuOm7"
},
"source": [
"The schema of `mnist.avro`, represented by `mnist.avsc`, looks like the following:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nS3eTBvjt-O5"
},
"outputs": [],
"source": [
"{\n",
"  \"type\" : \"record\",\n",
"  \"name\" : \"ImageDataset\",\n",
"  \"fields\" : [ {\n",
"    \"name\" : \"features\",\n",
"    \"type\" : {\n",
"      \"type\" : \"array\",\n",
"      \"items\" : \"int\"\n",
"    }\n",
"  }, {\n",
"    \"name\" : \"label\",\n",
"    \"type\" : [\"int\", \"null\"] // label can be int or null\n",
"  }, {\n",
"    \"name\" : \"dataType\",\n",
"    \"type\" : {\n",
"      \"type\" : \"enum\",\n",
"      \"name\" : \"dataTypes\",\n",
"      \"symbols\" : [\"TRAINING\", \"VALIDATION\"]\n",
"    }\n",
"  } ]\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IF_kYz_o2DH4"
},
"source": [
"The above example converts `mnist.avro` into a TensorFlow dataset. Each element of the dataset is a dictionary whose keys are the feature names and whose values are the converted sparse or dense tensors.\n",
"E.g., it converts the `features`, `label`, and `dataType` fields to a VarLenFeature (SparseTensor), a FixedLenFeature (DenseTensor), and a FixedLenFeature (DenseTensor) respectively. Since batch_size is 3, it coerces 3 records from `mnist.avro` into one element of the result dataset.\n",
"For the first record in `mnist.avro`, whose label is null, the Avro reader replaces it with the specified default value (-100).\n",
"In this example, there are 4 records in total in `mnist.avro`. Since the batch size is 3, the result dataset contains 2 elements, the last of which has a batch size of 1. Users can also drop the last batch if its size is smaller than the batch size by enabling `drop_final_batch`. E.g.:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "nS3eTBvjt-O5"
},
"outputs": [],
"source": [
"dataset = tfio.experimental.columnar.make_avro_record_dataset(file_pattern=['mnist.avro'],\n",
"                                                              reader_schema=schema,\n",
"                                                              features=features,\n",
"                                                              shuffle=False,\n",
"                                                              batch_size=3,\n",
"                                                              drop_final_batch=True,\n",
"                                                              num_epochs=1)\n",
"\n",
"for _ in dataset:\n",
"    print(_)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IF_kYz_o2DH4"
},
"source": [
"Please refer to the <a target=\"_blank\" href=\"https://www.tensorflow.org/io/api_docs/python/tfio/experimental/columnar/make_avro_record_dataset\">API doc</a> for the detailed usage of `make_avro_record_dataset`.\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IF_kYz_o2DH4"
},
"source": [
"The Avro dataset can parse and coerce any Avro data into TensorFlow tensors, including records in records, maps, arrays, branches, and enumerations. The parsing information is passed into the Avro dataset implementation as a map where:\n",
"\n",
"- keys encode how to parse the data,\n",
"- values encode how to coerce the data into TensorFlow tensors – deciding the primitive type (e.g. bool, int, long, float, double, string) as well as the tensor type (e.g. sparse or dense).\n",
"\n",
"A listing of TensorFlow's parser types (see Table 1) and the coercion of primitive types (Table 2) is provided.\n",
"\n",
"Table 1:\n",
"\n",
"TensorFlow Parser Types|TensorFlow Tensors|Explanation\n",
"----|----|------\n",
"tf.FixedLenFeature([], tf.int32)|dense tensor|Parse a fixed-length feature; that is, all rows have the same constant number of elements, e.g. just one element or an array that always has the same number of elements for each row\n",
"tf.SparseFeature(index_key=['key_1st_index', 'key_2nd_index'], value_key='key_value', dtype=tf.int64, size=[20, 50]) |sparse tensor|Parse a sparse feature where each row has a variable-length list of indices and values. The 'index_key' identifies the indices. The 'value_key' identifies the value. The 'dtype' is the data type. The 'size' is the expected maximum index value for each index entry\n",
"tfio.experimental.columnar.VarLenFeatureWithRank([],tf.int64) |sparse tensor|Parse a variable-length feature; that means each data row can have a variable number of elements, e.g. the 1st row has 5 elements, the 2nd row has 7 elements\n",
"\n",
"Table 2: the supported conversions from Avro types to TensorFlow types:\n",
"\n",
"Avro Primitive Type|TensorFlow Primitive Type\n",
"----|----\n",
"boolean: a binary value|tf.bool\n",
"bytes: a sequence of 8-bit unsigned bytes|tf.string\n",
"double: double precision 64-bit IEEE floating point number|tf.float64\n",
"enum: enumeration type|tf.string using the symbol name\n",
"float: single precision 32-bit IEEE floating point number|tf.float32\n",
"int: 32-bit signed integer|tf.int32\n",
"long: 64-bit signed integer|tf.int64\n",
"null: no value|uses default value\n",
"string: unicode character sequence|tf.string\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IF_kYz_o2DH4"
},
"source": [
"A comprehensive set of examples of the Avro dataset API is provided within <a target=\"_blank\" href=\"https://github.com/tensorflow/io/blob/master/tests/test_parse_avro_eager.py#L580r\">the tests</a>.\n"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [
"Tce3stUlHN0L"
],
"name": "avro.ipynb",
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
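The batching and null-handling behavior the notebook describes (4 records, batch size 3, null labels replaced by -100) can be sketched in plain Python without TensorFlow. `batch` below is a hypothetical helper, not part of `tensorflow-io`; it only mimics what `make_avro_record_dataset` does to the sample records:

```python
# Pure-Python illustration of the batching and null-default behavior
# described in the tutorial; this does not use TensorFlow itself.

# The four records from mnist.avro, with null labels as Python None.
records = [
    {"features": [0, 0, 0, 1, 4], "label": None, "dataType": "TRAINING"},
    {"features": [0, 0], "label": 2, "dataType": "TRAINING"},
    {"features": [0], "label": 3, "dataType": "VALIDATION"},
    {"features": [1], "label": 4, "dataType": "VALIDATION"},
]

DEFAULT_LABEL = -100  # the default_value passed to tf.io.FixedLenFeature


def batch(records, batch_size, drop_final_batch=False):
    """Group records into batches, replacing null labels with the default."""
    out = []
    for i in range(0, len(records), batch_size):
        chunk = records[i:i + batch_size]
        if drop_final_batch and len(chunk) < batch_size:
            break  # discard a trailing batch smaller than batch_size
        out.append({
            "features[*]": [r["features"] for r in chunk],
            "label": [r["label"] if r["label"] is not None else DEFAULT_LABEL
                      for r in chunk],
            "dataType": [r["dataType"] for r in chunk],
        })
    return out


batches = batch(records, batch_size=3)
print(len(batches))          # 2 batches: sizes 3 and 1
print(batches[0]["label"])   # [-100, 2, 3] -- null became -100
print(len(batch(records, 3, drop_final_batch=True)))  # 1
```

With `drop_final_batch=True` the trailing size-1 batch is discarded, matching the notebook's second `make_avro_record_dataset` example.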

docs/tutorials/avro/mnist.avro

369 Bytes
Binary file not shown.

docs/tutorials/avro/mnist.avsc

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
{"name": "ImageDataset", "type": "record", "fields": [{"name": "features", "type": {"type": "array", "items": "int"}}, {"name": "label", "type": ["int", "null"]}, {"name": "dataType", "type": {"type": "enum", "name": "dataTypes", "symbols": ["TRAINING", "VALIDATION"]}}]}
