From 514a4110c54e3e52ba7b6a798a94562fc6c10bbe Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Tue, 25 May 2021 22:49:16 +0000 Subject: [PATCH 1/3] Initial docs pass --- dev/export_images.sh | 19 +- dev/python_version_test.sh | 47 -- docs/clusters/advanced/self-hosted-images.md | 4 +- docs/clusters/instances/multi.md | 26 + docs/clusters/instances/spot.md | 2 + docs/clusters/management/create.md | 1 - docs/clusters/management/update.md | 2 +- docs/clusters/networking/custom-domain.md | 10 +- docs/clusters/networking/https.md | 8 +- docs/clusters/observability/logging.md | 12 - docs/clusters/observability/metrics.md | 24 +- docs/start.md | 2 +- docs/summary.md | 28 +- docs/workloads/async/async-apis.md | 6 +- docs/workloads/async/autoscaling.md | 89 +-- docs/workloads/async/configuration.md | 162 ++--- docs/workloads/async/handler.md | 240 -------- docs/workloads/async/models.md | 203 ------- docs/workloads/async/webhooks.md | 48 +- docs/workloads/batch/batch-apis.md | 3 + docs/workloads/batch/configuration.md | 119 ++-- docs/workloads/batch/handler.md | 125 ---- docs/workloads/batch/jobs.md | 52 +- docs/workloads/batch/models.md | 210 ------- docs/workloads/dependencies/example.md | 61 -- docs/workloads/dependencies/images.md | 106 ---- .../workloads/dependencies/python-packages.md | 140 ----- .../workloads/dependencies/system-packages.md | 62 -- docs/workloads/realtime/autoscaling.md | 38 +- docs/workloads/realtime/configuration.md | 186 ++---- docs/workloads/realtime/handler.md | 563 ------------------ docs/workloads/realtime/models.md | 439 -------------- .../workloads/realtime/multi-model/caching.md | 14 - .../realtime/multi-model/configuration.md | 118 ---- .../workloads/realtime/multi-model/example.md | 43 -- docs/workloads/realtime/parallelism.md | 9 - docs/workloads/realtime/realtime-apis.md | 3 + .../realtime/server-side-batching.md | 83 --- docs/workloads/realtime/statuses.md | 2 +- docs/workloads/realtime/traffic-splitter.md | 65 ++ .../traffic-splitter/configuration.md | 12 - .../realtime/traffic-splitter/example.md | 66 -- docs/workloads/realtime/troubleshooting.md | 21 +- docs/workloads/task/configuration.md | 54 +- docs/workloads/task/definitions.md | 91 --- docs/workloads/task/jobs.md | 6 +- docs/workloads/task/task-apis.md | 3 + pkg/consts/consts.go | 4 +- pkg/types/spec/validations.go | 3 +- 49 files changed, 449 insertions(+), 3185 deletions(-) delete mode 100755 dev/python_version_test.sh delete mode 100644 docs/workloads/async/handler.md delete mode 100644 docs/workloads/async/models.md create mode 100644 docs/workloads/batch/batch-apis.md delete mode 100644 docs/workloads/batch/handler.md delete mode 100644 docs/workloads/batch/models.md delete mode 100644 docs/workloads/dependencies/example.md delete mode 100644 docs/workloads/dependencies/images.md delete mode 100644 docs/workloads/dependencies/python-packages.md delete mode 100644 docs/workloads/dependencies/system-packages.md delete mode 100644 docs/workloads/realtime/handler.md delete mode 100644 docs/workloads/realtime/models.md delete mode 100644 docs/workloads/realtime/multi-model/caching.md delete mode 100644 docs/workloads/realtime/multi-model/configuration.md delete mode 100644 docs/workloads/realtime/multi-model/example.md delete mode 100644 docs/workloads/realtime/parallelism.md create mode 100644 docs/workloads/realtime/realtime-apis.md delete mode 100644 docs/workloads/realtime/server-side-batching.md create mode 100644 docs/workloads/realtime/traffic-splitter.md delete mode 100644 
docs/workloads/realtime/traffic-splitter/configuration.md delete mode 100644 docs/workloads/realtime/traffic-splitter/example.md delete mode 100644 docs/workloads/task/definitions.md create mode 100644 docs/workloads/task/task-apis.md diff --git a/dev/export_images.sh b/dev/export_images.sh index 8db4acc676..0d040e777e 100755 --- a/dev/export_images.sh +++ b/dev/export_images.sh @@ -40,23 +40,10 @@ for image in "${all_images[@]}"; do done echo -cuda=("10.0" "10.1" "10.1" "10.2" "10.2" "11.0" "11.1") -cudnn=("7" "7" "8" "7" "8" "8" "8") - # pull the images from source registry and push them to ECR for image in "${all_images[@]}"; do - # copy the different cuda/cudnn variations of the python handler image - if [ "$image" = "python-handler-gpu" ]; then - for i in "${!cuda[@]}"; do - full_image="$image:$cortex_version-cuda${cuda[$i]}-cudnn${cudnn[$i]}" - echo "copying $full_image from $source_registry to $destination_registry" - skopeo copy --src-no-creds "docker://$source_registry/$full_image" "docker://$destination_registry/$full_image" - echo - done - else - echo "copying $image:$cortex_version from $source_registry to $destination_registry" - skopeo copy --src-no-creds "docker://$source_registry/$image:$cortex_version" "docker://$destination_registry/$image:$cortex_version" - echo - fi + echo "copying $image:$cortex_version from $source_registry to $destination_registry" + skopeo copy --src-no-creds "docker://$source_registry/$image:$cortex_version" "docker://$destination_registry/$image:$cortex_version" + echo done echo "done ✓" diff --git a/dev/python_version_test.sh b/dev/python_version_test.sh deleted file mode 100755 index ae94f689de..0000000000 --- a/dev/python_version_test.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# Copyright 2021 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# USAGE: ./dev/python_version_test.sh -# e.g.: ./dev/python_version_test.sh 3.6.9 aws - -set -e - -ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. >/dev/null && pwd)" - -# create a new conda environment based on the supplied python version -conda create -n env -y -CONDA_BASE=$(conda info --base) -source $CONDA_BASE/etc/profile.d/conda.sh -conda activate env -conda config --append channels conda-forge -conda install python=$1 -y - -pip install requests - -export CORTEX_CLI_PATH=$ROOT/bin/cortex - -# install cortex -cd $ROOT/python/client -pip install -e . - -# run script.py -python $ROOT/dev/deploy_test.py $2 - -# clean up conda -conda deactivate -conda env remove -n env -rm -rf $ROOT/python/client/cortex.egg-info diff --git a/docs/clusters/advanced/self-hosted-images.md b/docs/clusters/advanced/self-hosted-images.md index 74b581c840..b88f5e55b5 100644 --- a/docs/clusters/advanced/self-hosted-images.md +++ b/docs/clusters/advanced/self-hosted-images.md @@ -1,6 +1,6 @@ # Self-hosted Docker images -Self-hosted Docker images can be useful for reducing the ingress costs, for accelerating image pulls, or for eliminating the dependency on Cortex's public container registry. 
+Self-hosting the Cortex cluster's internal Docker images can be useful for reducing the ingress costs, for accelerating image pulls, or for eliminating the dependency on Cortex's public container registry. In this guide, we'll use [ECR](https://aws.amazon.com/ecr/) as the destination container registry. When an ECR repository resides in the same region as your Cortex cluster, there are no costs incurred when pulling images. @@ -33,7 +33,7 @@ Feel free to modify the script if you would like to export the images to a diffe ./cortex/dev/export_images.sh ``` -You can now configure Cortex to use your images when creating a cluster (see [here](../management/create.md) for how to specify cluster images) and/or when deploying APIs (see the configuration docs corresponding to your API type for how to specify API images). +You can now configure Cortex to use your images when creating a cluster (see [here](../management/create.md) for instructions). ## Cleanup diff --git a/docs/clusters/instances/multi.md b/docs/clusters/instances/multi.md index b6fd6f7bb1..e34194ed88 100644 --- a/docs/clusters/instances/multi.md +++ b/docs/clusters/instances/multi.md @@ -20,11 +20,15 @@ Cortex can be configured to provision different instance types to improve worklo node_groups: - name: cpu-spot instance_type: m5.large + min_instances: 0 + max_instances: 5 spot: true spot_config: instance_distribution: [m5a.large, m5d.large, m5n.large, m5ad.large, m5dn.large, m4.large, t3.large, t3a.large, t2.large] - name: cpu-on-demand instance_type: m5.large + min_instances: 0 + max_instances: 5 ``` ### On-demand cluster supporting CPU, GPU, and Inferentia @@ -35,10 +39,16 @@ node_groups: node_groups: - name: cpu instance_type: m5.large + min_instances: 0 + max_instances: 5 - name: gpu instance_type: g4dn.xlarge + min_instances: 0 + max_instances: 5 - name: inf instance_type: inf.xlarge + min_instances: 0 + max_instances: 5 ``` ### Spot cluster supporting CPU and GPU (with on-demand backup) @@ -49,16 +59,24 @@ node_groups: node_groups: - name: cpu-spot instance_type: m5.large + min_instances: 0 + max_instances: 5 spot: true spot_config: instance_distribution: [m5a.large, m5d.large, m5n.large, m5ad.large, m5dn.large, m4.large, t3.large, t3a.large, t2.large] - name: cpu-on-demand instance_type: m5.large + min_instances: 0 + max_instances: 5 - name: gpu-spot instance_type: g4dn.xlarge + min_instances: 0 + max_instances: 5 spot: true - name: gpu-on-demand instance_type: g4dn.xlarge + min_instances: 0 + max_instances: 5 ``` ### CPU spot cluster with multiple instance types and on-demand backup @@ -69,13 +87,21 @@ node_groups: node_groups: - name: cpu-1 instance_type: t3.medium + min_instances: 0 + max_instances: 5 spot: true - name: cpu-2 instance_type: m5.2xlarge + min_instances: 0 + max_instances: 5 spot: true - name: cpu-3 instance_type: m5.8xlarge + min_instances: 0 + max_instances: 5 spot: true - name: cpu-4 instance_type: m5.24xlarge + min_instances: 0 + max_instances: 5 ``` diff --git a/docs/clusters/instances/spot.md b/docs/clusters/instances/spot.md index 4fc6b1d9f6..1a4c22f554 100644 --- a/docs/clusters/instances/spot.md +++ b/docs/clusters/instances/spot.md @@ -43,6 +43,8 @@ There is a spot instance limit associated with your AWS account for each instanc node_groups: - name: cpu-spot instance_type: m5.large + min_instances: 0 + max_instances: 5 spot: true spot_config: instance_distribution: [m5a.large, m5d.large, m5n.large, m5ad.large, m5dn.large, m4.large, t3.large, t3a.large, t2.large] diff --git 
a/docs/clusters/management/create.md b/docs/clusters/management/create.md index c2f75e3bad..5fe3a5d51e 100644 --- a/docs/clusters/management/create.md +++ b/docs/clusters/management/create.md @@ -104,7 +104,6 @@ image_async_gateway: quay.io/cortexlabs/async-gateway:master image_cluster_autoscaler: quay.io/cortexlabs/cluster-autoscaler:master image_metrics_server: quay.io/cortexlabs/metrics-server:master image_inferentia: quay.io/cortexlabs/inferentia:master -image_neuron_rtd: quay.io/cortexlabs/neuron-rtd:master image_nvidia: quay.io/cortexlabs/nvidia:master image_fluent_bit: quay.io/cortexlabs/fluent-bit:master image_istio_proxy: quay.io/cortexlabs/istio-proxy:master diff --git a/docs/clusters/management/update.md b/docs/clusters/management/update.md index f881ae3ff5..6e144602c3 100644 --- a/docs/clusters/management/update.md +++ b/docs/clusters/management/update.md @@ -27,7 +27,7 @@ cortex cluster up cluster.yaml In production environments, you can upgrade your cluster without downtime if you have a backend service or DNS in front of your Cortex cluster: 1. Spin up a new cluster. For example: `cortex cluster up new-cluster.yaml --configure-env cortex2` (this will create a CLI environment named `cortex2` for accessing the new cluster). -1. Re-deploy your APIs in your new cluster. For example, if the name of your CLI environment for your existing cluster is `cortex`, you can use `cortex get --env cortex` to list all running APIs in your cluster, and re-deploy them in the new cluster by changing directories to each API's project folder and running `cortex deploy --env cortex2`. Alternatively, you can run `cortex cluster export --name --region ` to export all of your API specifications, change directories the folder that was exported, and run `cortex deploy --env cortex2 ` for each API that you want to deploy in the new cluster. +1. Re-deploy your APIs in your new cluster. For example, if the name of your CLI environment for your existing cluster is `cortex`, you can use `cortex get --env cortex` to list all running APIs in your cluster, and re-deploy them in the new cluster by running `cortex deploy --env cortex2` for each API. Alternatively, you can run `cortex cluster export --name --region ` to export the API specifications for all of your running APIs, change directories to the folder that was exported, and run `cortex deploy --env cortex2 ` for each API that you want to deploy in the new cluster. 1. Route requests to your new cluster. * If you are using a custom domain: update the A record in your Route 53 hosted zone to point to your new cluster's API load balancer. * If you have a backend service which makes requests to Cortex: update your backend service to make requests to the new cluster's endpoints. diff --git a/docs/clusters/networking/custom-domain.md b/docs/clusters/networking/custom-domain.md index 00d0d2f9d1..f0eb8162c7 100644 --- a/docs/clusters/networking/custom-domain.md +++ b/docs/clusters/networking/custom-domain.md @@ -115,13 +115,9 @@ You could run into connectivity issues if you make a request to your API without To test connectivity, try the following steps: -1. Deploy any api (e.g. examples/pytorch/iris-classifier). -1. Make a GET request to the your api (e.g. `curl https://api.cortexlabs.dev/iris-classifier` or paste the url into your browser). -1. If you run into an error such as `curl: (6) Could not resolve host: api.cortexlabs.dev` wait a few minutes and make the GET request from another device that hasn't made a request to that url in a while.
A successful request looks like this: - -```text -{"message":"make a request by sending a POST to this endpoint with a json payload",...} -``` +1. Deploy an api. +1. Make a request to your api (e.g. `curl https://api.cortexlabs.dev/my-api` or paste the url into your browser if your API supports GET requests). +1. If you run into an error such as `curl: (6) Could not resolve host: api.cortexlabs.dev`, wait a few minutes and make the request from another device that hasn't made a request to that url in a while. ## Cleanup diff --git a/docs/clusters/networking/https.md b/docs/clusters/networking/https.md index 77df0ac276..38a25da691 100644 --- a/docs/clusters/networking/https.md +++ b/docs/clusters/networking/https.md @@ -56,13 +56,13 @@ Copy your "Invoke URL" You may now use the "Invoke URL" in place of your API load balancer endpoint in your client. For example, this curl request: ```bash -curl http://a9eaf69fd125947abb1065f62de59047-81cdebc0275f7d96.elb.us-west-2.amazonaws.com/iris-classifier -X POST -H "Content-Type: application/json" -d @sample.json +curl http://a9eaf69fd125947abb1065f62de59047-81cdebc0275f7d96.elb.us-west-2.amazonaws.com/my-api -X POST -H "Content-Type: application/json" -d @sample.json ``` Would become: ```bash -curl https://31qjv48rs6.execute-api.us-west-2.amazonaws.com/dev/iris-classifier -X POST -H "Content-Type: application/json" -d @sample.json +curl https://31qjv48rs6.execute-api.us-west-2.amazonaws.com/dev/my-api -X POST -H "Content-Type: application/json" -d @sample.json ``` ### Cleanup @@ -134,13 +134,13 @@ Copy your "Invoke URL" You may now use the "Invoke URL" in place of your API load balancer endpoint in your client. For example, this curl request: ```bash -curl http://a5044e34a352d44b0945adcd455c7fa3-32fa161d3e5bcbf9.elb.us-west-2.amazonaws.com/iris-classifier -X POST -H "Content-Type: application/json" -d @sample.json +curl http://a5044e34a352d44b0945adcd455c7fa3-32fa161d3e5bcbf9.elb.us-west-2.amazonaws.com/my-api -X POST -H "Content-Type: application/json" -d @sample.json ``` Would become: ```bash -curl https://lrivodooqh.execute-api.us-west-2.amazonaws.com/dev/iris-classifier -X POST -H "Content-Type: application/json" -d @sample.json +curl https://lrivodooqh.execute-api.us-west-2.amazonaws.com/dev/my-api -X POST -H "Content-Type: application/json" -d @sample.json ``` ### Cleanup diff --git a/docs/clusters/observability/logging.md b/docs/clusters/observability/logging.md index fee687c174..1e484649c4 100644 --- a/docs/clusters/observability/logging.md +++ b/docs/clusters/observability/logging.md @@ -64,15 +64,3 @@ fields @timestamp, message | sort @timestamp asc | limit 1000 ``` - -## Structured logging - -You can use Cortex's logger in your Python code to log in JSON, which will enrich your logs with Cortex's metadata, and -enable you to add custom metadata to the logs.
- -See the structured logging docs for each API kind: - -- [RealtimeAPI](../../workloads/realtime/handler.md#structured-logging) -- [AsyncAPI](../../workloads/async/handler.md#structured-logging) -- [BatchAPI](../../workloads/batch/handler.md#structured-logging) -- [TaskAPI](../../workloads/task/definitions.md#structured-logging) diff --git a/docs/clusters/observability/metrics.md b/docs/clusters/observability/metrics.md index 72ab98ec27..55de3e85de 100644 --- a/docs/clusters/observability/metrics.md +++ b/docs/clusters/observability/metrics.md @@ -96,23 +96,23 @@ Currently, we only support 3 different metric types that will be converted to it ### Pushing metrics - - Counter +- Counter - ```python - metrics.increment('my_counter', value=1, tags={"tag": "tag_name"}) - ``` + ```python + metrics.increment('my_counter', value=1, tags={"tag": "tag_name"}) + ``` - - Gauge +- Gauge - ```python - metrics.gauge('active_connections', value=1001, tags={"tag": "tag_name"}) - ``` + ```python + metrics.gauge('active_connections', value=1001, tags={"tag": "tag_name"}) + ``` - - Histogram +- Histogram - ```python - metrics.histogram('inference_time_milliseconds', 120, tags={"tag": "tag_name"}) - ``` + ```python + metrics.histogram('inference_time_milliseconds', 120, tags={"tag": "tag_name"}) + ``` ### Metrics client class reference diff --git a/docs/start.md b/docs/start.md index e0f8e7f5a5..2fa0d570bd 100644 --- a/docs/start.md +++ b/docs/start.md @@ -21,7 +21,7 @@ cortex cluster up cluster.yaml cortex deploy apis.yaml ``` -* [RealtimeAPI](workloads/realtime/example.md) - create HTTP/gRPC APIs that respond to requests in real-time. +* [RealtimeAPI](workloads/realtime/example.md) - create APIs that respond to requests in real-time. * [AsyncAPI](workloads/async/example.md) - create APIs that respond to requests asynchronously. * [BatchAPI](workloads/batch/example.md) - create APIs that run distributed batch jobs. * [TaskAPI](workloads/task/example.md) - create APIs that run jobs on-demand. 
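Tying together the custom-metrics calls shown in the observability section above, here is a minimal sketch of how the three metric types might be combined in one request path. It is illustrative only: it assumes a `metrics` client object obtained as described in the metrics client class reference, and `process_request` is a hypothetical application function.

```python
import time

def handle_request(metrics, process_request, payload):
    # illustrative only: `metrics` is assumed to be a Cortex metrics client,
    # and `process_request` is a hypothetical application function
    metrics.increment("requests_received", value=1, tags={"api": "my-api"})

    start = time.time()
    result = process_request(payload)
    elapsed_ms = (time.time() - start) * 1000

    # record how long the work took, plus an example gauge value
    metrics.histogram("inference_time_milliseconds", elapsed_ms, tags={"api": "my-api"})
    metrics.gauge("active_connections", value=1, tags={"api": "my-api"})

    return result
```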
diff --git a/docs/summary.md b/docs/summary.md index f6f5d1db0c..cd40165657 100644 --- a/docs/summary.md +++ b/docs/summary.md @@ -29,52 +29,32 @@ ## Workloads -* Realtime APIs +* [Realtime APIs](workloads/realtime/realtime-apis.md) * [Example](workloads/realtime/example.md) - * [Handler](workloads/realtime/handler.md) * [Configuration](workloads/realtime/configuration.md) - * [Parallelism](workloads/realtime/parallelism.md) * [Autoscaling](workloads/realtime/autoscaling.md) - * [Models](workloads/realtime/models.md) - * Multi-model - * [Example](workloads/realtime/multi-model/example.md) - * [Configuration](workloads/realtime/multi-model/configuration.md) - * [Caching](workloads/realtime/multi-model/caching.md) - * [Server-side batching](workloads/realtime/server-side-batching.md) + * [Traffic Splitter](workloads/realtime/traffic-splitter.md) * [Metrics](workloads/realtime/metrics.md) * [Statuses](workloads/realtime/statuses.md) - * Traffic Splitter - * [Example](workloads/realtime/traffic-splitter/example.md) - * [Configuration](workloads/realtime/traffic-splitter/configuration.md) * [Troubleshooting](workloads/realtime/troubleshooting.md) * [Async APIs](workloads/async/async-apis.md) * [Example](workloads/async/example.md) - * [Handler](workloads/async/handler.md) * [Configuration](workloads/async/configuration.md) - * [TensorFlow Models](workloads/async/models.md) * [Metrics](workloads/async/metrics.md) * [Statuses](workloads/async/statuses.md) * [Webhooks](workloads/async/webhooks.md) -* Batch APIs +* [Batch APIs](workloads/batch/batch-apis.md) * [Example](workloads/batch/example.md) - * [Handler](workloads/batch/handler.md) * [Configuration](workloads/batch/configuration.md) * [Jobs](workloads/batch/jobs.md) - * [TensorFlow Models](workloads/batch/models.md) * [Metrics](workloads/batch/metrics.md) * [Statuses](workloads/batch/statuses.md) -* Task APIs +* [Task APIs](workloads/task/task-apis.md) * [Example](workloads/task/example.md) - * [Definition](workloads/task/definitions.md) * [Configuration](workloads/task/configuration.md) * [Jobs](workloads/task/jobs.md) * [Metrics](workloads/task/metrics.md) * [Statuses](workloads/task/statuses.md) -* Dependencies - * [Example](workloads/dependencies/example.md) - * [Python packages](workloads/dependencies/python-packages.md) - * [System packages](workloads/dependencies/system-packages.md) - * [Custom images](workloads/dependencies/images.md) ## Clients diff --git a/docs/workloads/async/async-apis.md b/docs/workloads/async/async-apis.md index f96e642a8d..724c76fc94 100644 --- a/docs/workloads/async/async-apis.md +++ b/docs/workloads/async/async-apis.md @@ -1,4 +1,4 @@ -# AsyncAPI +# Async APIs The AsyncAPI kind is designed for asynchronous workloads, in which the user submits a request to start the processing and retrieves the result later, either by polling or through a webhook. @@ -14,7 +14,3 @@ workload status and results. Cortex fully manages the Async Gateway and the queu AsyncAPI is a good fit for users who want to submit longer workloads (such as video, audio or document processing), and do not need the result immediately or synchronously. - -{% hint style="info" %} -AsyncAPI is still in a beta state. -{% endhint %} diff --git a/docs/workloads/async/autoscaling.md b/docs/workloads/async/autoscaling.md index 395d5d6e2e..358b27753c 100644 --- a/docs/workloads/async/autoscaling.md +++ b/docs/workloads/async/autoscaling.md @@ -4,6 +4,16 @@ Cortex auto-scales AsyncAPIs on a per-API basis based on your configuration. 
## Autoscaling replicas +### Relevant pod configuration + +In addition to the autoscaling configuration options (described below), there are two fields in the pod configuration which are relevant to replica autoscaling: + +**`max_concurrency`** (default: 1): The maximum number of requests that will be concurrently sent into the container by Cortex. If your web server is designed to handle multiple concurrent requests, increasing `max_concurrency` will increase the throughput of a replica (and result in fewer total replicas for a given load). + +
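For context, here is a minimal sketch (not from this patch) of a web server that can process several requests concurrently, which is the situation where raising `max_concurrency` increases a replica's throughput; the port matches the default pod port described in the configuration docs, and everything else is an assumption.

```python
# minimal sketch: an async server that can handle multiple in-flight requests at once
import asyncio
from fastapi import FastAPI

app = FastAPI()

@app.post("/")
async def handle():
    await asyncio.sleep(1)  # stand-in for non-blocking (I/O-bound) work
    return {"status": "ok"}

# run with, e.g.: uvicorn main:app --host 0.0.0.0 --port 8080
```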
+ +### Autoscaling configuration + **`min_replicas`**: The lower bound on how many replicas can be running for an API.
@@ -12,97 +22,56 @@ Cortex auto-scales AsyncAPIs on a per-API basis based on your configuration.
-**`target_replica_concurrency`** (default: 1): This is the desired number of in-flight requests per replica, and is the -metric which the autoscaler uses to make scaling decisions. It is recommended to leave this parameter at its default -value. - -Replica concurrency is simply how many requests have been sent to the queue and have not yet finished being processed (also -referred to as in-flight requests). Therefore, it includes requests which are currently being processed and requests -which are waiting in the queue. +**`target_in_flight`** (default: `max_concurrency` in the pod configuration): This is the desired number of in-flight requests per replica, and is the metric which the autoscaler uses to make scaling decisions. The number of in-flight requests is simply how many requests have been submitted and are not yet finished being processed. Therefore, this number includes requests which are actively being processed as well as requests which are waiting in the queue. The autoscaler uses this formula to determine the number of desired replicas: -`desired replicas = sum(in-flight requests accross all replicas) / target_replica_concurrency` +`desired replicas = sum(in-flight requests across all replicas) / target_in_flight`
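As a hedged, illustrative reading of this formula (the numbers are made up, and the smoothing parameters described below are ignored):

```python
import math

in_flight_per_replica = [3, 5, 4]  # in-flight requests currently observed on each replica
target_in_flight = 4               # from the API's autoscaling configuration

desired_replicas = math.ceil(sum(in_flight_per_replica) / target_in_flight)
print(desired_replicas)  # 12 / 4 -> 3
```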
- -**`max_replica_concurrency`** (default: 1024): This is the maximum number of in-queue messages before requests are -rejected with HTTP error code 503. `max_replica_concurrency` includes requests that are currently being processed as -well as requests that are waiting in the queue (a replica can actively process one request concurrently, and will hold -any additional requests in a local queue). Decreasing `max_replica_concurrency` and configuring the client to retry when -it receives 503 responses will improve queue fairness accross replicas by preventing requests from sitting in long -queues. +For example, setting `target_in_flight` to `max_concurrency` (the default) causes the cluster to adjust the number of replicas so that on average, there are no requests waiting in the queue.
-**`window`** (default: 60s): The time over which to average the API in-flight requests (which is the sum of in-flight -requests in each replica). The longer the window, the slower the autoscaler will react to changes in API wide in-flight -requests, since it is averaged over the `window`. API wide in-flight requests is calculated every 10 seconds, -so `window` must be a multiple of 10 seconds. +**`window`** (default: 60s): The time over which to average the API's in-flight requests. The longer the window, the slower the autoscaler will react to changes in in-flight requests, since it is averaged over the `window`. An API's in-flight requests is calculated every 10 seconds, so `window` must be a multiple of 10 seconds.
-**`downscale_stabilization_period`** (default: 5m): The API will not scale below the highest recommendation made during -this period. Every 10 seconds, the autoscaler makes a recommendation based on all of the other configuration parameters -described here. It will then take the max of the current recommendation and all recommendations made during -the `downscale_stabilization_period`, and use that to determine the final number of replicas to scale to. Increasing -this value will cause the cluster to react more slowly to decreased traffic, and will reduce thrashing. +**`downscale_stabilization_period`** (default: 5m): The API will not scale below the highest recommendation made during this period. Every 10 seconds, the autoscaler makes a recommendation based on all of the other configuration parameters described here. It will then take the max of the current recommendation and all recommendations made during the `downscale_stabilization_period`, and use that to determine the final number of replicas to scale to. Increasing this value will cause the cluster to react more slowly to decreased traffic, and will reduce thrashing.
-**`upscale_stabilization_period`** (default: 1m): The API will not scale above the lowest recommendation made during -this period. Every 10 seconds, the autoscaler makes a recommendation based on all of the other configuration parameters -described here. It will then take the min of the current recommendation and all recommendations made during -the `upscale_stabilization_period`, and use that to determine the final number of replicas to scale to. Increasing this -value will cause the cluster to react more slowly to increased traffic, and will reduce thrashing. +**`upscale_stabilization_period`** (default: 1m): The API will not scale above the lowest recommendation made during this period. Every 10 seconds, the autoscaler makes a recommendation based on all of the other configuration parameters described here. It will then take the min of the current recommendation and all recommendations made during the `upscale_stabilization_period`, and use that to determine the final number of replicas to scale to. Increasing this value will cause the cluster to react more slowly to increased traffic, and will reduce thrashing.
-**`max_downscale_factor`** (default: 0.75): The maximum factor by which to scale down the API on a single scaling event. -For example, if `max_downscale_factor` is 0.5 and there are 10 running replicas, the autoscaler will not recommend fewer -than 5 replicas. Increasing this number will allow the cluster to shrink more quickly in response to dramatic dips in -traffic. +**`max_downscale_factor`** (default: 0.75): The maximum factor by which to scale down the API on a single scaling event. For example, if `max_downscale_factor` is 0.5 and there are 10 running replicas, the autoscaler will not recommend fewer than 5 replicas. Increasing this number will allow the cluster to shrink more quickly in response to dramatic dips in traffic.
-**`max_upscale_factor`** (default: 1.5): The maximum factor by which to scale up the API on a single scaling event. For -example, if `max_upscale_factor` is 10 and there are 5 running replicas, the autoscaler will not recommend more than 50 -replicas. Increasing this number will allow the cluster to grow more quickly in response to dramatic spikes in traffic. +**`max_upscale_factor`** (default: 1.5): The maximum factor by which to scale up the API on a single scaling event. For example, if `max_upscale_factor` is 10 and there are 5 running replicas, the autoscaler will not recommend more than 50 replicas. Increasing this number will allow the cluster to grow more quickly in response to dramatic spikes in traffic.
-**`downscale_tolerance`** (default: 0.05): Any recommendation falling within this factor below the current number of -replicas will not trigger a scale down event. For example, if `downscale_tolerance` is 0.1 and there are 20 running -replicas, a recommendation of 18 or 19 replicas will not be acted on, and the API will remain at 20 replicas. Increasing -this value will prevent thrashing, but setting it too high will prevent the cluster from maintaining it's optimal size. +**`downscale_tolerance`** (default: 0.05): Any recommendation falling within this factor below the current number of replicas will not trigger a scale down event. For example, if `downscale_tolerance` is 0.1 and there are 20 running replicas, a recommendation of 18 or 19 replicas will not be acted on, and the API will remain at 20 replicas. Increasing this value will prevent thrashing, but setting it too high will prevent the cluster from maintaining its optimal size.
-**`upscale_tolerance`** (default: 0.05): Any recommendation falling within this factor above the current number of -replicas will not trigger a scale up event. For example, if `upscale_tolerance` is 0.1 and there are 20 running -replicas, a recommendation of 21 or 22 replicas will not be acted on, and the API will remain at 20 replicas. Increasing -this value will prevent thrashing, but setting it too high will prevent the cluster from maintaining it's optimal size. +**`upscale_tolerance`** (default: 0.05): Any recommendation falling within this factor above the current number of replicas will not trigger a scale up event. For example, if `upscale_tolerance` is 0.1 and there are 20 running replicas, a recommendation of 21 or 22 replicas will not be acted on, and the API will remain at 20 replicas. Increasing this value will prevent thrashing, but setting it too high will prevent the cluster from maintaining its optimal size.
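To make the interaction of these parameters concrete, here is a rough sketch of a single scaling decision. It is illustrative only (not Cortex's actual implementation), it ignores the stabilization periods and window averaging, and all names are made up:

```python
import math

def recommend_replicas(current, avg_in_flight, target_in_flight,
                       min_replicas, max_replicas,
                       max_downscale_factor=0.75, max_upscale_factor=1.5,
                       downscale_tolerance=0.05, upscale_tolerance=0.05):
    raw = avg_in_flight / target_in_flight

    # recommendations within the tolerance band leave the replica count unchanged
    if current * (1 - downscale_tolerance) <= raw <= current * (1 + upscale_tolerance):
        return current

    # limit how far a single scaling event can move the replica count
    lower = math.ceil(current * max_downscale_factor)
    upper = math.floor(current * max_upscale_factor)
    bounded = min(max(math.ceil(raw), lower), upper)

    # always stay within the configured replica bounds
    return min(max(bounded, min_replicas), max_replicas)

# example: 20 replicas, 64 in-flight requests on average, target of 2 per replica
print(recommend_replicas(current=20, avg_in_flight=64, target_in_flight=2,
                         min_replicas=1, max_replicas=100))  # 30 (capped by max_upscale_factor)
```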
## Autoscaling instances -Cortex spins up and down instances based on the aggregate resource requests of all APIs. The number of instances will be -at least `min_instances` and no more than `max_instances` for each node group (configured during installation and modifiable -via `cortex cluster scale`). +Cortex spins up and down instances based on the aggregate resource requests of all APIs. The number of instances will be at least `min_instances` and no more than `max_instances` for each node group (configured during installation and modifiable via `cortex cluster scale`). -## Autoscaling responsiveness +## Overprovisioning -Assuming that `window` and `upscale_stabilization_period` are set to their default values (1 minute), it could take up -to 2 minutes of increased traffic before an extra replica is requested. As soon as the additional replica is requested, -the replica request will be visible in the output of `cortex get`, but the replica won't yet be running. If an extra -instance is required to schedule the newly requested replica, it could take a few minutes for AWS to provision the -instance (depending on the instance type), plus a few minutes for the newly provisioned instance to download your api -image and for the api to initialize (via its `__init__()` method). +The default value for `target_in_flight` is `max_concurrency`, which behaves well in many situations (see above for an explanation of how `target_in_flight` affects autoscaling). However, if your application is sensitive to spikes in traffic or if creating new replicas takes too long (see below), you may find it helpful to maintain extra capacity to handle the increased traffic while new replicas are being created. This can be accomplished by setting `target_in_flight` to a lower value relative to the expected replica's concurrency. The smaller `target_in_flight` is, the more unused capacity your API will have, and the more room it will have to handle sudden increased load. The increased request rate will still trigger the autoscaler, and your API will stabilize again (maintaining the overprovisioned capacity). + +For example, if you've determined that each replica in your API can handle 2 concurrent requests, you would typically set `target_in_flight` to 2. In a scenario where your API is receiving 8 concurrent requests on average, the autoscaler would maintain 4 live replicas (8/2 = 4). If you wanted to overprovision by 25%, you could set `target_in_flight` to 1.6, causing the autoscaler maintain 5 live replicas (8/1.6 = 5). + +## Autoscaling responsiveness -If you want the autoscaler to react as quickly as possible, set `upscale_stabilization_period` and `window` to their -minimum values (0s and 10s respectively). +Assuming that `window` and `upscale_stabilization_period` are set to their default values (1 minute), it could take up to 2 minutes of increased traffic before an extra replica is requested. As soon as the additional replica is requested, the replica request will be visible in the output of `cortex get`, but the replica won't yet be running. If an extra instance is required to schedule the newly requested replica, it could take a few minutes for AWS to provision the instance (depending on the instance type), plus a few minutes for the newly provisioned instance to download your api image and for the api to initialize. -If it takes a long time to initialize your API replica (i.e. 
install dependencies and run your handler's `__init__()` -function), consider building your own API image to use instead of the default image. With this approach, you can -pre-download/build/install any custom dependencies and bake them into the image. +Keep these delays in mind when considering overprovisioning (see above) and when determining appropriate values for `window` and `upscale_stabilization_period`. If you want the autoscaler to react as quickly as possible, set `upscale_stabilization_period` and `window` to their minimum values (0s and 10s respectively). diff --git a/docs/workloads/async/configuration.md b/docs/workloads/async/configuration.md index 595f698a19..01641bc7c1 100644 --- a/docs/workloads/async/configuration.md +++ b/docs/workloads/async/configuration.md @@ -1,107 +1,63 @@ # Configuration ```yaml -- name: - kind: AsyncAPI - handler: # detailed configuration below - compute: # detailed configuration below - autoscaling: # detailed configuration below - update_strategy: # detailed configuration below - networking: # detailed configuration below -``` - -## Handler - -### Python Handler - - - -```yaml -handler: - type: python - path: # path to a python file with a Handler class definition, relative to the Cortex root (required) - dependencies: # (optional) - pip: # relative path to requirements.txt (default: requirements.txt) - conda: # relative path to conda-packages.txt (default: conda-packages.txt) - shell: # relative path to a shell script for system package installation (default: dependencies.sh) - config: # arbitrary dictionary passed to the constructor of the Handler class (optional) - python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) - image: # docker image to use for the handler (default: quay.io/cortexlabs/python-handler-cpu:master, quay.io/cortexlabs/python-handler-gpu:master-cuda10.2-cudnn8, or quay.io/cortexlabs/python-handler-inf:master based on compute) - env: # dictionary of environment variables - log_level: # log level that can be "debug", "info", "warning" or "error" (default: "info") - shm_size: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 64Mi or 1Gi (default: Null) -``` - -### Tensorflow Handler - - - -```yaml -handler: - type: tensorflow - path: # path to a python file with a Handler class definition, relative to the Cortex root (required) - dependencies: # (optional) - pip: # relative path to requirements.txt (default: requirements.txt) - conda: # relative path to conda-packages.txt (default: conda-packages.txt) - shell: # relative path to a shell script for system package installation (default: dependencies.sh) - models: # (required) - path: # S3 path to an exported SavedModel directory (e.g. s3://my-bucket/exported_model/) (either this, 'dir', or 'paths' must be provided) - paths: # list of S3 paths to exported SavedModel directories (either this, 'dir', or 'path' must be provided) - - name: # unique name for the model (e.g. text-generator) (required) - path: # S3 path to an exported SavedModel directory (e.g. s3://my-bucket/exported_model/) (required) - signature_key: # name of the signature def to use for prediction (required if your model has more than one signature def) - ... - dir: # S3 path to a directory containing multiple SavedModel directories (e.g. 
s3://my-bucket/models/) (either this, 'path', or 'paths' must be provided) - signature_key: # name of the signature def to use for prediction (required if your model has more than one signature def) - config: # arbitrary dictionary passed to the constructor of the Handler class (optional) - python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) - image: # docker image to use for the handler (default: quay.io/cortexlabs/tensorflow-handler:master) - tensorflow_serving_image: # docker image to use for the TensorFlow Serving container (default: quay.io/cortexlabs/tensorflow-serving-cpu:master, quay.io/cortexlabs/tensorflow-serving-gpu:master, or quay.io/cortexlabs/tensorflow-serving-inf:master based on compute) - env: # dictionary of environment variables - log_level: # log level that can be "debug", "info", "warning" or "error" (default: "info") - shm_size: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 64Mi or 1Gi (default: Null) -``` - -## Compute - -```yaml -compute: - cpu: # CPU request per replica. One unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m) - gpu: # GPU request per replica. One unit of GPU corresponds to one virtual GPU (default: 0) - inf: # Inferentia request per replica. One unit of Inf corresponds to one virtual Inferentia chip (default: 0) - mem: # memory request per replica. One unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of two counterparts: Ki, Mi, Gi, Ti) (default: Null) - node_groups: # to select specific node groups (optional) -``` - -## Autoscaling - -```yaml -autoscaling: - min_replicas: # minimum number of replicas (default: 1) - max_replicas: # maximum number of replicas (default: 100) - init_replicas: # initial number of replicas (default: ) - max_replica_concurrency: # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) - target_replica_concurrency: # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) - window: # the time over which to average the API's concurrency (default: 60s) - downscale_stabilization_period: # the API will not scale below the highest recommendation made during this period (default: 5m) - upscale_stabilization_period: # the API will not scale above the lowest recommendation made during this period (default: 1m) - max_downscale_factor: # the maximum factor by which to scale down the API on a single scaling event (default: 0.75) - max_upscale_factor: # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) - downscale_tolerance: # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) - upscale_tolerance: # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) -``` - -## Update strategy - -```yaml -update_strategy: - max_surge: # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 
10% (default: 25%) (set to 0 to disable rolling updates) - max_unavailable: # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) -``` - -## Networking - -```yaml - networking: - endpoint: # the endpoint for the API (default: ) +- name: # name of the API (required) + kind: AsyncAPI # must be "AsyncAPI" for async APIs (required) + pod: # pod configuration (required) + port: # port to which requests will be sent (default: 8080; exported as $CORTEX_PORT) + max_concurrency: # maximum number of requests that will be concurrently sent into the container (default: 1) + containers: # configurations for the containers to run (at least one container must be provided) + - name: # name of the container (required) + image: # docker image to use for the container (required) + command: # entrypoint (default: the docker image's ENTRYPOINT) + args: # arguments to the entrypoint (default: the docker image's CMD) + env: # dictionary of environment variables to set in the container (optional) + compute: # compute resource requests (default: see below) + cpu: # CPU request for the container; one unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m) + gpu: # GPU request for the container; one unit of GPU corresponds to one virtual GPU (default: 0) + inf: # Inferentia request for the container; one unit of inf corresponds to one virtual Inferentia chip (default: 0) + mem: # memory request for the container; one unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of two counterparts: Ki, Mi, Gi, Ti) (default: Null) + shm: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g.
64Mi or 1Gi (default: Null) + readiness_probe: # periodic probe of container readiness; traffic will not be sent into the pod unless all containers' readiness probes are succeeding (optional) + http_get: # specifies an http endpoint which must respond with status code 200 (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + path: # the path to access on the HTTP server (default: /) + tcp_socket: # specifies a port which must be ready to receive traffic (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + initial_delay_seconds: # number of seconds after the container has started before the probe is initiated (default: 0) + timeout_seconds: # number of seconds until the probe times out (default: 1) + period_seconds: # how often (in seconds) to perform the probe (default: 10) + success_threshold: # minimum consecutive successes for the probe to be considered successful after having failed (default: 1) + failure_threshold: # minimum consecutive failures for the probe to be considered failed after having succeeded (default: 3) + liveness_probe: # periodic probe of container liveness; container will be restarted if the probe fails (optional) + http_get: # specifies an http endpoint which must respond with status code 200 (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + path: # the path to access on the HTTP server (default: /) + tcp_socket: # specifies a port which must be ready to receive traffic (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + exec: # specifies a command to run which must exit with code 0 (only one of http_get, tcp_socket, and exec may be specified) + command: [] # the command to execute inside the container, which is exec'd (not run inside a shell); the working directory is root ('/') in the container's filesystem (required) + initial_delay_seconds: # number of seconds after the container has started before the probe is initiated (default: 0) + timeout_seconds: # number of seconds until the probe times out (default: 1) + period_seconds: # how often (in seconds) to perform the probe (default: 10) + success_threshold: # minimum consecutive successes for the probe to be considered successful after having failed (default: 1) + failure_threshold: # minimum consecutive failures for the probe to be considered failed after having succeeded (default: 3) + autoscaling: # autoscaling configuration (default: see below) + min_replicas: # minimum number of replicas (default: 1) + max_replicas: # maximum number of replicas (default: 100) + init_replicas: # initial number of replicas (default: ) + target_in_flight: # desired number of in-flight requests per replica (including requests actively being processed as well as queued), which the autoscaler tries to maintain (default: ) + window: # duration over which to average the API's in-flight requests per replica (default: 60s) + downscale_stabilization_period: # the API will not scale below the highest recommendation made during this period (default: 5m) + upscale_stabilization_period: # the API will not scale above the lowest recommendation made during this period (default: 1m) + max_downscale_factor: # maximum factor by which to scale down the API on a single scaling event (default: 0.75) + max_upscale_factor: # maximum factor by which to scale up the API on a 
single scaling event (default: 1.5) + downscale_tolerance: # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) + upscale_tolerance: # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) + node_groups: # a list of node groups on which this API can run (default: all node groups are eligible) + update_strategy: # deployment strategy to use when replacing existing replicas with new ones (default: see below) + max_surge: # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates) + max_unavailable: # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) + networking: # networking configuration (default: see below) + endpoint: # endpoint for the API (default: ) ``` diff --git a/docs/workloads/async/handler.md b/docs/workloads/async/handler.md deleted file mode 100644 index fe3f15d780..0000000000 --- a/docs/workloads/async/handler.md +++ /dev/null @@ -1,240 +0,0 @@ -# Handler implementation - -Your handler can be used to process any asynchronous workloads. It can also be used for running ML models using a variety of frameworks such as: PyTorch, ONNX, scikit-learn, XGBoost, TensorFlow (if not using `SavedModel`s), etc. - -If you plan on deploying models with TensorFlow in `SavedModel` format, you can also use the [TensorFlow Handler](models.md) that was specifically built for this purpose. - -## Project files - -Cortex makes all files in the project directory (i.e. the directory which contains `cortex.yaml`) available for use in -your handler implementation. Python bytecode files (`*.pyc`, `*.pyo`, `*.pyd`), files or folders that start with `.`, -and the api configuration file (e.g. `cortex.yaml`) are excluded. - -The following files can also be added at the root of the project's directory: - -* `.cortexignore` file, which follows the same syntax and behavior as a [.gitignore file](https://git-scm.com/docs/gitignore). This may be necessary if you are reaching the size limit for your project directory (32mb). -* `.env` file, which exports environment variables that can be used in the handler. Each line of this file must follow - the `VARIABLE=value` format. - -For example, if your directory looks like this: - -```text -./my-classifier/ -├── cortex.yaml -├── values.json -├── handler.py -├── ... -└── requirements.txt -``` - -You can access `values.json` in your Handler class like this: - -```python -# handler.py - -import json - -class Handler: - def __init__(self, config): - with open('values.json', 'r') as values_file: - values = json.load(values_file) - self.values = values -``` - -## Interface - -```python -# initialization code and variables can be declared here in global scope - -class Handler: - def __init__(self, config, metrics_client): - """(Required) Called once before the API becomes available. Performs - setup such as downloading/initializing the model or downloading a - vocabulary. - - Args: - config (required): Dictionary passed from API configuration (if - specified). This may contain information on where to download - the model and/or metadata. 
- metrics_client (optional): The cortex metrics client, which allows - you to push custom metrics in order to build custom dashboards - in grafana. - """ - pass - - def handle_async(self, payload, request_id): - """(Required) Called once per request. Preprocesses the request payload - (if necessary), runs the workload, and postprocesses the resulting output - (if necessary). - - Args: - payload (optional): The request payload (see below for the possible - payload types). - request_id (optional): The request id string that identifies a workload - - Returns: - JSON-serializeable result. - """ - pass -``` - -For proper separation of concerns, it is recommended to use the constructor's `config` parameter for information such as -from where to download the model and initialization files, or any configurable model parameters. You define `config` in -your API configuration, and it is passed through to your handler's constructor. - -Your API can accept requests with different types of payloads. Navigate to the [API requests](#api-requests) section to -learn about how headers can be used to change the type of `payload` that is passed into your `handle_async` method. - -At this moment, the AsyncAPI `handle_async` method can only return `JSON`-parseable objects. Navigate to -the [API responses](#api-responses) section to learn about how to configure it. - -## API requests - -The type of the `payload` parameter in `handle_async(self, payload)` can vary based on the content type of the request. -The `payload` parameter is parsed according to the `Content-Type` header in the request. Here are the parsing rules (see -below for examples): - -1. For `Content-Type: application/json`, `payload` will be the parsed JSON body. -1. For `Content-Type: text/plain`, `payload` will be a string. `utf-8` encoding is assumed, unless specified otherwise ( - e.g. via `Content-Type: text/plain; charset=us-ascii`) -1. For all other `Content-Type` values, `payload` will be the raw `bytes` of the request body. 
- -Here are some examples: - -### JSON data - -#### Making the request - -```bash -curl http://***.amazonaws.com/my-api \ - -X POST -H "Content-Type: application/json" \ - -d '{"key": "value"}' -``` - -#### Reading the payload - -When sending a JSON payload, the `payload` parameter will be a Python object: - -```python -class Handler: - def __init__(self, config): - pass - - def handle_async(self, payload): - print(payload["key"]) # prints "value" -``` - -### Binary data - -#### Making the request - -```bash -curl http://***.amazonaws.com/my-api \ - -X POST -H "Content-Type: application/octet-stream" \ - --data-binary @object.pkl -``` - -#### Reading the payload - -Since the `Content-Type: application/octet-stream` header is used, the `payload` parameter will be a `bytes` object: - -```python -import pickle - - -class Handler: - def __init__(self, config): - pass - - def handle_async(self, payload): - obj = pickle.loads(payload) - print(obj["key"]) # prints "value" -``` - -Here's an example if the binary data is an image: - -```python -from PIL import Image -import io - - -class Handler: - def __init__(self, config): - pass - - def handle_async(self, payload): - img = Image.open(io.BytesIO(payload)) # read the payload bytes as an image - print(img.size) -``` - -### Text data - -#### Making the request - -```bash -curl http://***.amazonaws.com/my-api \ - -X POST -H "Content-Type: text/plain" \ - -d "hello world" -``` - -#### Reading the payload - -Since the `Content-Type: text/plain` header is used, the `payload` parameter will be a `string` object: - -```python -class Handle: - def __init__(self, config): - pass - - def handle_async(self, payload): - print(payload) # prints "hello world" -``` - -## API responses - -The return value of your `handle_async()` method must be a JSON-serializable dictionary. The result for -each request will remain queryable for 7 days after the request was completed. - -## Chaining APIs - -It is possible to make requests from one API to another within a Cortex cluster. All running APIs are accessible from -within the handler at `http://api-:8888/`, where `` is the name of the API you are making a -request to. - -For example, if there is an api named `text-generator` running in the cluster, you could make a request to it from a -different API by using: - -```python -import requests - - -class Handler: - def handle_async(self, payload): - response = requests.post("http://api-text-generator:8888/", json={"text": "machine learning is"}) - # ... -``` - -## Structured logging - -You can use Cortex's logger in your handler implemention to log in JSON. This will enrich your logs with Cortex's -metadata, and you can add custom metadata to the logs by adding key value pairs to the `extra` key when using the -logger. For example: - -```python -... -from cortex_internal.lib.log import logger as log - - -class Handler: - def handle_async(self, payload): - log.info("received payload", extra={"payload": payload}) -``` - -The dictionary passed in via the `extra` will be flattened by one level. e.g. - -```text -{"asctime": "2021-01-19 15:14:05,291", "levelname": "INFO", "message": "received payload", "process": 235, "payload": "this movie is awesome"} -``` - -To avoid overriding essential Cortex metadata, please refrain from specifying the following extra keys: `asctime` -, `levelname`, `message`, `labels`, and `process`. Log lines greater than 5 MB in size will be ignored. 
diff --git a/docs/workloads/async/models.md b/docs/workloads/async/models.md deleted file mode 100644 index 6f1c4efff5..0000000000 --- a/docs/workloads/async/models.md +++ /dev/null @@ -1,203 +0,0 @@ -# TensorFlow Models - -In addition to the [standard Python Handler](handler.md), Cortex also supports another handler called the TensorFlow handler, which can be used to deploy TensorFlow models exported as `SavedModel` models. - -## Interface - -**Uses TensorFlow version 2.3.0 by default** - -```python -class Handler: - def __init__(self, config, tensorflow_client, metrics_client): - """(Required) Called once before the API becomes available. Performs - setup such as downloading/initializing a vocabulary. - - Args: - config (required): Dictionary passed from API configuration (if - specified). - tensorflow_client (required): TensorFlow client which is used to - make predictions. This should be saved for use in handle_async(). - metrics_client (optional): The cortex metrics client, which allows - you to push custom metrics in order to build custom dashboards - in grafana. - """ - self.client = tensorflow_client - # Additional initialization may be done here - - def handle_async(self, payload, request_id): - """(Required) Called once per request. Preprocesses the request payload - (if necessary), runs inference (e.g. by calling - self.client.predict(model_input)), and postprocesses the inference - output (if necessary). - - Args: - payload (optional): The request payload (see below for the possible - payload types). - request_id (optional): The request id string that identifies a workload - - Returns: - Prediction or a batch of predictions. - """ - pass -``` - - - -Cortex provides a `tensorflow_client` to your handler's constructor. `tensorflow_client` is an instance -of [TensorFlowClient](https://github.com/cortexlabs/cortex/tree/master/python/serve/cortex_internal/lib/client/tensorflow.py) -that manages a connection to a TensorFlow Serving container to make predictions using your model. It should be saved as -an instance variable in your handler class, and your `handle_async()` function should call `tensorflow_client.predict()` to make -an inference with your exported TensorFlow model. Preprocessing of the JSON payload and postprocessing of predictions -can be implemented in your `handle_async()` function as well. - -When multiple models are defined using the Handler's `models` field, the `tensorflow_client.predict()` method expects a second argument `model_name` which must hold the name of the model that you want to use for inference (for example: `self.client.predict(payload, "text-generator")`). There is also an optional third argument to specify the model version. - -If you need to share files between your handler implementation and the TensorFlow Serving container, you can create a new directory within `/mnt` (e.g. `/mnt/user`) and write files to it. The entire `/mnt` directory is shared between containers, but do not write to any of the directories in `/mnt` that already exist (they are used internally by Cortex). - -## `predict` method - -Inference is performed by using the `predict` method of the `tensorflow_client` that's passed to the handler's constructor: - -```python -def predict(model_input, model_name, model_version) -> dict: - """ - Run prediction. - - Args: - model_input: Input to the model. - model_name (optional): Name of the model to retrieve (when multiple models are deployed in an API). 
- When handler.models.paths is specified, model_name should be the name of one of the models listed in the API config. - When handler.models.dir is specified, model_name should be the name of a top-level directory in the models dir. - model_version (string, optional): Version of the model to retrieve. Can be omitted or set to "latest" to select the highest version. - - Returns: - dict: TensorFlow Serving response converted to a dictionary. - """ -``` - -## Specifying models - -Whenever a model path is specified in an API configuration file, it should be a path to an S3 prefix which contains your exported model. Directories may include a single model, or multiple folders each with a single model (note that a "single model" need not be a single file; there can be multiple files for a single model). When multiple folders are used, the folder names must be integer values, and will be interpreted as the model version. Model versions can be any integer, but are typically integer timestamps. It is always assumed that the highest version number is the latest version of your model. - -### API spec - -#### Single model - -The most common pattern is to serve a single model per API. The path to the model is specified in the `path` field in the `handler.models` configuration. For example: - -```yaml -# cortex.yaml - -- name: iris-classifier - kind: AsyncAPI - handler: - # ... - type: tensorflow - models: - path: s3://my-bucket/models/text-generator/ -``` - -#### Multiple models - -It is possible to serve multiple models from a single API. The paths to the models are specified in the api configuration, either via the `models.paths` or `models.dir` field in the `handler` configuration. For example: - -```yaml -# cortex.yaml - -- name: iris-classifier - kind: AsyncAPI - handler: - # ... - type: tensorflow - models: - paths: - - name: iris-classifier - path: s3://my-bucket/models/text-generator/ - # ... -``` - -or: - -```yaml -# cortex.yaml - -- name: iris-classifier - kind: AsyncAPI - handler: - # ... - type: tensorflow - models: - dir: s3://my-bucket/models/ -``` - -When using the `models.paths` field, each path must be a valid model directory (see above for valid model directory structures). - -When using the `models.dir` field, the directory provided may contain multiple subdirectories, each of which is a valid model directory. For example: - -```text - s3://my-bucket/models/ - ├── text-generator - | └── * (model files) - └── sentiment-analyzer - ├── 24753823/ - | └── * (model files) - └── 26234288/ - └── * (model files) -``` - -In this case, there are two models in the directory, one of which is named "text-generator", and the other is named "sentiment-analyzer". - -### Structure - -#### On CPU/GPU - -The model path must be a SavedModel export: - -```text - s3://my-bucket/models/text-generator/ - ├── saved_model.pb - └── variables/ - ├── variables.index - ├── variables.data-00000-of-00003 - ├── variables.data-00001-of-00003 - └── variables.data-00002-of-... -``` - -or for a versioned model: - -```text - s3://my-bucket/models/text-generator/ - ├── 1523423423/ (version number, usually a timestamp) - | ├── saved_model.pb - | └── variables/ - | ├── variables.index - | ├── variables.data-00000-of-00003 - | ├── variables.data-00001-of-00003 - | └── variables.data-00002-of-... - └── 2434389194/ (version number, usually a timestamp) - ├── saved_model.pb - └── variables/ - ├── variables.index - ├── variables.data-00000-of-00003 - ├── variables.data-00001-of-00003 - └── variables.data-00002-of-... 
-``` - -#### On Inferentia - -When Inferentia models are used, the directory structure is slightly different: - -```text - s3://my-bucket/models/text-generator/ - └── saved_model.pb -``` - -or for a versioned model: - -```text - s3://my-bucket/models/text-generator/ - ├── 1523423423/ (version number, usually a timestamp) - | └── saved_model.pb - └── 2434389194/ (version number, usually a timestamp) - └── saved_model.pb -``` diff --git a/docs/workloads/async/webhooks.md b/docs/workloads/async/webhooks.md index a9c085c2ad..f822725a7f 100644 --- a/docs/workloads/async/webhooks.md +++ b/docs/workloads/async/webhooks.md @@ -8,42 +8,44 @@ completion or failure, and the URL known in advance is some other service that w ## Example -Below is a guideline for implementing webhooks for an `AsyncAPI` workload. +Below is an example implementing webhooks for an `AsyncAPI` workload using FastAPI. ```python +import os import time -from datetime import datetime - import requests +from datetime import datetime +from fastapi import FastAPI, Header STATUS_COMPLETED = "completed" STATUS_FAILED = "failed" +webhook_url = os.getenv("WEBHOOK_URL") # the webhook url is set as an environment variable + +app = FastAPI() + -class Handler: - def __init__(self, config): - self.webhook_url = config["webhook_url"] # the webhook url is passed in the config +@app.post("/") +async def handle(x_cortex_request_id=Header(None)): + try: + time.sleep(60) # simulates a long workload + send_report(x_cortex_request_id, STATUS_COMPLETED, result={"data": "hello"}) + except Exception as err: + send_report(x_cortex_request_id, STATUS_FAILED) + raise err # the original exception should still be raised - def handle_async(self, payload, request_id): - try: - time.sleep(60) # simulates a long workload - self.send_report(request_id, STATUS_COMPLETED, result={"data": "hello"}) - except Exception as err: - self.send_report(request_id, STATUS_FAILED) - raise err # the original exception should still be raised! - # this is a utility method - def send_report(self, request_id, status, result=None): - response = {"id": request_id, "status": status} +def send_report(request_id, status, result=None): + response = {"id": request_id, "status": status} - if result is not None and status == STATUS_COMPLETED: - timestamp = datetime.utcnow().isoformat() - response.update({"result": result, "timestamp": timestamp}) + if result is not None and status == STATUS_COMPLETED: + timestamp = datetime.utcnow().isoformat() + response.update({"result": result, "timestamp": timestamp}) - try: - requests.post(url=self.webhook_url, json=response) - except Exception: - pass + try: + requests.post(url=webhook_url, json=response) + except Exception: + pass ``` ## Development diff --git a/docs/workloads/batch/batch-apis.md b/docs/workloads/batch/batch-apis.md new file mode 100644 index 0000000000..ab71f0be03 --- /dev/null +++ b/docs/workloads/batch/batch-apis.md @@ -0,0 +1,3 @@ +# Batch APIs + +Batch APIs run distributed and fault-tolerant batch processing jobs on demand. They can be used for batch inference or data processing workloads. 
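+
+For example, a minimal Batch API configuration might look like the following (a sketch only; the API name, image, command, and resources are placeholders, and the full list of fields is described in the configuration reference):
+
+```yaml
+# batch.yaml
+
+- name: image-processor
+  kind: BatchAPI
+  pod:
+    containers:
+      - name: api
+        image: quay.io/my-org/image-processor:latest  # hypothetical image containing your web server
+        command: ["python", "main.py"]               # placeholder entrypoint
+        compute:
+          cpu: 1
+          mem: 2Gi
+```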
diff --git a/docs/workloads/batch/configuration.md b/docs/workloads/batch/configuration.md index f7b6ecc9c4..4b9337a95a 100644 --- a/docs/workloads/batch/configuration.md +++ b/docs/workloads/batch/configuration.md @@ -1,79 +1,48 @@ # Configuration ```yaml -- name: - kind: BatchAPI - handler: # detailed configuration below - compute: # detailed configuration below - networking: # detailed configuration below -``` - -## Handler - -### Python Handler - - -```yaml -handler: - type: python - path: # path to a python file with a Handler class definition, relative to the Cortex root (required) - config: # arbitrary dictionary passed to the constructor of the Handler class (can be overridden by config passed in job submission) (optional) - python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) - image: # docker image to use for the handler (default: quay.io/cortexlabs/python-handler-cpu:master or quay.io/cortexlabs/python-handler-gpu:master-cuda10.2-cudnn8 based on compute) - env: # dictionary of environment variables - log_level: # log level that can be "debug", "info", "warning" or "error" (default: "info") - shm_size: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 64Mi or 1Gi (default: Null) - dependencies: # (optional) - pip: # relative path to requirements.txt (default: requirements.txt) - conda: # relative path to conda-packages.txt (default: conda-packages.txt) - shell: # relative path to a shell script for system package installation (default: dependencies.sh) -``` - -### TensorFlow Handler - - -```yaml -handler: - type: tensorflow - path: # path to a python file with a Handler class definition, relative to the Cortex root (required) - models: # use this to serve a single model or multiple ones - path: # S3 path to an exported model (e.g. s3://my-bucket/exported_model) (either this or 'paths' field must be provided) - paths: # (either this or 'path' must be provided) - - name: # unique name for the model (e.g. text-generator) (required) - path: # S3 path to an exported model (e.g. s3://my-bucket/exported_model) (required) - signature_key: # name of the signature def to use for prediction (required if your model has more than one signature def) - ... - signature_key: # name of the signature def to use for prediction (required if your model has more than one signature def) - server_side_batching: # (optional) - max_batch_size: # the maximum number of requests to aggregate before running inference - batch_interval: # the maximum amount of time to spend waiting for additional requests before running inference on the batch of requests - config: # arbitrary dictionary passed to the constructor of the Handler class (can be overridden by config passed in job submission) (optional) - python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) - image: # docker image to use for the handler (default: quay.io/cortexlabs/tensorflow-handler:master) - tensorflow_serving_image: # docker image to use for the TensorFlow Serving container (default: quay.io/cortexlabs/tensorflow-serving-cpu:master or quay.io/cortexlabs/tensorflow-serving-gpu:master based on compute) - env: # dictionary of environment variables - log_level: # log level that can be "debug", "info", "warning" or "error" (default: "info") - shm_size: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 
64Mi or 1Gi (default: Null)
-  dependencies: # (optional)
-    pip: # relative path to requirements.txt (default: requirements.txt)
-    conda: # relative path to conda-packages.txt (default: conda-packages.txt)
-    shell: # relative path to a shell script for system package installation (default: dependencies.sh)
-```
-
-## Compute
-
-```yaml
-compute:
-  cpu: # CPU request per worker. One unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m)
-  gpu: # GPU request per worker. One unit of GPU corresponds to one virtual GPU (default: 0)
-  inf: # Inferentia request per replica. One unit of Inf corresponds to one virtual Inferentia chip (default: 0)
-  mem: # memory request per worker. One unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of two counterparts: Ki, Mi, Gi, Ti) (default: Null)
-  node_groups: # to select specific node groups (optional)
-```
-
-## Networking
-
-```yaml
-networking:
-  endpoint: # the endpoint for the API (default: )
+- name: # name of the API (required)
+  kind: BatchAPI # must be "BatchAPI" for batch APIs (required)
+  pod: # pod configuration (required)
+    port: # port to which requests will be sent (default: 8080; exported as $CORTEX_PORT)
+    max_concurrency: # maximum number of requests that will be concurrently sent into the container (default: 1)
+    containers: # configurations for the containers to run (at least one container must be provided)
+      - name: # name of the container (required)
+        image: # docker image to use for the container (required)
+        command: # entrypoint (required)
+        args: # arguments to the entrypoint (default: no args)
+        env: # dictionary of environment variables to set in the container (optional)
+        compute: # compute resource requests (default: see below)
+          cpu: # CPU request for the container; one unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m)
+          gpu: # GPU request for the container; one unit of GPU corresponds to one virtual GPU (default: 0)
+          inf: # Inferentia request for the container; one unit of inf corresponds to one virtual Inferentia chip (default: 0)
+          mem: # memory request for the container; one unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of-two counterparts: Ki, Mi, Gi, Ti) (default: Null)
+          shm: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g.
64Mi or 1Gi (default: Null) + readiness_probe: # periodic probe of container readiness; traffic will not be sent into the pod unless all containers' readiness probes are succeeding (optional) + http_get: # specifies an http endpoint which must respond with status code 200 (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + path: # the path to access on the HTTP server (default: /) + tcp_socket: # specifies a port which must be ready to receive traffic (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + initial_delay_seconds: # number of seconds after the container has started before the probe is initiated (default: 0) + timeout_seconds: # number of seconds until the probe times out (default: 1) + period_seconds: # how often (in seconds) to perform the probe (default: 10) + success_threshold: # minimum consecutive successes for the probe to be considered successful after having failed (default: 1) + failure_threshold: # minimum consecutive failures for the probe to be considered failed after having succeeded (default: 3) + liveness_probe: # periodic probe of container liveness; container will be restarted if the probe fails (optional) + http_get: # specifies an http endpoint which must respond with status code 200 (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + path: # the path to access on the HTTP server (default: /) + tcp_socket: # specifies a port which must be ready to receive traffic (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + exec: # specifies a command to run which must exit with code 0 (only one of http_get, tcp_socket, and exec may be specified) + command: [] # the command to execute inside the container, which is exec'd (not run inside a shell); the working directory is root ('/') in the container's filesystem (required) + initial_delay_seconds: # number of seconds after the container has started before the probe is initiated (default: 0) + timeout_seconds: # number of seconds until the probe times out (default: 1) + period_seconds: # how often (in seconds) to perform the probe (default: 10) + success_threshold: # minimum consecutive successes for the probe to be considered successful after having failed (default: 1) + failure_threshold: # minimum consecutive failures for the probe to be considered failed after having succeeded (default: 3) + node_groups: # a list of node groups on which this API can run (default: all node groups are eligible) + networking: # networking configuration (default: see below) + endpoint: # endpoint for the API (default: ) ``` diff --git a/docs/workloads/batch/handler.md b/docs/workloads/batch/handler.md deleted file mode 100644 index 35eaeda7d5..0000000000 --- a/docs/workloads/batch/handler.md +++ /dev/null @@ -1,125 +0,0 @@ -# Handler implementation - -Batch APIs run distributed and fault-tolerant batch processing jobs on-demand. They can be used for batch inference or data processing workloads. It can also be used for running ML models using a variety of frameworks such as: PyTorch, ONNX, scikit-learn, XGBoost, TensorFlow (if not using `SavedModel`s), etc. - -If you plan on deploying models with TensorFlow in `SavedModel` format and run inferences in batches, you can also use the [TensorFlow Handler](models.md) that was specifically built for this purpose. 
- -## Project files - -Cortex makes all files in the project directory (i.e. the directory which contains `cortex.yaml`) available for use in your Handler class implementation. Python bytecode files (`*.pyc`, `*.pyo`, `*.pyd`), files or folders that start with `.`, and the api configuration file (e.g. `cortex.yaml`) are excluded. - -The following files can also be added at the root of the project's directory: - -* `.cortexignore` file, which follows the same syntax and behavior as a [.gitignore file](https://git-scm.com/docs/gitignore). This may be necessary if you are reaching the size limit for your project directory (32mb). -* `.env` file, which exports environment variables that can be used in the handler class. Each line of this file must follow the `VARIABLE=value` format. - -For example, if your directory looks like this: - -```text -./my-classifier/ -├── cortex.yaml -├── values.json -├── handler.py -├── ... -└── requirements.txt -``` - -You can access `values.json` in your Handler class like this: - -```python -# handler.py - -import json - -class Handler: - def __init__(self, config): - with open('values.json', 'r') as values_file: - values = json.load(values_file) - self.values = values -``` - -## Interface - -```python -# initialization code and variables can be declared here in global scope - -class Handler: - def __init__(self, config, job_spec): - """(Required) Called once during each worker initialization. Performs - setup such as downloading/initializing the model or downloading a - vocabulary. - - Args: - config (required): Dictionary passed from API configuration (if - specified) merged with configuration passed in with Job - Submission API. If there are conflicting keys, values in - configuration specified in Job submission takes precedence. - job_spec (optional): Dictionary containing the following fields: - "job_id": A unique ID for this job - "api_name": The name of this batch API - "config": The config that was provided in the job submission - "workers": The number of workers for this job - "total_batch_count": The total number of batches in this job - "start_time": The time that this job started - """ - pass - - def handle_batch(self, payload, batch_id): - """(Required) Called once per batch. Preprocesses the batch payload (if - necessary), runs inference, postprocesses the inference output (if - necessary), and writes the results to storage (i.e. S3 or a - database, if desired). - - Args: - payload (required): a batch (i.e. a list of one or more samples). - batch_id (optional): uuid assigned to this batch. - Returns: - Nothing - """ - pass - - def on_job_complete(self): - """(Optional) Called once after all batches in the job have been - processed. Performs post job completion tasks such as aggregating - results, executing web hooks, or triggering other jobs. - """ - pass -``` - -## Structured logging - -You can use Cortex's logger in your handler implemention to log in JSON. This will enrich your logs with Cortex's metadata, and you can add custom metadata to the logs by adding key value pairs to the `extra` key when using the logger. For example: - -```python -... -from cortex_internal.lib.log import logger as cortex_logger - -class Handler: - def handle_batch(self, payload, batch_id): - ... - cortex_logger.info("completed processing batch", extra={"batch_id": batch_id, "confidence": confidence}) -``` - -The dictionary passed in via the `extra` will be flattened by one level. e.g. 
- -```text -{"asctime": "2021-01-19 15:14:05,291", "levelname": "INFO", "message": "completed processing batch", "process": 235, "batch_id": "iuasyd8f7", "confidence": 0.97} -``` - -To avoid overriding essential Cortex metadata, please refrain from specifying the following extra keys: `asctime`, `levelname`, `message`, `labels`, and `process`. Log lines greater than 5 MB in size will be ignored. - -## Cortex Python client - -A default [Cortex Python client](../../clients/python.md#cortex.client.client) environment has been configured for your API. This can be used for deploying/deleting/updating or submitting jobs to your running cluster based on the execution flow of your batch handler. For example: - -```python -import cortex - -class Handler: - def on_job_complete(self): - ... - # get client pointing to the default environment - client = cortex.client() - # deploy API in the existing cluster using the artifacts in the previous step - client.deploy(...) -``` diff --git a/docs/workloads/batch/jobs.md b/docs/workloads/batch/jobs.md index 632da17870..751c3fa85e 100644 --- a/docs/workloads/batch/jobs.md +++ b/docs/workloads/batch/jobs.md @@ -1,6 +1,6 @@ # BatchAPI jobs -## Get the TaskAPI endpoint +## Get the Batch API's endpoint ```bash cortex get @@ -42,7 +42,7 @@ POST : ], "batch_size": , # the number of items per batch (the handle_batch() function is called once per batch) (required) } - "config": { # custom fields for this specific job (will override values in `config` specified in your api configuration) (optional) + "config": { # arbitrary input for this specific job (written to /cortex/spec/job.json in the API containers) (optional) "string": } } @@ -81,19 +81,19 @@ If a single S3 file contains a lot of samples/rows, try the next submission stra ```yaml POST : { - "workers": , # the number of workers to allocate for this job (required) - "timeout": , # duration in seconds since the submission of a job before it is terminated (optional) + "workers": , # the number of workers to allocate for this job (required) + "timeout": , # duration in seconds since the submission of a job before it is terminated (optional) "sqs_dead_letter_queue": { # specify a queue to redirect failed batches (optional) "arn": , # arn of dead letter queue e.g. 
arn:aws:sqs:us-west-2:123456789:failed.fifo "max_receive_count": # number of a times a batch is allowed to be handled by a worker before it is considered to be failed and transferred to the dead letter queue (must be >= 1) }, "file_path_lister": { - "s3_paths": [], # can be S3 prefixes or complete S3 paths (required) - "includes": [], # glob patterns (optional) - "excludes": [], # glob patterns (optional) - "batch_size": , # the number of S3 file paths per batch (the handle_batch() function is called once per batch) (required) + "s3_paths": [], # can be S3 prefixes or complete S3 paths (required) + "includes": [], # glob patterns (optional) + "excludes": [], # glob patterns (optional) + "batch_size": , # the number of S3 file paths per batch (the handle_batch() function is called once per batch) (required) } - "config": { # custom fields for this specific job (will override values in `config` specified in your api configuration) (optional) + "config": { # arbitrary input for this specific job (written to /cortex/spec/job.json in the API containers) (optional) "string": } } @@ -131,19 +131,19 @@ This submission pattern is useful in the following scenarios: ```yaml POST : { - "workers": , # the number of workers to allocate for this job (required) - "timeout": , # duration in seconds since the submission of a job before it is terminated (optional) + "workers": , # the number of workers to allocate for this job (required) + "timeout": , # duration in seconds since the submission of a job before it is terminated (optional) "sqs_dead_letter_queue": { # specify a queue to redirect failed batches (optional) "arn": , # arn of dead letter queue e.g. arn:aws:sqs:us-west-2:123456789:failed.fifo "max_receive_count": # number of a times a batch is allowed to be handled by a worker before it is considered to be failed and transferred to the dead letter queue (must be >= 1) }, "delimited_files": { - "s3_paths": [], # can be S3 prefixes or complete S3 paths (required) - "includes": [], # glob patterns (optional) - "excludes": [], # glob patterns (optional) - "batch_size": , # the number of json objects per batch (the handle_batch() function is called once per batch) (required) + "s3_paths": [], # can be S3 prefixes or complete S3 paths (required) + "includes": [], # glob patterns (optional) + "excludes": [], # glob patterns (optional) + "batch_size": , # the number of json objects per batch (the handle_batch() function is called once per batch) (required) } - "config": { # custom fields for this specific job (will override values in `config` specified in your api configuration) (optional) + "config": { # arbitrary input for this specific job (written to /cortex/spec/job.json in the API containers) (optional) "string": } } @@ -188,19 +188,19 @@ RESPONSE: "api_id": , "sqs_url": , "status": , - "batches_in_queue": # number of batches remaining in the queue + "batches_in_queue": # number of batches remaining in the queue "batch_metrics": { - "succeeded": # number of succeeded batches - "failed": int # number of failed attempts + "succeeded": # number of succeeded batches + "failed": int # number of failed attempts "avg_time_per_batch": (optional) # average time spent working on a batch (only considers successful attempts) }, - "worker_counts": { # worker counts are only available while a job is running - "pending": , # number of workers that are waiting for compute resources to be provisioned - "initializing": , # number of workers that are initializing (downloading images or running your handler's init 
function) - "running": , # number of workers that are actively working on batches from the queue - "succeeded": , # number of workers that have completed after verifying that the queue is empty - "failed": , # number of workers that have failed - "stalled": , # number of workers that have been stuck in pending for more than 10 minutes + "worker_counts": { # worker counts are only available while a job is running + "pending": , # number of workers that are waiting for compute resources to be provisioned + "initializing": , # number of workers that are initializing (downloading images or running your handler's init function) + "running": , # number of workers that are actively working on batches from the queue + "succeeded": , # number of workers that have completed after verifying that the queue is empty + "failed": , # number of workers that have failed + "stalled": , # number of workers that have been stuck in pending for more than 10 minutes }, "created_time": "start_time": diff --git a/docs/workloads/batch/models.md b/docs/workloads/batch/models.md deleted file mode 100644 index 9f248482ed..0000000000 --- a/docs/workloads/batch/models.md +++ /dev/null @@ -1,210 +0,0 @@ -# TensorFlow Models - -In addition to the [standard Python Handler](handler.md), Cortex also supports another handler called the TensorFlow handler, which can be used to run TensorFlow models exported as `SavedModel` models. - -## Interface - -**Uses TensorFlow version 2.3.0 by default** - -```python -class Handler: - def __init__(self, tensorflow_client, config, job_spec): - """(Required) Called once during each worker initialization. Performs - setup such as downloading/initializing the model or downloading a - vocabulary. - - Args: - tensorflow_client (required): TensorFlow client which is used to - make predictions. This should be saved for use in handle_batch(). - config (required): Dictionary passed from API configuration (if - specified) merged with configuration passed in with Job - Submission API. If there are conflicting keys, values in - configuration specified in Job submission takes precedence. - job_spec (optional): Dictionary containing the following fields: - "job_id": A unique ID for this job - "api_name": The name of this batch API - "config": The config that was provided in the job submission - "workers": The number of workers for this job - "total_batch_count": The total number of batches in this job - "start_time": The time that this job started - """ - self.client = tensorflow_client - # Additional initialization may be done here - - def handle_batch(self, payload, batch_id): - """(Required) Called once per batch. Preprocesses the batch payload (if - necessary), runs inference (e.g. by calling - self.client.predict(model_input)), postprocesses the inference output - (if necessary), and writes the predictions to storage (i.e. S3 or a - database, if desired). - - Args: - payload (required): a batch (i.e. a list of one or more samples). - batch_id (optional): uuid assigned to this batch. - Returns: - Nothing - """ - pass - - def on_job_complete(self): - """(Optional) Called once after all batches in the job have been - processed. Performs post job completion tasks such as aggregating - results, executing web hooks, or triggering other jobs. - """ - pass -``` - - -Cortex provides a `tensorflow_client` to your Handler class' constructor. 
`tensorflow_client` is an instance of [TensorFlowClient](https://github.com/cortexlabs/cortex/tree/master/python/serve/cortex_internal/lib/client/tensorflow.py) that manages a connection to a TensorFlow Serving container to make predictions using your model. It should be saved as an instance variable in your Handler class, and your `handle_batch()` function should call `tensorflow_client.predict()` to make an inference with your exported TensorFlow model. Preprocessing of the JSON payload and postprocessing of predictions can be implemented in your `handle_batch()` function as well. - -When multiple models are defined using the Handler's `models` field, the `tensorflow_client.predict()` method expects a second argument `model_name` which must hold the name of the model that you want to use for inference (for example: `self.client.predict(payload, "text-generator")`). There is also an optional third argument to specify the model version. - -If you need to share files between your handler implementation and the TensorFlow Serving container, you can create a new directory within `/mnt` (e.g. `/mnt/user`) and write files to it. The entire `/mnt` directory is shared between containers, but do not write to any of the directories in `/mnt` that already exist (they are used internally by Cortex). - -## `predict` method - -Inference is performed by using the `predict` method of the `tensorflow_client` that's passed to the handler's constructor: - -```python -def predict(model_input, model_name, model_version) -> dict: - """ - Run prediction. - - Args: - model_input: Input to the model. - model_name (optional): Name of the model to retrieve (when multiple models are deployed in an API). - When handler.models.paths is specified, model_name should be the name of one of the models listed in the API config. - When handler.models.dir is specified, model_name should be the name of a top-level directory in the models dir. - model_version (string, optional): Version of the model to retrieve. Can be omitted or set to "latest" to select the highest version. - - Returns: - dict: TensorFlow Serving response converted to a dictionary. - """ -``` - -## Specifying models - -Whenever a model path is specified in an API configuration file, it should be a path to an S3 prefix which contains your exported model. Directories may include a single model, or multiple folders each with a single model (note that a "single model" need not be a single file; there can be multiple files for a single model). When multiple folders are used, the folder names must be integer values, and will be interpreted as the model version. Model versions can be any integer, but are typically integer timestamps. It is always assumed that the highest version number is the latest version of your model. - -### API spec - -#### Single model - -The most common pattern is to serve a single model per API. The path to the model is specified in the `path` field in the `handler.models` configuration. For example: - -```yaml -# cortex.yaml - -- name: iris-classifier - kind: BatchAPI - handler: - # ... - type: tensorflow - models: - path: s3://my-bucket/models/text-generator/ -``` - -#### Multiple models - -It is possible to serve multiple models from a single API. The paths to the models are specified in the api configuration, either via the `models.paths` or `models.dir` field in the `handler` configuration. For example: - -```yaml -# cortex.yaml - -- name: iris-classifier - kind: BatchAPI - handler: - # ... 
- type: tensorflow - models: - paths: - - name: iris-classifier - path: s3://my-bucket/models/text-generator/ - # ... -``` - -or: - -```yaml -# cortex.yaml - -- name: iris-classifier - kind: BatchAPI - handler: - # ... - type: tensorflow - models: - dir: s3://my-bucket/models/ -``` - -When using the `models.paths` field, each path must be a valid model directory (see above for valid model directory structures). - -When using the `models.dir` field, the directory provided may contain multiple subdirectories, each of which is a valid model directory. For example: - -```text - s3://my-bucket/models/ - ├── text-generator - | └── * (model files) - └── sentiment-analyzer - ├── 24753823/ - | └── * (model files) - └── 26234288/ - └── * (model files) -``` - -In this case, there are two models in the directory, one of which is named "text-generator", and the other is named "sentiment-analyzer". - -### Structure - -#### On CPU/GPU - -The model path must be a SavedModel export: - -```text - s3://my-bucket/models/text-generator/ - ├── saved_model.pb - └── variables/ - ├── variables.index - ├── variables.data-00000-of-00003 - ├── variables.data-00001-of-00003 - └── variables.data-00002-of-... -``` - -or for a versioned model: - -```text - s3://my-bucket/models/text-generator/ - ├── 1523423423/ (version number, usually a timestamp) - | ├── saved_model.pb - | └── variables/ - | ├── variables.index - | ├── variables.data-00000-of-00003 - | ├── variables.data-00001-of-00003 - | └── variables.data-00002-of-... - └── 2434389194/ (version number, usually a timestamp) - ├── saved_model.pb - └── variables/ - ├── variables.index - ├── variables.data-00000-of-00003 - ├── variables.data-00001-of-00003 - └── variables.data-00002-of-... -``` - -#### On Inferentia - -When Inferentia models are used, the directory structure is slightly different: - -```text - s3://my-bucket/models/text-generator/ - └── saved_model.pb -``` - -or for a versioned model: - -```text - s3://my-bucket/models/text-generator/ - ├── 1523423423/ (version number, usually a timestamp) - | └── saved_model.pb - └── 2434389194/ (version number, usually a timestamp) - └── saved_model.pb -``` diff --git a/docs/workloads/dependencies/example.md b/docs/workloads/dependencies/example.md deleted file mode 100644 index 2fcc1b330f..0000000000 --- a/docs/workloads/dependencies/example.md +++ /dev/null @@ -1,61 +0,0 @@ -# Deploy a project - -You can deploy an API by providing a project directory. Cortex will save the project directory and make it available during API initialization. - -```bash -project/ - ├── model.py - ├── util.py - ├── handler.py - ├── requirements.txt - └── ... -``` - -You can define your Handler class in a separate python file and import code from your project. 
- -```python -# handler.py - -from model import MyModel - -class Handler: - def __init__(self, config): - model = MyModel() - - def handle_post(payload): - return model(payload) -``` - -## Deploy using the Python Client - -```python -import cortex - -api_spec = { - "name": "text-generator", - "kind": "RealtimeAPI", - "handler": { - "type": "python", - "path": "handler.py" - } -} - -cx = cortex.client("cortex") -cx.deploy(api_spec, project_dir=".") -``` - -## Deploy using the CLI - -```yaml -# api.yaml - -- name: text-generator - kind: RealtimeAPI - handler: - type: python - path: handler.py -``` - -```bash -cortex deploy api.yaml -``` diff --git a/docs/workloads/dependencies/images.md b/docs/workloads/dependencies/images.md deleted file mode 100644 index fc9792e857..0000000000 --- a/docs/workloads/dependencies/images.md +++ /dev/null @@ -1,106 +0,0 @@ -# Docker images - -Cortex includes a default set of Docker images with pre-installed Python and system packages but you can build custom images for use in your APIs. Common reasons to do this are to avoid installing dependencies during replica initialization, to have smaller images, and/or to mirror images to your ECR registry (for speed and reliability). - -## Create a Dockerfile - -```bash -mkdir my-api && cd my-api && touch Dockerfile -``` - -Cortex's base Docker images are listed below. Depending on the Cortex Handler and compute type specified in your API configuration, choose one of these images to use as the base for your Docker image: - - -* Python Handler (CPU): `quay.io/cortexlabs/python-handler-cpu:master` -* Python Handler (GPU): choose one of the following: - * `quay.io/cortexlabs/python-handler-gpu:master-cuda10.0-cudnn7` - * `quay.io/cortexlabs/python-handler-gpu:master-cuda10.1-cudnn7` - * `quay.io/cortexlabs/python-handler-gpu:master-cuda10.1-cudnn8` - * `quay.io/cortexlabs/python-handler-gpu:master-cuda10.2-cudnn7` - * `quay.io/cortexlabs/python-handler-gpu:master-cuda10.2-cudnn8` - * `quay.io/cortexlabs/python-handler-gpu:master-cuda11.0-cudnn8` - * `quay.io/cortexlabs/python-handler-gpu:master-cuda11.1-cudnn8` -* Python Handler (Inferentia): `quay.io/cortexlabs/python-handler-inf:master` -* TensorFlow Handler (CPU, GPU, Inferentia): `quay.io/cortexlabs/tensorflow-handler:master` - -The sample `Dockerfile` below inherits from Cortex's Python CPU serving image, and installs 3 packages. `tree` is a system package and `pandas` and `rdkit` are Python packages. - - -```dockerfile -# Dockerfile - -FROM quay.io/cortexlabs/python-handler-cpu:master - -RUN apt-get update \ - && apt-get install -y tree \ - && apt-get clean && rm -rf /var/lib/apt/lists/* - -RUN pip install --no-cache-dir pandas \ - && conda install -y conda-forge::rdkit \ - && conda clean -a -``` - -If you need to upgrade the Python Runtime version on your image, you can follow this procedure: - - - -```Dockerfile -# Dockerfile - -FROM quay.io/cortexlabs/python-handler-cpu:master - -# upgrade python runtime version -RUN conda update -n base -c defaults conda -RUN conda install -n env python=3.8.5 - -# re-install cortex core dependencies -RUN /usr/local/cortex/install-core-dependencies.sh - -# ... -``` - -## Build your image - -```bash -docker build . -t org/my-api:latest -``` - -## Push your image to a container registry - -You can push your built Docker image to a public registry of your choice (e.g. Docker Hub), or to a private registry on ECR or Docker Hub. 
- -For example, to use ECR, first create a repository to store your image: - -```bash -# We create a repository in ECR - -export AWS_REGION="***" -export REGISTRY_URL="***" # this will be in the format ".dkr.ecr..amazonaws.com" - -aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin $REGISTRY_URL - -aws ecr create-repository --repository-name=org/my-api --region=$AWS_REGION -# take note of repository url -``` - -Build and tag your image, and push it to your ECR repository: - -```bash -docker build . -t org/my-api:latest -t :latest - -docker push :latest -``` - -## Configure your API - -```yaml -# cortex.yaml - -- name: my-api - ... - handler: - image: :latest - ... -``` - -Note: for TensorFlow Handlers, two containers run together to serve requests: one runs your Handler code (`quay.io/cortexlabs/tensorflow-handler`), and the other is TensorFlow serving to load the SavedModel (`quay.io/cortexlabs/tensorflow-serving-gpu` or `quay.io/cortexlabs/tensorflow-serving-cpu`). There's a second available field `tensorflow_serving_image` that can be used to override the TensorFlow Serving image. Both of the default serving images (`quay.io/cortexlabs/tensorflow-serving-gpu` and `quay.io/cortexlabs/tensorflow-serving-cpu`) are based on the official TensorFlow Serving image (`tensorflow/serving`). Unless a different version of TensorFlow Serving is required, the TensorFlow Serving image shouldn't have to be overridden, since it's only used to load the SavedModel and does not run your Handler code. diff --git a/docs/workloads/dependencies/python-packages.md b/docs/workloads/dependencies/python-packages.md deleted file mode 100644 index 7178b92527..0000000000 --- a/docs/workloads/dependencies/python-packages.md +++ /dev/null @@ -1,140 +0,0 @@ -# Python packages - -## PyPI packages - -You can install your required PyPI packages and import them in your Python files using pip. Cortex looks for -a `requirements.txt` file in the top level Cortex project directory (i.e. the directory which contains `cortex.yaml`): - -```text -./my-classifier/ -├── cortex.yaml -├── handler.py -├── ... -└── requirements.txt -``` - -If you want to use `conda` to install your python packages, see the [Conda section](#conda-packages) below. - -Note that some packages are pre-installed by default (see "pre-installed packages" for your handler type in the -Realtime API Handler documentation and Batch API Handler documentation). - -## Private PyPI packages - -To install packages from a private PyPI index, create a `pip.conf` inside the same directory as `requirements.txt`, and -add the following contents: - -```text -[global] -extra-index-url = https://:@.com/pip -``` - -In same directory, create a [`dependencies.sh` script](system-packages.md) and add the following: - -```bash -cp pip.conf /etc/pip.conf -``` - -You may now add packages to `requirements.txt` which are found in the private index. - -## GitHub packages - -You can also install public/private packages from git registries (such as GitHub) by adding them to `requirements.txt`. -Here's an example for GitHub: - -```text -# requirements.txt - -# public access -git+https://github.com//.git@#egg= - -# private access -git+https://@github.com//.git@#egg= -``` - -On GitHub, you can generate a personal access token by -following [these steps](https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line) -. 
- -## Installing with Setup - -Python packages can also be installed by providing a `setup.py` that describes your project's modules. Here's an example -directory structure: - -```text -./my-classifier/ -├── cortex.yaml -├── handler.py -├── ... -├── mypkg -│ └── __init__.py -├── requirements.txt -└── setup.py -``` - -In this case, `requirements.txt` will have this form: - -```text -# requirements.txt - -. -``` - -## Conda packages - -Cortex supports installing Conda packages. We recommend only using Conda when your required packages are not available -in PyPI. Cortex looks for a `conda-packages.txt` file in the top level Cortex project directory (i.e. the directory -which contains `cortex.yaml`): - -```text -./my-classifier/ -├── cortex.yaml -├── handler.py -├── ... -└── conda-packages.txt -``` - -The `conda-packages.txt` file follows the format of `conda list --export`. Each line of `conda-packages.txt` should -follow this pattern: `[channel::]package[=version[=buildid]]`. - -Here's an example of `conda-packages.txt`: - -```text -conda-forge::rdkit -conda-forge::pygpu -``` - -In situations where both `requirements.txt` and `conda-packages.txt` are provided, Cortex installs Conda packages -in `conda-packages.txt` followed by PyPI packages in `requirements.txt`. Conda and Pip package managers install packages -and dependencies independently. You may run into situations where Conda and pip package managers install different -versions of the same package because they install and resolve dependencies independently from one another. To resolve -package version conflicts, it may be in your best interest to specify their exact versions in `conda-packages.txt`. - -The current version of Python is `3.6.9`. Updating Python to a different version is possible with Conda, but there are -no guarantees that Cortex's web server will continue functioning correctly. If there's a change in Python's version, the -necessary core packages for the web server will be reinstalled. If you are using a custom base image, any other Python -packages that are built in to the image won't be accessible at runtime. - -Check the [best practices](https://www.anaconda.com/using-pip-in-a-conda-environment/) on using `pip` inside `conda`. - -## Customizing Dependency Paths - -Cortex allows you to specify different dependency paths other than the default ones. This can be useful when deploying -different versions of the same API (e.g. CPU vs GPU dependencies). - -To customize the path for your dependencies, you can specify `handler.dependencies` in your API's configuration file. You can set -one or more fields to specify the path for each dependency type. Each path should be a relative path with respect to the current file. - -For example: - -```yaml -# cortex.yaml - -- name: my-classifier - kind: RealtimeAPI - handler: - (...) - dependencies: - pip: requirement-gpu.txt - conda: conda-packages-gpu.txt - shell: dependencies-gpu.sh -``` diff --git a/docs/workloads/dependencies/system-packages.md b/docs/workloads/dependencies/system-packages.md deleted file mode 100644 index 4f2b60c76b..0000000000 --- a/docs/workloads/dependencies/system-packages.md +++ /dev/null @@ -1,62 +0,0 @@ -# System packages - -Cortex looks for a file named `dependencies.sh` in the top level Cortex project directory (i.e. the directory which contains `cortex.yaml`). For example: - -```text -./my-classifier/ -├── cortex.yaml -├── handler.py -├── ... 
-└── dependencies.sh -``` - -`dependencies.sh` is executed with `bash` shell during the initialization of each replica (before installing Python packages in `requirements.txt` or `conda-packages.txt`). Typical use cases include installing required system packages to be used in your Handler, building Python packages from source, etc. If initialization time is a concern, see [Docker images](images.md) for how to build and use custom Docker images. - -Here is an example `dependencies.sh`, which installs the `tree` utility: - -```bash -apt-get update && apt-get install -y tree -``` - -The `tree` utility can now be called inside your `handler.py`: - -```python -# handler.py - -import subprocess - -class Handler: - def __init__(self, config): - subprocess.run(["tree"]) - ... -``` - -If you need to upgrade the Python Runtime version on your image, you can do so in your `dependencies.sh` file: - -```bash -# upgrade python runtime version -conda update -n base -c defaults conda -conda install -n env python=3.8.5 - -# re-install cortex core dependencies -/usr/local/cortex/install-core-dependencies.sh -``` - -## Customizing Dependency Paths - -Cortex allows you to specify a path for this script other than `dependencies.sh`. This can be useful when deploying -different versions of the same API (e.g. CPU vs GPU dependencies). The path should be a relative path with respect -to the API configuration file, and is specified via `handler.dependencies.shell`. - -For example: - -```yaml -# cortex.yaml - -- name: my-classifier - kind: RealtimeAPI - handler: - (...) - dependencies: - shell: dependencies-gpu.sh -``` diff --git a/docs/workloads/realtime/autoscaling.md b/docs/workloads/realtime/autoscaling.md index ce46c218a4..0b53bf84e5 100644 --- a/docs/workloads/realtime/autoscaling.md +++ b/docs/workloads/realtime/autoscaling.md @@ -1,34 +1,42 @@ # Autoscaling -Cortex autoscales your web services on a per-API basis based on your configuration. +Cortex autoscales each API independently based on its configuration. ## Autoscaling replicas -**`min_replicas`**: The lower bound on how many replicas can be running for an API. +### Relevant pod configuration -
+
+In addition to the autoscaling configuration options (described below), there are two fields in the pod configuration which are relevant to replica autoscaling:
 
-**`max_replicas`**: The upper bound on how many replicas can be running for an API.
 
+**`max_concurrency`** (default: 1): The maximum number of requests that will be concurrently sent into the container by Cortex. If your web server is designed to handle multiple concurrent requests, increasing `max_concurrency` will increase the throughput of a replica (and result in fewer total replicas for a given load).
 
-**`target_replica_concurrency`** (default: `processes_per_replica` * `threads_per_process`): This is the desired number of in-flight requests per replica, and is the metric which the autoscaler uses to make scaling decisions.
 
+**`max_queue_length`** (default: 100): The maximum number of requests which will be queued by the replica (beyond `max_concurrency`) before requests are rejected with HTTP error code 503. For long-running APIs, decreasing `max_queue_length` and configuring the client to retry when it receives 503 responses will improve queue fairness across replicas by preventing requests from sitting in long queues.
-
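+
+For example, these two fields could be set in an API spec as follows (a minimal sketch; the API name, image, and values are placeholders, and the full schema is described in the configuration reference):
+
+```yaml
+- name: my-api
+  kind: RealtimeAPI
+  pod:
+    max_concurrency: 4     # each replica processes up to 4 requests concurrently
+    max_queue_length: 64   # up to 64 additional requests are queued before 503s are returned
+    containers:
+      - name: api
+        image: quay.io/my-org/my-api:latest  # hypothetical image
+```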
-The autoscaler uses this formula to determine the number of desired replicas: +### Autoscaling configuration + +**`min_replicas`**: The lower bound on how many replicas can be running for an API. -`desired replicas = sum(in-flight requests accross all replicas) / target_replica_concurrency` +
-For example, setting `target_replica_concurrency` to `processes_per_replica` * `threads_per_process` (the default) causes the cluster to adjust the number of replicas so that on average, requests are immediately processed without waiting in a queue, and processes/threads are never idle. +**`max_replicas`**: The upper bound on how many replicas can be running for an API.
-**`max_replica_concurrency`** (default: 1024): This is the maximum number of in-flight requests per replica before requests are rejected with HTTP error code 503. `max_replica_concurrency` includes requests that are currently being processed as well as requests that are waiting in the replica's queue (a replica can actively process `processes_per_replica` * `threads_per_process` requests concurrently, and will hold any additional requests in a local queue). Decreasing `max_replica_concurrency` and configuring the client to retry when it receives 503 responses will improve queue fairness accross replicas by preventing requests from sitting in long queues. +**`target_in_flight`** (default: `max_concurrency` in the pod configuration): This is the desired number of in-flight requests per replica, and is the metric which the autoscaler uses to make scaling decisions. The number of in-flight requests is simply how many requests have been sent to a replica and have not yet been responded to. Therefore, this number includes requests which are actively being processed as well as requests which are waiting in the replica's queue. + +The autoscaler uses this formula to determine the number of desired replicas: + +`desired replicas = sum(in-flight requests accross all replicas) / target_in_flight` + +For example, setting `target_in_flight` to `max_concurrency` (the default) causes the cluster to adjust the number of replicas so that on average, requests are immediately processed without waiting in a queue.
-**`window`** (default: 60s): The time over which to average the API wide in-flight requests (which is the sum of in-flight requests in each replica). The longer the window, the slower the autoscaler will react to changes in API wide in-flight requests, since it is averaged over the `window`. API wide in-flight requests is calculated every 10 seconds, so `window` must be a multiple of 10 seconds. +**`window`** (default: 60s): The time over which to average the API's in-flight requests (which is the sum of in-flight requests in each replica). The longer the window, the slower the autoscaler will react to changes in in-flight requests, since it is averaged over the `window`. An API's in-flight requests is calculated every 10 seconds, so `window` must be a multiple of 10 seconds.
@@ -62,14 +70,12 @@ Cortex spins up and down instances based on the aggregate resource requests of a
 
 ## Overprovisioning
 
-The default value for `target_replica_concurrency` is `processes_per_replica` * `threads_per_process`, which behaves well in many situations (see above for an explanation of how `target_replica_concurrency` affects autoscaling). However, if your application is sensitive to spikes in traffic or if creating new replicas takes too long (see below), you may find it helpful to maintain extra capacity to handle the increased traffic while new replicas are being created. This can be accomplished by setting `target_replica_concurrency` to a lower value relative to the expected replica's concurrency. The smaller `target_replica_concurrency` is, the more unused capacity your API will have, and the more room it will have to handle sudden increased load. The increased request rate will still trigger the autoscaler, and your API will stabilize again (maintaining the overprovisioned capacity).
+The default value for `target_in_flight` is `max_concurrency`, which behaves well in many situations (see above for an explanation of how `target_in_flight` affects autoscaling). However, if your application is sensitive to spikes in traffic or if creating new replicas takes too long (see below), you may find it helpful to maintain extra capacity to handle the increased traffic while new replicas are being created. This can be accomplished by setting `target_in_flight` to a lower value relative to the expected replica's concurrency. The smaller `target_in_flight` is, the more unused capacity your API will have, and the more room it will have to handle sudden increased load. The increased request rate will still trigger the autoscaler, and your API will stabilize again (maintaining the overprovisioned capacity).
 
-For example, if you've determined that each replica in your API can handle 2 requests, you would set `target_replica_concurrency` to 2. In a scenario where your API is receiving 8 concurrent requests on average, the autoscaler would maintain 4 live replicas (8/2 = 4). If you wanted to overprovision by 25%, you can set `target_replica_concurrency` to 1.6 causing the autoscaler maintain 5 live replicas (8/1.6 = 5).
+For example, if you've determined that each replica in your API can handle 2 concurrent requests, you would typically set `target_in_flight` to 2. In a scenario where your API is receiving 8 concurrent requests on average, the autoscaler would maintain 4 live replicas (8/2 = 4). If you wanted to overprovision by 25%, you could set `target_in_flight` to 1.6, causing the autoscaler to maintain 5 live replicas (8/1.6 = 5).
 
 ## Autoscaling responsiveness
 
-Assuming that `window` and `upscale_stabilization_period` are set to their default values (1 minute), it could take up to 2 minutes of increased traffic before an extra replica is requested. As soon as the additional replica is requested, the replica request will be visible in the output of `cortex get`, but the replica won't yet be running. If an extra instance is required to schedule the newly requested replica, it could take a few minutes for AWS to provision the instance (depending on the instance type), plus a few minutes for the newly provisioned instance to download your api image and for the api to initialize (via its `__init__()` method).
+Assuming that `window` and `upscale_stabilization_period` are set to their default values (1 minute), it could take up to 2 minutes of increased traffic before an extra replica is requested. As soon as the additional replica is requested, the replica request will be visible in the output of `cortex get`, but the replica won't yet be running. If an extra instance is required to schedule the newly requested replica, it could take a few minutes for AWS to provision the instance (depending on the instance type), plus a few minutes for the newly provisioned instance to download your api image and for the api to initialize. Keep these delays in mind when considering overprovisioning (see above) and when determining appropriate values for `window` and `upscale_stabilization_period`. If you want the autoscaler to react as quickly as possible, set `upscale_stabilization_period` and `window` to their minimum values (0s and 10s respectively). - -If it takes a long time to initialize your API replica (i.e. install dependencies and run your handler's `__init__()` function), consider building your own API image to use instead of the default image. With this approach, you can pre-download/build/install any custom dependencies and bake them into the image. diff --git a/docs/workloads/realtime/configuration.md b/docs/workloads/realtime/configuration.md index 81f8e2a63f..9d2bf85bf6 100644 --- a/docs/workloads/realtime/configuration.md +++ b/docs/workloads/realtime/configuration.md @@ -1,128 +1,66 @@ # Configuration ```yaml -- name: - kind: RealtimeAPI - handler: # detailed configuration below - compute: # detailed configuration below - autoscaling: # detailed configuration below - update_strategy: # detailed configuration below - networking: # detailed configuration below -``` - -## Handler - -### Python Handler - - -```yaml -handler: - type: python - path: # path to a python file with a Handler class definition, relative to the Cortex root (required) - protobuf_path: # path to a protobuf file (required if using gRPC) - dependencies: # (optional) - pip: # relative path to requirements.txt (default: requirements.txt) - conda: # relative path to conda-packages.txt (default: conda-packages.txt) - shell: # relative path to a shell script for system package installation (default: dependencies.sh) - multi_model_reloading: # use this to serve one or more models with live reloading (optional) - path: # S3 path to an exported model directory (e.g. s3://my-bucket/exported_model/) (either this, 'dir', or 'paths' must be provided if 'multi_model_reloading' is specified) - paths: # list of S3 paths to exported model directories (either this, 'dir', or 'path' must be provided if 'multi_model_reloading' is specified) - - name: # unique name for the model (e.g. text-generator) (required) - path: # S3 path to an exported model directory (e.g. s3://my-bucket/exported_model/) (required) - ... - dir: # S3 path to a directory containing multiple models (e.g. 
s3://my-bucket/models/) (either this, 'path', or 'paths' must be provided if 'multi_model_reloading' is specified) - cache_size: # the number models to keep in memory (optional; all models are kept in memory by default) - disk_cache_size: # the number of models to keep on disk (optional; all models are kept on disk by default) - server_side_batching: # (optional) - max_batch_size: # the maximum number of requests to aggregate before running inference - batch_interval: # the maximum amount of time to spend waiting for additional requests before running inference on the batch of requests - processes_per_replica: # the number of parallel serving processes to run on each replica (default: 1) - threads_per_process: # the number of threads per process (default: 1) - config: # arbitrary dictionary passed to the constructor of the Handler class (optional) - python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) - image: # docker image to use for the Handler class (default: quay.io/cortexlabs/python-handler-cpu:master, quay.io/cortexlabs/python-handler-gpu:master-cuda10.2-cudnn8, or quay.io/cortexlabs/python-handler-inf:master based on compute) - env: # dictionary of environment variables - log_level: # log level that can be "debug", "info", "warning" or "error" (default: "info") - shm_size: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 64Mi or 1Gi (default: Null) -``` - -### TensorFlow Handler - - -```yaml -handler: - type: tensorflow - path: # path to a python file with a Handler class definition, relative to the Cortex root (required) - protobuf_path: # path to a protobuf file (required if using gRPC) - dependencies: # (optional) - pip: # relative path to requirements.txt (default: requirements.txt) - conda: # relative path to conda-packages.txt (default: conda-packages.txt) - shell: # relative path to a shell script for system package installation (default: dependencies.sh) - models: # (required) - path: # S3 path to an exported SavedModel directory (e.g. s3://my-bucket/exported_model/) (either this, 'dir', or 'paths' must be provided) - paths: # list of S3 paths to exported SavedModel directories (either this, 'dir', or 'path' must be provided) - - name: # unique name for the model (e.g. text-generator) (required) - path: # S3 path to an exported SavedModel directory (e.g. s3://my-bucket/exported_model/) (required) - signature_key: # name of the signature def to use for prediction (required if your model has more than one signature def) - ... - dir: # S3 path to a directory containing multiple SavedModel directories (e.g. 
s3://my-bucket/models/) (either this, 'path', or 'paths' must be provided) - signature_key: # name of the signature def to use for prediction (required if your model has more than one signature def) - cache_size: # the number models to keep in memory (optional; all models are kept in memory by default) - disk_cache_size: # the number of models to keep on disk (optional; all models are kept on disk by default) - server_side_batching: # (optional) - max_batch_size: # the maximum number of requests to aggregate before running inference - batch_interval: # the maximum amount of time to spend waiting for additional requests before running inference on the batch of requests - processes_per_replica: # the number of parallel serving processes to run on each replica (default: 1) - threads_per_process: # the number of threads per process (default: 1) - config: # arbitrary dictionary passed to the constructor of the Handler class (optional) - python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) - image: # docker image to use for the handler (default: quay.io/cortexlabs/tensorflow-handler:master) - tensorflow_serving_image: # docker image to use for the TensorFlow Serving container (default: quay.io/cortexlabs/tensorflow-serving-cpu:master, quay.io/cortexlabs/tensorflow-serving-gpu:master, or quay.io/cortexlabs/tensorflow-serving-inf:master based on compute) - env: # dictionary of environment variables - log_level: # log level that can be "debug", "info", "warning" or "error" (default: "info") - shm_size: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 64Mi or 1Gi (default: Null) -``` - -## Compute - -```yaml -compute: - cpu: # CPU request per replica. One unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m) - gpu: # GPU request per replica. One unit of GPU corresponds to one virtual GPU (default: 0) - inf: # Inferentia request per replica. One unit of Inf corresponds to one virtual Inferentia chip (default: 0) - mem: # memory request per replica. 
One unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of two counterparts: Ki, Mi, Gi, Ti) (default: Null) - node_groups: # to select specific node groups (optional) -``` - -## Autoscaling - -```yaml -autoscaling: - min_replicas: # minimum number of replicas (default: 1) - max_replicas: # maximum number of replicas (default: 100) - init_replicas: # initial number of replicas (default: ) - max_replica_concurrency: # the maximum number of in-flight requests per replica before requests are rejected with error code 503 (default: 1024) - target_replica_concurrency: # the desired number of in-flight requests per replica, which the autoscaler tries to maintain (default: processes_per_replica * threads_per_process) - window: # the time over which to average the API's concurrency (default: 60s) - downscale_stabilization_period: # the API will not scale below the highest recommendation made during this period (default: 5m) - upscale_stabilization_period: # the API will not scale above the lowest recommendation made during this period (default: 1m) - max_downscale_factor: # the maximum factor by which to scale down the API on a single scaling event (default: 0.75) - max_upscale_factor: # the maximum factor by which to scale up the API on a single scaling event (default: 1.5) - downscale_tolerance: # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) - upscale_tolerance: # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) -``` - -## Update strategy - -```yaml -update_strategy: - max_surge: # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates) - max_unavailable: # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 
10% (default: 25%) -``` - -## Networking - -```yaml - networking: - endpoint: # the endpoint for the API (default: ) +- name: # name of the API (required) + kind: RealtimeAPI # must be "RealtimeAPI" for realtime APIs (required) + pod: # pod configuration (required) + port: # port to which requests will be sent (default: 8080; exported as $CORTEX_PORT) + max_concurrency: # maximum number of requests that will be concurrently sent into the container (default: 1) + max_queue_length: # maximum number of requests per replica which will be queued (beyond max_concurrency) before requests are rejected with error code 503 (default: 100) + containers: # configurations for the containers to run (at least one container must be provided) + - name: # name of the container (required) + image: # docker image to use for the container (required) + command: # entrypoint (default: the docker image's ENTRYPOINT) + args: # arguments to the entrypoint (default: the docker image's CMD) + env: # dictionary of environment variables to set in the container (optional) + compute: # compute resource requests (default: see below) + cpu: # CPU request for the container; one unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m) + gpu: # GPU request for the container; one unit of GPU corresponds to one virtual GPU (default: 0) + inf: # Inferentia request for the container; one unit of inf corresponds to one virtual Inferentia chip (default: 0) + mem: # memory request for the container; one unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of-two counterparts: Ki, Mi, Gi, Ti) (default: Null) + shm: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g.
64Mi or 1Gi (default: Null) + readiness_probe: # periodic probe of container readiness; traffic will not be sent into the pod unless all containers' readiness probes are succeeding (optional) + http_get: # specifies an http endpoint which must respond with status code 200 (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + path: # the path to access on the HTTP server (default: /) + tcp_socket: # specifies a port which must be ready to receive traffic (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + exec: # specifies a command to run which must exit with code 0 (only one of http_get, tcp_socket, and exec may be specified) + command: [] # the command to execute inside the container, which is exec'd (not run inside a shell); the working directory is root ('/') in the container's filesystem (required) + initial_delay_seconds: # number of seconds after the container has started before the probe is initiated (default: 0) + timeout_seconds: # number of seconds until the probe times out (default: 1) + period_seconds: # how often (in seconds) to perform the probe (default: 10) + success_threshold: # minimum consecutive successes for the probe to be considered successful after having failed (default: 1) + failure_threshold: # minimum consecutive failures for the probe to be considered failed after having succeeded (default: 3) + liveness_probe: # periodic probe of container liveness; container will be restarted if the probe fails (optional) + http_get: # specifies an http endpoint which must respond with status code 200 (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + path: # the path to access on the HTTP server (default: /) + tcp_socket: # specifies a port which must be ready to receive traffic (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + exec: # specifies a command to run which must exit with code 0 (only one of http_get, tcp_socket, and exec may be specified) + command: [] # the command to execute inside the container, which is exec'd (not run inside a shell); the working directory is root ('/') in the container's filesystem (required) + initial_delay_seconds: # number of seconds after the container has started before the probe is initiated (default: 0) + timeout_seconds: # number of seconds until the probe times out (default: 1) + period_seconds: # how often (in seconds) to perform the probe (default: 10) + success_threshold: # minimum consecutive successes for the probe to be considered successful after having failed (default: 1) + failure_threshold: # minimum consecutive failures for the probe to be considered failed after having succeeded (default: 3) + autoscaling: # autoscaling configuration (default: see below) + min_replicas: # minimum number of replicas (default: 1) + max_replicas: # maximum number of replicas (default: 100) + init_replicas: # initial number of replicas (default: ) + target_in_flight: # desired number of in-flight requests per replica (including requests actively being processed as well as queued), which the autoscaler tries to maintain (default: ) + window: # duration over which to average the API's in-flight requests per replica (default: 60s) + downscale_stabilization_period: # the API will not scale below the highest recommendation made during this period (default: 5m) + 
upscale_stabilization_period: # the API will not scale above the lowest recommendation made during this period (default: 1m) + max_downscale_factor: # maximum factor by which to scale down the API on a single scaling event (default: 0.75) + max_upscale_factor: # maximum factor by which to scale up the API on a single scaling event (default: 1.5) + downscale_tolerance: # any recommendation falling within this factor below the current number of replicas will not trigger a scale down event (default: 0.05) + upscale_tolerance: # any recommendation falling within this factor above the current number of replicas will not trigger a scale up event (default: 0.05) + node_groups: # a list of node groups on which this API can run (default: all node groups are eligible) + update_strategy: # deployment strategy to use when replacing existing replicas with new ones (default: see below) + max_surge: # maximum number of replicas that can be scheduled above the desired number of replicas during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) (set to 0 to disable rolling updates) + max_unavailable: # maximum number of replicas that can be unavailable during an update; can be an absolute number, e.g. 5, or a percentage of desired replicas, e.g. 10% (default: 25%) + networking: # networking configuration (default: see below) + endpoint: # endpoint for the API (default: ) ``` diff --git a/docs/workloads/realtime/handler.md b/docs/workloads/realtime/handler.md deleted file mode 100644 index cfe3e4edfa..0000000000 --- a/docs/workloads/realtime/handler.md +++ /dev/null @@ -1,563 +0,0 @@ -# Handler implementation - -Realtime APIs respond to requests in real-time and autoscale based on in-flight request volumes. They can be used for realtime inference or data processing workloads. - -If you plan on deploying ML models and run realtime inferences, check out the [Models](models.md) page. Cortex provides out-of-the-box support for a variety of frameworks such as: PyTorch, ONNX, scikit-learn, XGBoost, TensorFlow, etc. - -The response type of the handler can vary depending on your requirements, see [HTTP API responses](#http-responses) and [gRPC API responses](#grpc-responses) below. - -## Project files - -Cortex makes all files in the project directory (i.e. the directory which contains `cortex.yaml`) available for use in your Handler class implementation. Python bytecode files (`*.pyc`, `*.pyo`, `*.pyd`), files or folders that start with `.`, and the api configuration file (e.g. `cortex.yaml`) are excluded. - -The following files can also be added at the root of the project's directory: - -* `.cortexignore` file, which follows the same syntax and behavior as a [.gitignore file](https://git-scm.com/docs/gitignore). This may be necessary if you are reaching the size limit for your project directory (32mb). -* `.env` file, which exports environment variables that can be used in the handler. Each line of this file must follow the `VARIABLE=value` format. - -For example, if your directory looks like this: - -```text -./my-classifier/ -├── cortex.yaml -├── values.json -├── handler.py -├── ... 
-└── requirements.txt -``` - -You can access `values.json` in your handler class like this: - -```python -# handler.py - -import json - -class Handler: - def __init__(self, config): - with open('values.json', 'r') as values_file: - values = json.load(values_file) - self.values = values -``` - -## HTTP - -### Handler - -```python -# initialization code and variables can be declared here in global scope - -class Handler: - def __init__(self, config): - """(Required) Called once before the API becomes available. Performs - setup such as downloading/initializing the model or downloading a - vocabulary. - - Args: - config (required): Dictionary passed from API configuration (if - specified). This may contain information on where to download - the model and/or metadata. - """ - pass - - def handle_(self, payload, query_params, headers): - """(Required) Called once per request. Preprocesses the request payload - (if necessary), runs workload, and postprocesses the workload output - (if necessary). - - Args: - payload (optional): The request payload (see below for the possible - payload types). - query_params (optional): A dictionary of the query parameters used - in the request. - headers (optional): A dictionary of the headers sent in the request. - - Returns: - Result or a batch of results. - """ - pass -``` - -Your `Handler` class can implement methods for each of the following HTTP methods: POST, GET, PUT, PATCH, DELETE. Therefore, the respective methods in the `Handler` definition can be `handle_post`, `handle_get`, `handle_put`, `handle_patch`, and `handle_delete`. - -For proper separation of concerns, it is recommended to use the constructor's `config` parameter for information such as from where to download the model and initialization files, or any configurable model parameters. You define `config` in your API configuration, and it is passed through to your Handler's constructor. - -Your API can accept requests with different types of payloads such as `JSON`-parseable, `bytes` or `starlette.datastructures.FormData` data. See [HTTP API requests](#http-requests) to learn about how headers can be used to change the type of `payload` that is passed into your handler method. - -Your handler method can return different types of objects such as `JSON`-parseable, `string`, and `bytes` objects. See [HTTP API responses](#http-responses) to learn about how to configure your handler method to respond with different response codes and content-types. - -### Callbacks - -A callback is a function that starts running in the background after the results have been sent back to the client. They are meant to be short-lived. - -Each handler method of your class can implement callbacks. To do this, when returning the result(s) from your handler method, also make sure to return a 2-element tuple in which the first element are your results that you want to return and the second element is a callable object that takes no arguments. - -You can implement a callback like in the following example: - -```python -def handle_post(self, payload): - def _callback(): - print("message that gets printed after the response is sent back to the user") - return "results", _callback -``` - -### HTTP requests - -The type of the `payload` parameter in `handle_(self, payload)` can vary based on the content type of the request. The `payload` parameter is parsed according to the `Content-Type` header in the request. Here are the parsing rules (see below for examples): - -1. 
For `Content-Type: application/json`, `payload` will be the parsed JSON body. -1. For `Content-Type: multipart/form-data` / `Content-Type: application/x-www-form-urlencoded`, `payload` will be `starlette.datastructures.FormData` (key-value pairs where the values are strings for text data, or `starlette.datastructures.UploadFile` for file uploads; see [Starlette's documentation](https://www.starlette.io/requests/#request-files)). -1. For `Content-Type: text/plain`, `payload` will be a string. `utf-8` encoding is assumed, unless specified otherwise (e.g. via `Content-Type: text/plain; charset=us-ascii`) -1. For all other `Content-Type` values, `payload` will be the raw `bytes` of the request body. - -Here are some examples: - -#### JSON data - -##### Making the request - -```bash -curl http://***.amazonaws.com/my-api \ - -X POST -H "Content-Type: application/json" \ - -d '{"key": "value"}' -``` - -##### Reading the payload - -When sending a JSON payload, the `payload` parameter will be a Python object: - -```python -class Handler: - def __init__(self, config): - pass - - def handle_post(self, payload): - print(payload["key"]) # prints "value" -``` - -#### Binary data - -##### Making the request - -```bash -curl http://***.amazonaws.com/my-api \ - -X POST -H "Content-Type: application/octet-stream" \ - --data-binary @object.pkl -``` - -##### Reading the payload - -Since the `Content-Type: application/octet-stream` header is used, the `payload` parameter will be a `bytes` object: - -```python -import pickle - -class Handler: - def __init__(self, config): - pass - - def handle_post(self, payload): - obj = pickle.loads(payload) - print(obj["key"]) # prints "value" -``` - -Here's an example if the binary data is an image: - -```python -from PIL import Image -import io - -class Handler: - def __init__(self, config): - pass - - def handle_post(self, payload, headers): - img = Image.open(io.BytesIO(payload)) # read the payload bytes as an image - print(img.size) -``` - -#### Form data (files) - -##### Making the request - -```bash -curl http://***.amazonaws.com/my-api \ - -X POST \ - -F "text=@text.txt" \ - -F "object=@object.pkl" \ - -F "image=@image.png" -``` - -##### Reading the payload - -When sending files via form data, the `payload` parameter will be `starlette.datastructures.FormData` (key-value pairs where the values are `starlette.datastructures.UploadFile`, see [Starlette's documentation](https://www.starlette.io/requests/#request-files)). Either `Content-Type: multipart/form-data` or `Content-Type: application/x-www-form-urlencoded` can be used (typically `Content-Type: multipart/form-data` is used for files, and is the default in the examples above). 
- -```python -from PIL import Image -import pickle - -class Handler: - def __init__(self, config): - pass - - def handle_post(self, payload): - text = payload["text"].file.read() - print(text.decode("utf-8")) # prints the contents of text.txt - - obj = pickle.load(payload["object"].file) - print(obj["key"]) # prints "value" assuming `object.pkl` is a pickled dictionary {"key": "value"} - - img = Image.open(payload["image"].file) - print(img.size) # prints the dimensions of image.png -``` - -#### Form data (text) - -##### Making the request - -```bash -curl http://***.amazonaws.com/my-api \ - -X POST \ - -d "key=value" -``` - -##### Reading the payload - -When sending text via form data, the `payload` parameter will be `starlette.datastructures.FormData` (key-value pairs where the values are strings, see [Starlette's documentation](https://www.starlette.io/requests/#request-files)). Either `Content-Type: multipart/form-data` or `Content-Type: application/x-www-form-urlencoded` can be used (typically `Content-Type: application/x-www-form-urlencoded` is used for text, and is the default in the examples above). - -```python -class Handler: - def __init__(self, config): - pass - - def handle_post(self, payload): - print(payload["key"]) # will print "value" -``` - -#### Text data - -##### Making the request - -```bash -curl http://***.amazonaws.com/my-api \ - -X POST -H "Content-Type: text/plain" \ - -d "hello world" -``` - -##### Reading the payload - -Since the `Content-Type: text/plain` header is used, the `payload` parameter will be a `string` object: - -```python -class Handler: - def __init__(self, config): - pass - - def handle_post(self, payload): - print(payload) # prints "hello world" -``` - -### HTTP responses - -The response of your `handle_()` method may be: - -1. A JSON-serializable object (*lists*, *dictionaries*, *numbers*, etc.) - -1. A `string` object (e.g. `"class 1"`) - -1. A `bytes` object (e.g. `bytes(4)` or `pickle.dumps(obj)`) - -1. An instance of [starlette.responses.Response](https://www.starlette.io/responses/#response) - -## gRPC - -To serve your API using the gRPC protocol, make sure the `handler.protobuf_path` field in your API configuration is pointing to a protobuf file. When the API gets deployed, Cortex will compile the protobuf file for its use when serving the API. - -### Python Handler - -#### Interface - -```python -# initialization code and variables can be declared here in global scope - -class Handler: - def __init__(self, config, module_proto_pb2): - """(Required) Called once before the API becomes available. Performs - setup such as downloading/initializing the model or downloading a - vocabulary. - - Args: - config (required): Dictionary passed from API configuration (if - specified). This may contain information on where to download - the model and/or metadata. - module_proto_pb2 (required): Loaded Python module containing the - class definitions of the messages defined in the protobuf - file (`handler.protobuf_path`). - """ - self.module_proto_pb2 = module_proto_pb2 - - def (self, payload, context): - """(Required) Called once per request. Preprocesses the request payload - (if necessary), runs workload, and postprocesses the workload output - (if necessary). - - Args: - payload (optional): The request payload (see below for the possible - payload types). - context (optional): gRPC context. - - Returns: - Result (when streaming is not used). - - Yield: - Result (when streaming is used). 
- """ - pass -``` - -Your `Handler` class must implement the RPC methods found in the protobuf. Your protobuf must have a single service defined, which can have any name. If your service has 2 RPC methods called `Info` and `Predict` methods, then your `Handler` class must also implement these methods like in the above `Handler` template. - -For proper separation of concerns, it is recommended to use the constructor's `config` parameter for information such as from where to download the model and initialization files, or any configurable model parameters. You define `config` in your API configuration, and it is passed through to your Handler class' constructor. - -Your API can only accept the type that has been specified in the protobuf definition of your service's method. See [gRPC API requests](#grpc-requests) for how to construct gRPC requests. - -Your handler method(s) can only return the type that has been specified in the protobuf definition of your service's method(s). See [gRPC API responses](#grpc-responses) for how to handle gRPC responses. - -### gRPC requests - -Assuming the following service: - -```protobuf -# handler.proto - -syntax = "proto3"; -package sample_service; - -service Handler { - rpc Predict (Sample) returns (Response); -} - -message Sample { - string a = 1; -} - -message Response { - string b = 1; -} -``` - -The handler implementation will also have a corresponding `Predict` method defined that represents the RPC method in the above protobuf service. The name(s) of the RPC method(s) is not enforced by Cortex. - -The type of the `payload` parameter passed into `Predict(self, payload)` will match that of the `Sample` message defined in the `handler.protobuf_path` file. For this example, we'll assume that the above protobuf file was specified for the API. - -#### Simple request - -The service method must look like this: - -```protobuf -... -rpc Predict (Sample) returns (Response); -... -``` - -##### Making the request - -```python -import grpc, handler_pb2, handler_pb2_grpc - -stub = handler_pb2_grpc.HandlerStub(grpc.insecure_channel("***.amazonaws.com:80")) -stub.Predict(handler_pb2.Sample(a="text")) -``` - -##### Reading the payload - -In the `Predict` method, you'll read the value like this: - -```python -... -def Predict(self, payload): - print(payload.a) -... -``` - -#### Streaming request - -The service method must look like this: - -```protobuf -... -rpc Predict (stream Sample) returns (Response); -... -``` - -##### Making the request - -```python -import grpc, handler_pb2, handler_pb2_grpc - -def generate_iterator(sample_list): - for sample in sample_list: - yield sample - -stub = handler_pb2_grpc.HandlerStub(grpc.insecure_channel("***.amazonaws.com:80")) -stub.Predict(handler_pb2.Sample(generate_iterator(["a", "b", "c", "d"]))) -``` - -##### Reading the payload - -In the `Predict` method, you'll read the streamed values like this: - -```python -... -def Predict(self, payload): - for item in payload: - print(item.a) -... -``` - -### gRPC responses - -Assuming the following service: - -```protobuf -# handler.proto - -syntax = "proto3"; -package sample_service; - -service Handler { - rpc Predict (Sample) returns (Response); -} - -message Sample { - string a = 1; -} - -message Response { - string b = 1; -} -``` - -The handler implementation will also have a corresponding `Predict` method defined that represents the RPC method in the above protobuf service. The name(s) of the RPC method(s) is not enforced by Cortex. 
- -The type of the value that you return in your `Predict()` method must match the `Response` message defined in the `handler.protobuf_path` file. For this example, we'll assume that the above protobuf file was specified for the API. - -#### Simple response - -The service method must look like this: - -```protobuf -... -rpc Predict (Sample) returns (Response); -... -``` - -##### Making the request - -```python -import grpc, handler_pb2, handler_pb2_grpc - -stub = handler_pb2_grpc.HandlerStub(grpc.insecure_channel("***.amazonaws.com:80")) -r = stub.Predict(handler_pb2.Sample()) -``` - -##### Returning the response - -In the `Predict` method, you'll return the value like this: - -```python -... -def Predict(self, payload): - return self.proto_module_pb2.Response(b="text") -... -``` - -#### Streaming response - -The service method must look like this: - -```protobuf -... -rpc Predict (Sample) returns (stream Response); -... -``` - -##### Making the request - -```python -import grpc, handler_pb2, handler_pb2_grpc - -def generate_iterator(sample_list): - for sample in sample_list: - yield sample - -stub = handler_pb2_grpc.HandlerStub(grpc.insecure_channel("***.amazonaws.com:80")) -for r in stub.Predict(handler_pb2.Sample())): - print(r.b) -``` - -##### Returning the response - -In the `Predict` method, you'll return the streamed values like this: - -```python -... -def Predict(self, payload): - for text in ["a", "b", "c", "d"]: - yield self.proto_module_pb2.Response(b=text) -... -``` - -## Chaining APIs - -It is possible to make requests from one API to another within a Cortex cluster. All running APIs are accessible from within the handler implementation at `http://api-:8888/`, where `` is the name of the API you are making a request to. - -For example, if there is an api named `text-generator` running in the cluster, you could make a request to it from a different API by using: - -```python -import requests - -class Handler: - def handle_post(self, payload): - response = requests.post("http://api-text-generator:8888/", json={"text": "machine learning is"}) - # ... -``` - -Note that the autoscaling configuration (i.e. `target_replica_concurrency`) for the API that is making the request should be modified with the understanding that requests will still be considered "in-flight" with the first API as the request is being fulfilled in the second API (during which it will also be considered "in-flight" with the second API). - -## Structured logging - -You can use Cortex's logger in your handler implemention to log in JSON. This will enrich your logs with Cortex's metadata, and you can add custom metadata to the logs by adding key value pairs to the `extra` key when using the logger. For example: - -```python -... -from cortex_internal.lib.log import logger as cortex_logger - -class Handler: - def handle_post(self, payload): - cortex_logger.info("received payload", extra={"payload": payload}) -``` - -The dictionary passed in via the `extra` will be flattened by one level. e.g. - -```text -{"asctime": "2021-01-19 15:14:05,291", "levelname": "INFO", "message": "received payload", "process": 235, "payload": "this movie is awesome"} -``` - -To avoid overriding essential Cortex metadata, please refrain from specifying the following extra keys: `asctime`, `levelname`, `message`, `labels`, and `process`. Log lines greater than 5 MB in size will be ignored. 
- -## Cortex Python client - -A default [Cortex Python client](../../clients/python.md#cortex.client.client) environment has been configured for your API. This can be used for deploying/deleting/updating or submitting jobs to your running cluster based on the execution flow of your handler. For example: - -```python -import cortex - -class Handler: - def __init__(self, config): - ... - # get client pointing to the default environment - client = cortex.client() - # get the existing apis in the cluster for something important to you - existing_apis = client.list_apis() -``` diff --git a/docs/workloads/realtime/models.md b/docs/workloads/realtime/models.md deleted file mode 100644 index 6f8685fa80..0000000000 --- a/docs/workloads/realtime/models.md +++ /dev/null @@ -1,439 +0,0 @@ -# Models - -Live model reloading is a mechanism that periodically checks for updated models in the model path(s) provided in `handler.models`. It is automatically enabled for all handler types, including the Python handler type (as long as model paths are specified via `multi_model_reloading` in the `handler` configuration). - -The following is a list of events that will trigger the API to update its model(s): - -* A new model is added to the model directory. -* A model is removed from the model directory. -* A model changes its directory structure. -* A file in the model directory is updated in-place. - -## Python Handler - -To use live model reloading with the Python handler, the model path(s) must be specified in the API's `handler` configuration, via the `multi_model_reloading` field. When models are specified in this manner, your `Handler` class must implement the `load_model()` function, and models can be retrieved by using the `get_model()` method of the `model_client` that's passed into your handler's constructor. - -### Example - -```python -class Handler: - def __init__(self, config, model_client): - self.client = model_client - - def load_model(self, model_path): - # model_path is a path to your model's directory on disk - return load_from_disk(model_path) - - def handle_post(self, payload): - model = self.client.get_model() - return model.predict(payload) -``` - -When multiple models are being served in an API, `model_client.get_model()` can accept a model name: - -```python -class Handler: - # ... - - def handle_post(self, payload, query_params): - model = self.client.get_model(query_params["model"]) - return model.predict(payload) -``` - -`model_client.get_model()` can also accept a model version if a version other than the highest is desired: - -```python -class Handler: - # ... - - def handle_post(self, payload, query_params): - model = self.client.get_model(query_params["model"], query_params["version"]) - return model.predict(payload) -``` - -### Interface - -```python -# initialization code and variables can be declared here in global scope - -class Handler: - def __init__(self, config, model_client): - """(Required) Called once before the API becomes available. Performs - setup such as downloading/initializing the model or downloading a - vocabulary. - - Args: - config (required): Dictionary passed from API configuration (if - specified). This may contain information on where to download - the model and/or metadata. - model_client (required): Python client which is used to retrieve - models for prediction. This should be saved for use in the handler method. - Required when `handler.multi_model_reloading` is specified in - the api configuration. 
- """ - self.client = model_client - - def load_model(self, model_path): - """Called by Cortex to load a model when necessary. - - This method is required when `handler.multi_model_reloading` - field is specified in the api configuration. - - Warning: this method must not make any modification to the model's - contents on disk. - - Args: - model_path: The path to the model on disk. - - Returns: - The loaded model from disk. The returned object is what - self.client.get_model() will return. - """ - pass - - # define any handler methods for HTTP/gRPC workloads here -``` - - -When explicit model paths are specified in the Python handler's API configuration, Cortex provides a `model_client` to your Handler's constructor. `model_client` is an instance of [ModelClient](https://github.com/cortexlabs/cortex/tree/master/python/serve/cortex_internal/lib/client/python.py) that is used to load model(s) (it calls the `load_model()` method of your handler, which must be defined when using explicit model paths). It should be saved as an instance variable in your handler class, and your handler method should call `model_client.get_model()` to load your model for inference. Preprocessing of the JSON/gRPC payload and postprocessing of predictions can be implemented in your handler method as well. - -When multiple models are defined using the Handler's `multi_model_reloading` field, the `model_client.get_model()` method expects an argument `model_name` which must hold the name of the model that you want to load (for example: `self.client.get_model("text-generator")`). There is also an optional second argument to specify the model version. - -### `load_model` method - -The `load_model()` method that you implement in your `Handler` can return anything that you need to make a prediction. There is one caveat: whatever the return value is, it must be unloadable from memory via the `del` keyword. The following frameworks have been tested to work: - -* PyTorch (CPU & GPU) -* ONNX (CPU & GPU) -* Sklearn/MLFlow (CPU) -* Numpy (CPU) -* Pandas (CPU) -* Caffe (not tested, but should work on CPU & GPU) - -Python data structures containing these types are also supported (e.g. lists and dicts). - -The `load_model()` method takes a single argument, which is a path (on disk) to the model to be loaded. Your `load_model()` method is called behind the scenes by Cortex when you call the `model_client`'s `get_model()` method. Cortex is responsible for downloading your model from S3 onto the local disk before calling `load_model()` with the local path. Whatever `load_model()` returns will be the exact return value of `model_client.get_model()`. Here is the schema for `model_client.get_model()`: - -```python -def get_model(model_name, model_version): - """ - Retrieve a model for inference. - - Args: - model_name (optional): Name of the model to retrieve (when multiple models are deployed in an API). - When handler.models.paths is specified, model_name should be the name of one of the models listed in the API config. - When handler.models.dir is specified, model_name should be the name of a top-level directory in the models dir. - model_version (string, optional): Version of the model to retrieve. Can be omitted or set to "latest" to select the highest version. - - Returns: - The value that's returned by your handler's load_model() method. - """ -``` - -### Specifying models - -Whenever a model path is specified in an API configuration file, it should be a path to an S3 prefix which contains your exported model. 
Directories may include a single model, or multiple folders each with a single model (note that a "single model" need not be a single file; there can be multiple files for a single model). When multiple folders are used, the folder names must be integer values, and will be interpreted as the model version. Model versions can be any integer, but are typically integer timestamps. It is always assumed that the highest version number is the latest version of your model. - -#### API spec - -##### Single model - -The most common pattern is to serve a single model per API. The path to the model is specified in the `path` field in the `handler.multi_model_reloading` configuration. For example: - -```yaml -# cortex.yaml - -- name: iris-classifier - kind: RealtimeAPI - handler: - # ... - type: python - multi_model_reloading: - path: s3://my-bucket/models/text-generator/ -``` - -##### Multiple models - -It is possible to serve multiple models from a single API. The paths to the models are specified in the api configuration, either via the `multi_model_reloading.paths` or `multi_model_reloading.dir` field in the `handler` configuration. For example: - -```yaml -# cortex.yaml - -- name: iris-classifier - kind: RealtimeAPI - handler: - # ... - type: python - multi_model_reloading: - paths: - - name: iris-classifier - path: s3://my-bucket/models/text-generator/ - # ... -``` - -or: - -```yaml -# cortex.yaml - -- name: iris-classifier - kind: RealtimeAPI - handler: - # ... - type: python - multi_model_reloading: - dir: s3://my-bucket/models/ -``` - -It is also not necessary to specify the `multi_model_reloading` section at all, since you can download and load the model in your handler's `__init__()` function. That said, it is necessary to use the `multi_model_reloading` field to take advantage of live model reloading or multi-model caching. - -When using the `multi_model_reloading.paths` field, each path must be a valid model directory (see above for valid model directory structures). - -When using the `multi_model_reloading.dir` field, the directory provided may contain multiple subdirectories, each of which is a valid model directory. For example: - -```text - s3://my-bucket/models/ - ├── text-generator - | └── * (model files) - └── sentiment-analyzer - ├── 24753823/ - | └── * (model files) - └── 26234288/ - └── * (model files) -``` - -In this case, there are two models in the directory, one of which is named "text-generator", and the other is named "sentiment-analyzer". - -#### Structure - -Any model structure is accepted. Here is an example: - -```text - s3://my-bucket/models/text-generator/ - ├── model.pkl - └── data.txt -``` - -or for a versioned model: - -```text - s3://my-bucket/models/text-generator/ - ├── 1523423423/ (version number, usually a timestamp) - | ├── model.pkl - | └── data.txt - └── 2434389194/ (version number, usually a timestamp) - ├── model.pkl - └── data.txt -``` - -## TensorFlow Handler - -In addition to the [standard Python Handler](handler.md), Cortex also supports another handler called the TensorFlow handler, which can be used to run TensorFlow models exported as `SavedModel` models. When using the TensorFlow handler, the model path(s) must be specified in the API's `handler` configuration, via the `models` field. 
- -### Example - -```python -class Handler: - def __init__(self, tensorflow_client, config): - self.client = tensorflow_client - - def handle_post(self, payload): - return self.client.predict(payload) -``` - -When multiple models are being served in an API, `tensorflow_client.predict()` can accept a model name: - -```python -class Handler: - # ... - - def handle_post(self, payload, query_params): - return self.client.predict(payload, query_params["model"]) -``` - -`tensorflow_client.predict()` can also accept a model version if a version other than the highest is desired: - -```python -class Handler: - # ... - - def handle_post(self, payload, query_params): - return self.client.predict(payload, query_params["model"], query_params["version"]) -``` - -Note: when using Inferentia models with the TensorFlow handler type, live model reloading is only supported if `handler.processes_per_replica` is set to 1 (the default value). - -### Interface - -```python -class Handler: - def __init__(self, tensorflow_client, config): - """(Required) Called once before the API becomes available. Performs - setup such as downloading/initializing a vocabulary. - - Args: - tensorflow_client (required): TensorFlow client which is used to - make predictions. This should be saved for use in the handler method. - config (required): Dictionary passed from API configuration (if - specified). - """ - self.client = tensorflow_client - # Additional initialization may be done here - - # define any handler methods for HTTP/gRPC workloads here -``` - - -Cortex provides a `tensorflow_client` to your Handler's constructor. `tensorflow_client` is an instance of [TensorFlowClient](https://github.com/cortexlabs/cortex/tree/master/python/serve/cortex_internal/lib/client/tensorflow.py) that manages a connection to a TensorFlow Serving container to make predictions using your model. It should be saved as an instance variable in your Handler class, and your handler method should call `tensorflow_client.predict()` to make an inference with your exported TensorFlow model. Preprocessing of the JSON payload and postprocessing of predictions can be implemented in your handler method as well. - -When multiple models are defined using the Handler's `models` field, the `tensorflow_client.predict()` method expects a second argument `model_name` which must hold the name of the model that you want to use for inference (for example: `self.client.predict(payload, "text-generator")`). There is also an optional third argument to specify the model version. - -If you need to share files between your handler implementation and the TensorFlow Serving container, you can create a new directory within `/mnt` (e.g. `/mnt/user`) and write files to it. The entire `/mnt` directory is shared between containers, but do not write to any of the directories in `/mnt` that already exist (they are used internally by Cortex). - -### `predict` method - -Inference is performed by using the `predict` method of the `tensorflow_client` that's passed to the handler's constructor: - -```python -def predict(model_input, model_name, model_version) -> dict: - """ - Run prediction. - - Args: - model_input: Input to the model. - model_name (optional): Name of the model to retrieve (when multiple models are deployed in an API). - When handler.models.paths is specified, model_name should be the name of one of the models listed in the API config. - When handler.models.dir is specified, model_name should be the name of a top-level directory in the models dir. 
- model_version (string, optional): Version of the model to retrieve. Can be omitted or set to "latest" to select the highest version. - - Returns: - dict: TensorFlow Serving response converted to a dictionary. - """ -``` - -### Specifying models - -Whenever a model path is specified in an API configuration file, it should be a path to an S3 prefix which contains your exported model. Directories may include a single model, or multiple folders each with a single model (note that a "single model" need not be a single file; there can be multiple files for a single model). When multiple folders are used, the folder names must be integer values, and will be interpreted as the model version. Model versions can be any integer, but are typically integer timestamps. It is always assumed that the highest version number is the latest version of your model. - -#### API spec - -##### Single model - -The most common pattern is to serve a single model per API. The path to the model is specified in the `path` field in the `handler.models` configuration. For example: - -```yaml -# cortex.yaml - -- name: iris-classifier - kind: RealtimeAPI - handler: - # ... - type: tensorflow - models: - path: s3://my-bucket/models/text-generator/ -``` - -##### Multiple models - -It is possible to serve multiple models from a single API. The paths to the models are specified in the api configuration, either via the `models.paths` or `models.dir` field in the `handler` configuration. For example: - -```yaml -# cortex.yaml - -- name: iris-classifier - kind: RealtimeAPI - handler: - # ... - type: tensorflow - models: - paths: - - name: iris-classifier - path: s3://my-bucket/models/text-generator/ - # ... -``` - -or: - -```yaml -# cortex.yaml - -- name: iris-classifier - kind: RealtimeAPI - handler: - # ... - type: tensorflow - models: - dir: s3://my-bucket/models/ -``` - -When using the `models.paths` field, each path must be a valid model directory (see above for valid model directory structures). - -When using the `models.dir` field, the directory provided may contain multiple subdirectories, each of which is a valid model directory. For example: - -```text - s3://my-bucket/models/ - ├── text-generator - | └── * (model files) - └── sentiment-analyzer - ├── 24753823/ - | └── * (model files) - └── 26234288/ - └── * (model files) -``` - -In this case, there are two models in the directory, one of which is named "text-generator", and the other is named "sentiment-analyzer". - -#### Structure - -##### On CPU/GPU - -The model path must be a SavedModel export: - -```text - s3://my-bucket/models/text-generator/ - ├── saved_model.pb - └── variables/ - ├── variables.index - ├── variables.data-00000-of-00003 - ├── variables.data-00001-of-00003 - └── variables.data-00002-of-... -``` - -or for a versioned model: - -```text - s3://my-bucket/models/text-generator/ - ├── 1523423423/ (version number, usually a timestamp) - | ├── saved_model.pb - | └── variables/ - | ├── variables.index - | ├── variables.data-00000-of-00003 - | ├── variables.data-00001-of-00003 - | └── variables.data-00002-of-... - └── 2434389194/ (version number, usually a timestamp) - ├── saved_model.pb - └── variables/ - ├── variables.index - ├── variables.data-00000-of-00003 - ├── variables.data-00001-of-00003 - └── variables.data-00002-of-... 
-``` - -##### On Inferentia - -When Inferentia models are used, the directory structure is slightly different: - -```text - s3://my-bucket/models/text-generator/ - └── saved_model.pb -``` - -or for a versioned model: - -```text - s3://my-bucket/models/text-generator/ - ├── 1523423423/ (version number, usually a timestamp) - | └── saved_model.pb - └── 2434389194/ (version number, usually a timestamp) - └── saved_model.pb -``` diff --git a/docs/workloads/realtime/multi-model/caching.md b/docs/workloads/realtime/multi-model/caching.md deleted file mode 100644 index 3177f68ca0..0000000000 --- a/docs/workloads/realtime/multi-model/caching.md +++ /dev/null @@ -1,14 +0,0 @@ -# Multi-model caching - -Multi-model caching allows each replica to serve more models than would fit into its memory by keeping a specified number of models in memory (and disk) at a time. When the in-memory model limit is reached, the least recently accessed model is evicted from the cache. This can be useful when you have many models, and some models are frequently accessed while a larger portion of them are rarely used, or when running on smaller instances to control costs. - -The model cache is a two-layer cache, configured by the following parameters in the `handler.models` configuration: - -* `cache_size` sets the number of models to keep in memory -* `disk_cache_size` sets the number of models to keep on disk (must be greater than or equal to `cache_size`) - -Both of these fields must be specified, in addition to either the `dir` or `paths` field (which specifies the model paths, see [models](../../realtime/models.md) for documentation). Multi-model caching is only supported if `handler.processes_per_replica` is set to 1 (the default value). - -## Out of memory errors - -Cortex runs a background process every 10 seconds that counts the number of models in memory and on disk, and evicts the least recently used models if the count exceeds `cache_size` / `disk_cache_size`. If many new models are requested between executions of the process, there may be more models in memory and/or on disk than the configured `cache_size` or `disk_cache_size` limits which could lead to out of memory errors. diff --git a/docs/workloads/realtime/multi-model/configuration.md b/docs/workloads/realtime/multi-model/configuration.md deleted file mode 100644 index c74d950f3d..0000000000 --- a/docs/workloads/realtime/multi-model/configuration.md +++ /dev/null @@ -1,118 +0,0 @@ -# Configuration - -## Python Handler - -### Specifying models in API configuration - -#### `cortex.yaml` - -The directory `s3://cortex-examples/sklearn/mpg-estimator/linreg/` contains 4 different versions of the model. - -```yaml -- name: mpg-estimator - kind: RealtimeAPI - handler: - type: python - path: handler.py - models: - path: s3://cortex-examples/sklearn/mpg-estimator/linreg/ -``` - -#### `handler.py` - -```python -import mlflow.sklearn - - -class Handler: - def __init__(self, config, python_client): - self.client = python_client - - def load_model(self, model_path): - return mlflow.sklearn.load_model(model_path) - - def handle_post(self, payload, query_params): - model_version = query_params.get("version") - - # model_input = ... 
- - model = self.client.get_model(model_version=model_version) - result = model.predict(model_input) - - return {"prediction": result, "model": {"version": model_version}} -``` - -### Without specifying models in API configuration - -#### `cortex.yaml` - -```yaml -- name: text-analyzer - kind: RealtimeAPI - handler: - type: python - path: handler.py - ... -``` - -#### `handler.py` - -```python -class Handler: - def __init__(self, config): - self.analyzer = initialize_model("sentiment-analysis") - self.summarizer = initialize_model("summarization") - - def handle_post(self, query_params, payload): - model_name = query_params.get("model") - model_input = payload["text"] - - # ... - - if model_name == "analyzer": - results = self.analyzer(model_input) - predicted_label = postprocess(results) - return {"label": predicted_label} - elif model_name == "summarizer": - results = self.summarizer(model_input) - predicted_label = postprocess(results) - return {"label": predicted_label} - else: - return JSONResponse({"error": f"unknown model: {model_name}"}, status_code=400) -``` - -## TensorFlow Handler - -### `cortex.yaml` - -```yaml -- name: multi-model-classifier - kind: RealtimeAPI - handler: - type: tensorflow - path: handler.py - models: - paths: - - name: inception - path: s3://cortex-examples/tensorflow/image-classifier/inception/ - - name: iris - path: s3://cortex-examples/tensorflow/iris-classifier/nn/ - - name: resnet50 - path: s3://cortex-examples/tensorflow/resnet50/ - ... -``` - -### `handler.py` - -```python -class Handler: - def __init__(self, tensorflow_client, config): - self.client = tensorflow_client - - def handle_post(self, payload, query_params): - model_name = query_params["model"] - model_input = preprocess(payload["url"]) - results = self.client.predict(model_input, model_name) - predicted_label = postprocess(results) - return {"label": predicted_label} -``` diff --git a/docs/workloads/realtime/multi-model/example.md b/docs/workloads/realtime/multi-model/example.md deleted file mode 100644 index d09fc790c9..0000000000 --- a/docs/workloads/realtime/multi-model/example.md +++ /dev/null @@ -1,43 +0,0 @@ -# Multi-model API - -Deploy several models in a single API to improve resource utilization efficiency. 
- -## Define a multi-model API - -```python -# multi_model.py - -import cortex - -class Handler: - def __init__(self, config): - from transformers import pipeline - self.analyzer = pipeline(task="sentiment-analysis") - - import wget - import fasttext - wget.download( - "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin", "/tmp/model" - ) - self.language_identifier = fasttext.load_model("/tmp/model") - - def handle_post(self, query_params, payload): - model = query_params.get("model") - if model == "sentiment": - return self.analyzer(payload["text"])[0] - elif model == "language": - return self.language_identifier.predict(payload["text"])[0][0][-2:] - -requirements = ["tensorflow", "transformers", "wget", "fasttext"] - -api_spec = {"name": "multi-model", "kind": "RealtimeAPI"} - -cx = cortex.client("cortex") -cx.deploy_realtime_api(api_spec, handler=Handler, requirements=requirements) -``` - -## Deploy - -```bash -python multi_model.py -``` diff --git a/docs/workloads/realtime/parallelism.md b/docs/workloads/realtime/parallelism.md deleted file mode 100644 index f271805f20..0000000000 --- a/docs/workloads/realtime/parallelism.md +++ /dev/null @@ -1,9 +0,0 @@ -# Replica parallelism - -Replica parallelism can be configured with the following fields in the `handler` configuration: - -* `processes_per_replica` (default: 1): Each replica runs a web server with `processes_per_replica` processes. For APIs running with multiple CPUs per replica, using 1-3 processes per unit of CPU generally leads to optimal throughput. For example, if `cpu` is 2, a value between 2 and 6 `processes_per_replica` is reasonable. The optimal number will vary based on the workload's characteristics and the CPU compute request for the API. - -* `threads_per_process` (default: 1): Each process uses a thread pool of size `threads_per_process` to process requests. For applications that are not CPU intensive such as high I/O (e.g. downloading files), GPU-based inference, or Inferentia-based inference, increasing the number of threads per process can increase throughput. For CPU-bound applications such as running your model inference on a CPU, using 1 thread per process is recommended to avoid unnecessary context switching. Some applications are not thread-safe, and therefore must be run with 1 thread per process. - -`processes_per_replica` * `threads_per_process` represents the total number of requests that your replica can work on concurrently. For example, if `processes_per_replica` is 2 and `threads_per_process` is 2, and the replica was hit with 5 concurrent requests, 4 would immediately begin to be processed, and 1 would be waiting for a thread to become available. If the replica were hit with 3 concurrent requests, all three would begin processing immediately. diff --git a/docs/workloads/realtime/realtime-apis.md b/docs/workloads/realtime/realtime-apis.md new file mode 100644 index 0000000000..ebfc7fc8d0 --- /dev/null +++ b/docs/workloads/realtime/realtime-apis.md @@ -0,0 +1,3 @@ +# Realtime APIs + +Realtime APIs respond to requests in real-time and autoscale based on in-flight request volumes. 
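+
+For example, a minimal Realtime API definition might look like the following (a sketch; the API name, container image, and port are illustrative placeholders, and the full set of fields is listed in the configuration reference):
+
+```yaml
+- name: hello-world  # illustrative API name
+  kind: RealtimeAPI
+  pod:
+    port: 8080  # the port your web server listens on inside the container
+    containers:
+      - name: api
+        image: quay.io/my-org/hello-world:latest  # hypothetical image; replace with your own web server image
+        compute:
+          cpu: 200m
+          mem: 256Mi
+```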
diff --git a/docs/workloads/realtime/server-side-batching.md b/docs/workloads/realtime/server-side-batching.md deleted file mode 100644 index f46646d0d7..0000000000 --- a/docs/workloads/realtime/server-side-batching.md +++ /dev/null @@ -1,83 +0,0 @@ -# Server-side batching - -Server-side batching is the process of aggregating multiple real-time requests into a single batch execution, which increases throughput at the expense of latency. Inference is triggered when either a maximum number of requests have been received, or when a certain amount of time has passed since receiving the first request, whichever comes first. Once a threshold is reached, the handling function is run on the received requests and responses are returned individually back to the clients. This process is transparent to the clients. - -The Python and TensorFlow handlers allow for the use of the following 2 fields in the `server_side_batching` configuration: - -* `max_batch_size`: The maximum number of requests to aggregate before running inference. This is an instrument for controlling throughput. The maximum size can be achieved if `batch_interval` is long enough to collect `max_batch_size` requests. - -* `batch_interval`: The maximum amount of time to spend waiting for additional requests before running inference on the batch of requests. If fewer than `max_batch_size` requests are received after waiting the full `batch_interval`, then inference will run on the requests that have been received. This is an instrument for controlling latency. - -{% hint style="note" %} -Server-side batching is not supported for APIs that use the gRPC protocol. -{% endhint %} - -{% hint style="note" %} -Server-side batching is only supported on the `handle_post` method. -{% endhint %} - -## Python Handler - -When using server-side batching with the Python handler, the arguments that are passed into your handler's function will be lists: `payload` will be a list of payloads, `query_params` will be a list of query parameter dictionaries, and `headers` will be a list of header dictionaries. The lists will all have the same length, where a particular index across all arguments corresponds to a single request (i.e. `payload[2]`, `query_params[2]`, and `headers[2]` correspond to a single request). Your handle function must return a list of responses in the same order that they were received (i.e. the 3rd element in returned list must be the response associated with `payload[2]`). - -## TensorFlow Handler - -In order to use server-side batching with the TensorFlow handler, the only requirement is that model's graph must be built such that batches can be accepted as input/output. No modifications to your handler implementation are required. - -The following is an example of how the input `x` and the output `y` of the graph could be shaped to be compatible with server-side batching: - -```python -batch_size = None -sample_shape = [340, 240, 3] # i.e. RGB image -output_shape = [1000] # i.e. image labels - -with graph.as_default(): - # ... - x = tf.placeholder(tf.float32, shape=[batch_size] + sample_shape, name="input") - y = tf.placeholder(tf.float32, shape=[batch_size] + output_shape, name="output") - # ... -``` - -### Troubleshooting - -Errors will be encountered if the model hasn't been built for batching. - -The following error is an example of what happens when the input shape doesn't accommodate batching - e.g. 
when its shape is `[height, width, 3]` instead of `[batch_size, height, width, 3]`: - -```text -Batching session Run() input tensors must have at least one dimension. -``` - -Here is another example of setting the output shape inappropriately for batching - e.g. when its shape is `[labels]` instead of `[batch_size, labels]`: - -```text -Batched output tensor has 0 dimensions. -``` - -The solution to these errors is to incorporate into the model's graph another dimension (a placeholder for batch size) placed on the first position for both its input and output. - -The following is an example of how the input `x` and the output `y` of the graph could be shaped to be compatible with server-side batching: - -```python -batch_size = None -sample_shape = [340, 240, 3] # i.e. RGB image -output_shape = [1000] # i.e. image labels - -with graph.as_default(): - # ... - x = tf.placeholder(tf.float32, shape=[batch_size] + sample_shape, name="input") - y = tf.placeholder(tf.float32, shape=[batch_size] + output_shape, name="output") - # ... -``` - -## Optimization - -When optimizing for both throughput and latency, you will likely want keep the `max_batch_size` to a relatively small value. Even though a higher `max_batch_size` with a low `batch_interval` (when there are many requests coming in) can offer a significantly higher throughput, the overall latency could be quite large. The reason is that for a request to get back a response, it has to wait until the entire batch is processed, which means that the added latency due to the `batch_interval` can pale in comparison. For instance, let's assume that a single request takes 50ms, and that when the batch size is set to 128, the processing time for a batch is 1280ms (i.e. 10ms per sample). So while the throughput is now 5 times higher, it takes 1280ms + `batch_interval` to get back a response (instead of 50ms). This is the trade-off with server-side batching. - -When optimizing for maximum throughput, a good rule of thumb is to follow these steps: - -1. Determine the maximum throughput of one API replica when `server_side_batching` is not enabled (same as if `max_batch_size` were set to 1). This can be done with a load test (make sure to set `max_replicas` to 1 to disable autoscaling). -1. Determine the highest `batch_interval` with which you are still comfortable for your application. Keep in mind that the batch interval is not the only component of the overall latency - the inference on the batch and the pre/post processing also have to occur. -1. Multiply the maximum throughput from step 1 by the `batch_interval` from step 2. The result is a number which you can assign to `max_batch_size`. -1. Run the load test again. If the inference fails with that batch size (e.g. due to running out of GPU or RAM memory), then reduce `max_batch_size` to a level that works (reduce `batch_interval` by the same factor). -1. Use the load test to determine the peak throughput of the API replica. Multiply the observed throughput by the `batch_interval` to calculate the average batch size. If the average batch size coincides with `max_batch_size`, then it might mean that the throughput could still be further increased by increasing `max_batch_size`. If it's lower, then it means that `batch_interval` is triggering the inference before `max_batch_size` requests have been aggregated. If modifying both `max_batch_size` and `batch_interval` doesn't improve the throughput, then the service may be bottlenecked by something else (e.g. 
CPU, network IO, `processes_per_replica`, `threads_per_process`, etc). diff --git a/docs/workloads/realtime/statuses.md b/docs/workloads/realtime/statuses.md index f61ea52d7b..2ee32aca40 100644 --- a/docs/workloads/realtime/statuses.md +++ b/docs/workloads/realtime/statuses.md @@ -5,6 +5,6 @@ | live | API is deployed and ready to serve requests (at least one replica is running) | | updating | API is updating | | error | API was not created due to an error; run `cortex logs ` to view the logs | -| error (image pull) | API was not created because one of the specified Docker images was inaccessible at runtime; check that your API's docker images exist and are accessible via your cluster operator's AWS credentials | +| error (image pull) | API was not created because one of the specified Docker images was inaccessible at runtime; check that your API's docker images exist and are accessible via your cluster's AWS credentials | | error (out of memory) | API was terminated due to excessive memory usage; try allocating more memory to the API and re-deploying | | compute unavailable | API could not start due to insufficient memory, CPU, GPU, or Inf in the cluster; some replicas may be ready | diff --git a/docs/workloads/realtime/traffic-splitter.md b/docs/workloads/realtime/traffic-splitter.md new file mode 100644 index 0000000000..ae9094127b --- /dev/null +++ b/docs/workloads/realtime/traffic-splitter.md @@ -0,0 +1,65 @@ +# Traffic Splitter + +Traffic Splitters can be used to expose multiple RealtimeAPIs as a single endpoint for A/B tests, multi-armed bandits, or canary deployments. + +## Configuration + +```yaml +- name: # Traffic Splitter name (required) + kind: TrafficSplitter + networking: + endpoint: # the endpoint for the Traffic Splitter (default: ) + apis: # list of Realtime APIs to target + - name: # name of a Realtime API that is already running or is included in the same configuration file (required) + weight: # percentage of traffic to route to the Realtime API (all non-shadow weights must sum to 100) (required) + shadow: # duplicate incoming traffic and send fire-and-forget to this api (only one shadow per traffic splitter) (default: false) +``` + +## Example + +This example showcases Cortex's Python client, but these steps can also be performed by using the Cortex CLI with YAML files. 
+ +### Deploy a traffic splitter + +```python +traffic_splitter_spec = { + "name": "sentiment-analyzer", + "kind": "TrafficSplitter", + "apis": [ + {"name": "sentiment-analyzer-a", "weight": 50}, + {"name": "sentiment-analyzer-b", "weight": 50}, + ], +} + +cx.deploy(traffic_splitter_spec) +``` + +### Update the weights + +```python +new_traffic_splitter_spec = { + "name": "sentiment-analyzer", + "kind": "TrafficSplitter", + "apis": [ + {"name": "sentiment-analyzer-a", "weight": 1}, + {"name": "sentiment-analyzer-b", "weight": 99}, + ], +} + +cx.deploy(new_traffic_splitter_spec) +``` + +### Update the APIs + +```python +new_traffic_splitter_spec = { + "name": "sentiment-analyzer", + "kind": "TrafficSplitter", + "apis": [ + {"name": "sentiment-analyzer-b", "weight": 50}, + {"name": "sentiment-analyzer-c", "weight": 50}, + ], +} + +cx.deploy(new_traffic_splitter_spec) +``` diff --git a/docs/workloads/realtime/traffic-splitter/configuration.md b/docs/workloads/realtime/traffic-splitter/configuration.md deleted file mode 100644 index a498a569a4..0000000000 --- a/docs/workloads/realtime/traffic-splitter/configuration.md +++ /dev/null @@ -1,12 +0,0 @@ -# Configuration - -```yaml -- name: # Traffic Splitter name (required) - kind: TrafficSplitter - networking: - endpoint: # the endpoint for the Traffic Splitter (default: ) - apis: # list of Realtime APIs to target - - name: # name of a Realtime API that is already running or is included in the same configuration file (required) - weight: # percentage of traffic to route to the Realtime API (all non-shadow weights must sum to 100) (required) - shadow: # duplicate incoming traffic and send fire-and-forget to this api (only one shadow per traffic splitter) (default: false) -``` diff --git a/docs/workloads/realtime/traffic-splitter/example.md b/docs/workloads/realtime/traffic-splitter/example.md deleted file mode 100644 index bb4fa6d28b..0000000000 --- a/docs/workloads/realtime/traffic-splitter/example.md +++ /dev/null @@ -1,66 +0,0 @@ -# TrafficSplitter - -Expose multiple RealtimeAPIs as a single endpoint for A/B tests, multi-armed bandits, or canary deployments. 
- -## Deploy APIs - -```python -import cortex - -class Handler: - def __init__(self, config): - from transformers import pipeline - self.model = pipeline(task="text-generation") - - def handle_post(self, payload): - return self.model(payload["text"])[0] - -requirements = ["tensorflow", "transformers"] - -api_spec_cpu = { - "name": "text-generator-cpu", - "kind": "RealtimeAPI", - "compute": { - "cpu": 1, - }, -} - -api_spec_gpu = { - "name": "text-generator-gpu", - "kind": "RealtimeAPI", - "compute": { - "gpu": 1, - }, -} - -cx = cortex.client("cortex") -cx.deploy_realtime_api(api_spec_cpu, handler=Handler, requirements=requirements) -cx.deploy_realtime_api(api_spec_gpu, handler=Handler, requirements=requirements) -``` - -## Deploy a traffic splitter - -```python -traffic_splitter_spec = { - "name": "text-generator", - "kind": "TrafficSplitter", - "apis": [ - {"name": "text-generator-cpu", "weight": 50}, - {"name": "text-generator-gpu", "weight": 50}, - ], -} - -cx.deploy(traffic_splitter_spec) -``` - -## Update the weights of the traffic splitter - -```python -traffic_splitter_spec = cx.get_api("text-generator")["spec"]["submitted_api_spec"] - -# send 99% of the traffic to text-generator-gpu -traffic_splitter_spec["apis"][0]["weight"] = 1 -traffic_splitter_spec["apis"][1]["weight"] = 99 - -cx.deploy(traffic_splitter_spec) -``` diff --git a/docs/workloads/realtime/troubleshooting.md b/docs/workloads/realtime/troubleshooting.md index d2d441510f..7ab9bdca35 100644 --- a/docs/workloads/realtime/troubleshooting.md +++ b/docs/workloads/realtime/troubleshooting.md @@ -1,13 +1,13 @@ # Troubleshooting -## 404 or 503 error responses from API requests +## 503 error responses from API requests -When making requests to your API, it's possible to get a `{"message":"Not Found"}` error message (with HTTP status code `404`), or a `no healthy upstream` error message (with HTTP status code `503`). This means that there are currently no live replicas running for your API. This could happen for a few reasons: +When making requests to your API, it's possible to get a `no healthy upstream` error message (with HTTP status code `503`). This means that there are currently no live replicas running for your API. This could happen for a few reasons: -1. It's possible that your API is simply not ready yet. You can check the status of your API with `cortex get API_NAME`, and stream the logs with `cortex logs API_NAME`. -1. Your API may have errored during initialization or while responding to a previous request. `cortex get API_NAME` will show the status of your API, and you can view the logs with `cortex logs API_NAME`. +1. It's possible that your API is simply not ready yet. You can check the status of your API with `cortex get API_NAME`, and stream the logs for a single replica (at random) with `cortex logs API_NAME`. +1. Your API may have errored during initialization or while responding to a previous request. `cortex get API_NAME` will show the status of your API, and you can view the logs for all replicas via Cloudwatch Logs Insights. -It is also possible to receive a `{"message":"Service Unavailable"}` error message (with HTTP status code `503`) if you are using API Gateway in front of your API endpoints and if your request exceeds API Gateway's 29 second timeout. If the request is exceeding the API Gateway timeout, your client should receive the `{"message":"Service Unavailable"}` response ~29 seconds after making the request. 
To confirm that this is the issue, you can modify your handle function to immediately return a response (e.g. `return "ok"`), re-deploy your API, wait for the update to complete, and try making a request. If your client successfully receives the "ok" response, it is likely that the API Gateway timeout is occurring. You can either modify your handler implementation to take less time, run on faster hardware (e.g. GPUs), or don't use API Gateway (there is no timeout when using the API's endpoint). +If you are using API Gateway in front of your API endpoints, it is also possible to receive a `{"message":"Service Unavailable"}` error message (with HTTP status code `503`) after 29 seconds if your request exceeds API Gateway's 29 second timeout. If this is the case, you can either modify your code to take less time, run on faster hardware (e.g. GPUs), or don't use API Gateway (there is no timeout when using the API's endpoint directly). ## API is stuck updating @@ -23,7 +23,7 @@ When you created your Cortex cluster, you configured `max_instances` for each no You can check the current value of `max_instances` for the selected node group by running `cortex cluster info --config cluster.yaml` (or `cortex cluster info --name --region ` if you have the name and region of the cluster). -Once you have the name and region of the cluster, you can update `max_instances` by specifying the desired number of `max_instances` for your node group with `cortex cluster scale --name --region --node-group --min-instances --max-instances `. +Once you have the name and region of the cluster, you can update `max_instances` by specifying the desired number of `max_instances` for your node group with `cortex cluster scale --name --region --node-group --max-instances `. ## Check your AWS auto scaling group activity history @@ -58,13 +58,12 @@ Here is an example: You set `max_instances` to 1, or your AWS account limits you If you're running in a development environment, this rolling update behavior can be undesirable. -You can disable rolling updates for your API in your API configuration (e.g. in `cortex.yaml`): set `max_surge` to 0 (in the `update_strategy` configuration). E.g.: +You can disable rolling updates for your API in your API configuration: set `max_surge` to 0 in the `update_strategy` section, E.g.: ```yaml -- name: text-generator - handler: - type: python - ... +- name: my-api + kind: RealtimeAPI # must be "RealtimeAPI" for realtime APIs (required) + # ... 
update_strategy: max_surge: 0 ``` diff --git a/docs/workloads/task/configuration.md b/docs/workloads/task/configuration.md index 84c31312c6..6eabb1d7f7 100644 --- a/docs/workloads/task/configuration.md +++ b/docs/workloads/task/configuration.md @@ -1,27 +1,35 @@ # Configuration - ```yaml -- name: # API name (required) - kind: TaskAPI - definition: - path: # path to a python file with a Task class definition, relative to the Cortex root (required) - config: # arbitrary dictionary passed to the callable method of the Task class (can be overridden by config passed in job submission) (optional) - dependencies: # (optional) - pip: # relative path to requirements.txt (default: requirements.txt) - conda: # relative path to conda-packages.txt (default: conda-packages.txt) - shell: # relative path to a shell script for system package installation (default: dependencies.sh) - python_path: # path to the root of your Python folder that will be appended to PYTHONPATH (default: folder containing cortex.yaml) - shm_size: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 64Mi or 1Gi (default: Null) - image: # docker image to use for the Task (default: quay.io/cortexlabs/python-handler-cpu:master, quay.io/cortexlabs/python-handler-gpu:master-cuda10.2-cudnn8, or quay.io/cortexlabs/python-handler-inf:master based on compute) - env: # dictionary of environment variables - log_level: # log level that can be "debug", "info", "warning" or "error" (default: "info") - networking: - endpoint: # the endpoint for the API (default: ) - compute: - cpu: # CPU request per worker. One unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m) - gpu: # GPU request per worker. One unit of GPU corresponds to one virtual GPU (default: 0) - inf: # Inferentia request per worker. One unit corresponds to one Inferentia ASIC with 4 NeuronCores and 8GB of cache memory. Each process will have one NeuronCore Group with (4 * inf / processes_per_replica) NeuronCores, so your model should be compiled to run on (4 * inf / processes_per_replica) NeuronCores. (default: 0) - mem: # memory request per worker. 
One unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of two counterparts: Ki, Mi, Gi, Ti) (default: Null) - node_groups: # to select specific node groups (optional) +- name: # name of the API (required) + kind: TaskAPI # must be "TaskAPI" for task APIs (required) + pod: # pod configuration (required) + containers: # configurations for the containers to run (at least one container must be provided) + - name: # name of the container (required) + image: # docker image to use for the container (required) + command: # entrypoint (required) + args: # arguments to the entrypoint (default: no args) + env: # dictionary of environment variables to set in the container (optional) + compute: # compute resource requests (default: see below) + cpu: # CPU request for the container; one unit of CPU corresponds to one virtual CPU; fractional requests are allowed, and can be specified as a floating point number or via the "m" suffix (default: 200m) + gpu: # GPU request for the container; one unit of GPU corresponds to one virtual GPU (default: 0) + inf: # Inferentia request for the container; one unit of inf corresponds to one virtual Inferentia chip (default: 0) + mem: # memory request for the container; one unit of memory is one byte and can be expressed as an integer or by using one of these suffixes: K, M, G, T (or their power-of two counterparts: Ki, Mi, Gi, Ti) (default: Null) + shm: # size of shared memory (/dev/shm) for sharing data between multiple processes, e.g. 64Mi or 1Gi (default: Null) + liveness_probe: # periodic probe of container liveness; container will be restarted if the probe fails (optional) + http_get: # specifies an http endpoint which must respond with status code 200 (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + path: # the path to access on the HTTP server (default: /) + tcp_socket: # specifies a port which must be ready to receive traffic (only one of http_get, tcp_socket, and exec may be specified) + port: # the port to access on the container (required) + exec: # specifies a command to run which must exit with code 0 (only one of http_get, tcp_socket, and exec may be specified) + command: [] # the command to execute inside the container, which is exec'd (not run inside a shell); the working directory is root ('/') in the container's filesystem (required) + initial_delay_seconds: # number of seconds after the container has started before the probe is initiated (default: 0) + timeout_seconds: # number of seconds until the probe times out (default: 1) + period_seconds: # how often (in seconds) to perform the probe (default: 10) + success_threshold: # minimum consecutive successes for the probe to be considered successful after having failed (default: 1) + failure_threshold: # minimum consecutive failures for the probe to be considered failed after having succeeded (default: 3) + node_groups: # a list of node groups on which this API can run (default: all node groups are eligible) + networking: # networking configuration (default: see below) + endpoint: # endpoint for the API (default: ) ``` diff --git a/docs/workloads/task/definitions.md deleted file mode 100644 index ed50b34918..0000000000 --- a/docs/workloads/task/definitions.md +++ /dev/null @@ -1,91 +0,0 @@ -# Implementation - -## Project files - -Cortex makes all files in the project directory (i.e.
the directory which contains `cortex.yaml`) available for use in your Task implementation. Python bytecode files (`*.pyc`, `*.pyo`, `*.pyd`), files or folders that start with `.`, and the api configuration file (e.g. `cortex.yaml`) are excluded. - -The following files can also be added at the root of the project's directory: - -* `.cortexignore` file, which follows the same syntax and behavior as a [.gitignore file](https://git-scm.com/docs/gitignore). This may be necessary if you are reaching the size limit for your project directory (32mb). -* `.env` file, which exports environment variables that can be used in the task. Each line of this file must follow the `VARIABLE=value` format. - -For example, if your directory looks like this: - -```text -./my-classifier/ -├── cortex.yaml -├── values.json -├── task.py -├── ... -└── requirements.txt -``` - -You can access `values.json` in your Task like this: - -```python -import json - -class Task: - def __call__(self, config): - with open('values.json', 'r') as values_file: - values = json.load(values_file) - self.values = values -``` - -## Task - -### Interface - -```python -# initialization code and variables can be declared here in global scope - -class Task: - def __call__(self, config): - """(Required) Task runnable. - - Args: - config (required): Dictionary passed from API configuration (if - specified) merged with configuration passed in with Job - Submission API. If there are conflicting keys, values in - configuration specified in Job submission takes precedence. - """ - pass -``` - -## Structured logging - -You can use Cortex's logger in your handler implementation to log in JSON. This will enrich your logs with Cortex's metadata, and you can add custom metadata to the logs by adding key value pairs to the `extra` key when using the logger. For example: - -```python -... -from cortex_internal.lib.log import logger as cortex_logger - -class Task: - def __call__(self, config): - ... - cortex_logger.info("completed validations", extra={"accuracy": accuracy}) -``` - -The dictionary passed in via the `extra` will be flattened by one level. e.g. - -```text -{"asctime": "2021-01-19 15:14:05,291", "levelname": "INFO", "message": "completed validations", "process": 235, "accuracy": 0.97} -``` - -To avoid overriding essential Cortex metadata, please refrain from specifying the following extra keys: `asctime`, `levelname`, `message`, `labels`, and `process`. Log lines greater than 5 MB in size will be ignored. - -## Cortex Python client - -A default [Cortex Python client](../../clients/python.md#cortex.client.client) environment has been configured for your API. This can be used for deploying/deleting/updating or submitting jobs to your running cluster based on the execution flow of your task. For example: - -```python -import cortex - -class Task: - def __call__(self, config): - ... - # get client pointing to the default environment - client = cortex.client() - # deploy API in the existing cluster as part of your pipeline workflow - client.deploy(...) 
-``` diff --git a/docs/workloads/task/jobs.md b/docs/workloads/task/jobs.md index fd8701ecc6..bb38732ab3 100644 --- a/docs/workloads/task/jobs.md +++ b/docs/workloads/task/jobs.md @@ -1,6 +1,6 @@ # TaskAPI jobs -## Get the TaskAPI endpoint +## Get the Task API's endpoint ```bash cortex get @@ -11,8 +11,8 @@ cortex get ```yaml POST : { - "timeout": , # duration in seconds since the submission of a job before it is terminated (optional) - "config": { # custom fields for this specific job (will override values in `config` specified in your api configuration) (optional) + "timeout": , # duration in seconds since the submission of a job before it is terminated (optional) + "config": { # arbitrary input for this specific job (written to /cortex/spec/job.json in the API containers) (optional) "string": } } diff --git a/docs/workloads/task/task-apis.md b/docs/workloads/task/task-apis.md new file mode 100644 index 0000000000..f3d2a4abf0 --- /dev/null +++ b/docs/workloads/task/task-apis.md @@ -0,0 +1,3 @@ +# Task APIs + +Task APIs run multi-worker jobs on demand. diff --git a/pkg/consts/consts.go b/pkg/consts/consts.go index 66bb92b2e3..1ba32c84c5 100644 --- a/pkg/consts/consts.go +++ b/pkg/consts/consts.go @@ -24,8 +24,8 @@ var ( CortexVersion = "master" // CORTEX_VERSION CortexVersionMinor = "master" // CORTEX_VERSION_MINOR - DefaultMaxQueueLength = int64(1024) - DefaultMaxConcurrency = int64(16) + DefaultMaxQueueLength = int64(100) + DefaultMaxConcurrency = int64(1) DefaultUserPodPortStr = "8080" DefaultUserPodPortInt32 = int32(8080) diff --git a/pkg/types/spec/validations.go b/pkg/types/spec/validations.go index b5b75be126..1bdac79a8d 100644 --- a/pkg/types/spec/validations.go +++ b/pkg/types/spec/validations.go @@ -335,7 +335,8 @@ func httpGetProbeValidation() *cr.StructFieldValidation { { StructField: "Path", StringValidation: &cr.StringValidation{ - Required: true, + Required: false, + Default: "/", Validator: urls.ValidateEndpointAllowEmptyPath, }, }, From 51832cb082c87806ad2a487efd6c54b5c742bbb2 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Tue, 25 May 2021 15:52:32 -0700 Subject: [PATCH 2/3] Remove max_concurrency from batch --- docs/workloads/batch/configuration.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/workloads/batch/configuration.md b/docs/workloads/batch/configuration.md index 4b9337a95a..31ef274089 100644 --- a/docs/workloads/batch/configuration.md +++ b/docs/workloads/batch/configuration.md @@ -5,7 +5,6 @@ kind: BatchAPI # must be "BatchAPI" for batch APIs (required) pod: # pod configuration (required) port: # port to which requests will be sent (default: 8080; exported as $CORTEX_PORT) - max_concurrency: # maximum number of requests that will be concurrently sent into the container (default: 1) containers: # configurations for the containers to run (at least one constainer must be provided) - name: # name of the container (required) image: # docker image to use for the container (required) From 4d2533ea0eac71672626ba45dce26ae9173d5975 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Tue, 25 May 2021 16:20:51 -0700 Subject: [PATCH 3/3] Misc --- docs/workloads/async/autoscaling.md | 6 +++--- docs/workloads/batch/jobs.md | 12 +++++++++--- docs/workloads/realtime/autoscaling.md | 4 ++-- docs/workloads/realtime/traffic-splitter.md | 14 +++++++------- docs/workloads/realtime/troubleshooting.md | 2 +- docs/workloads/task/jobs.md | 4 +++- pkg/lib/aws/elb.go | 2 +- 7 files changed, 26 insertions(+), 18 deletions(-) diff --git a/docs/workloads/async/autoscaling.md 
b/docs/workloads/async/autoscaling.md index 358b27753c..c83c537154 100644 --- a/docs/workloads/async/autoscaling.md +++ b/docs/workloads/async/autoscaling.md @@ -6,9 +6,9 @@ Cortex auto-scales AsyncAPIs on a per-API basis based on your configuration. ### Relevant pod configuration -In addition to the autoscaling configuration options (described below), there are two fields in the pod configuration which are relevant to replica autoscaling: +In addition to the autoscaling configuration options (described below), there is one field in the pod configuration which is relevant to replica autoscaling: -**`max_concurrency`** (default: 1): The the maximum number of requests that will be concurrently sent into the container by Cortex. If your web server is designed to handle multiple concurrent requests, increasing `max_concurrency` will increase the throughput of a replica (and result in fewer total replicas for a given load). +**`max_concurrency`** (default: 1): The maximum number of requests that will be concurrently sent into the container by Cortex. If your web server is designed to handle multiple concurrent requests, increasing `max_concurrency` will increase the throughput of a replica (and result in fewer total replicas for a given load).
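For illustration, here is a minimal sketch of a container web server that can benefit from a `max_concurrency` greater than 1. FastAPI/uvicorn and the `CORTEX_PORT` environment variable (defaulting to 8080) are assumptions here, not requirements — any web server that handles concurrent requests works:

```python
import os

import uvicorn
from fastapi import FastAPI

app = FastAPI()


@app.post("/")
async def handle(payload: dict):
    # an async handler lets a single replica work on several in-flight requests at once,
    # which is what a max_concurrency value greater than 1 takes advantage of
    return {"echo": payload}


if __name__ == "__main__":
    # listen on the port Cortex sends requests to (assumed here to be $CORTEX_PORT, default 8080)
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("CORTEX_PORT", "8080")))
```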
@@ -26,7 +26,7 @@ In addition to the autoscaling configuration options (described below), there ar The autoscaler uses this formula to determine the number of desired replicas: -`desired replicas = sum(in-flight requests accross all replicas) / target_in_flight` +`desired replicas = total in-flight requests / target_in_flight` For example, setting `target_in_flight` to `max_concurrency` (the default) causes the cluster to adjust the number of replicas so that on average, there are no requests waiting in the queue. diff --git a/docs/workloads/batch/jobs.md b/docs/workloads/batch/jobs.md index 751c3fa85e..aa2f34418a 100644 --- a/docs/workloads/batch/jobs.md +++ b/docs/workloads/batch/jobs.md @@ -42,7 +42,7 @@ POST : ], "batch_size": , # the number of items per batch (the handle_batch() function is called once per batch) (required) } - "config": { # arbitrary input for this specific job (written to /cortex/spec/job.json in the API containers) (optional) + "config": { # arbitrary input for this specific job (optional) "string": } } @@ -65,6 +65,8 @@ RESPONSE: } ``` +The entire job specification is written to `/cortex/spec/job.json` in the API containers. + ### S3 file paths If your input data is a list of files such as images/videos in an S3 directory, you can define `file_path_lister` in your submission request payload. You can use `file_path_lister.s3_paths` to specify a list of files or prefixes, and `file_path_lister.includes` and/or `file_path_lister.excludes` to remove unwanted files. The S3 file paths will be aggregated into batches of size `file_path_lister.batch_size`. To learn more about fine-grained S3 file filtering see [filtering files](#filtering-files). @@ -93,7 +95,7 @@ POST : "excludes": [], # glob patterns (optional) "batch_size": , # the number of S3 file paths per batch (the handle_batch() function is called once per batch) (required) } - "config": { # arbitrary input for this specific job (written to /cortex/spec/job.json in the API containers) (optional) + "config": { # arbitrary input for this specific job (optional) "string": } } @@ -116,6 +118,8 @@ RESPONSE: } ``` +The entire job specification is written to `/cortex/spec/job.json` in the API containers. + ### Newline delimited JSON files in S3 If your input dataset is a newline delimited json file in an S3 directory (or a list of them), you can define `delimited_files` in your request payload to break up the contents of the file into batches of size `delimited_files.batch_size`. @@ -143,7 +147,7 @@ POST : "excludes": [], # glob patterns (optional) "batch_size": , # the number of json objects per batch (the handle_batch() function is called once per batch) (required) } - "config": { # arbitrary input for this specific job (written to /cortex/spec/job.json in the API containers) (optional) + "config": { # arbitrary input for this specific job (optional) "string": } } @@ -166,6 +170,8 @@ RESPONSE: } ``` +The entire job specification is written to `/cortex/spec/job.json` in the API containers. + ## Get a job's status ```bash diff --git a/docs/workloads/realtime/autoscaling.md b/docs/workloads/realtime/autoscaling.md index 0b53bf84e5..c3a7cfbaca 100644 --- a/docs/workloads/realtime/autoscaling.md +++ b/docs/workloads/realtime/autoscaling.md @@ -8,11 +8,11 @@ Cortex autoscales each API independently based on its configuration. 
In addition to the autoscaling configuration options (described below), there are two fields in the pod configuration which are relevant to replica autoscaling: -**`max_concurrency`** (default: 1): The the maximum number of requests that will be concurrently sent into the container by Cortex. If your web server is designed to handle multiple concurrent requests, increasing `max_concurrency` will increase the throughput of a replica (and result in fewer total replicas for a given load). +**`max_concurrency`** (default: 1): The maximum number of requests that will be concurrently sent into the container by Cortex. If your web server is designed to handle multiple concurrent requests, increasing `max_concurrency` will increase the throughput of a replica (and result in fewer total replicas for a given load).
-**`max_queue_length`** (default: 100): The the maximum number of requests which will be queued by the replica (beyond `max_concurrency`) before requests are rejected with HTTP error code 503. For long-running APIs, decreasing `max_replica_concurrency` and configuring the client to retry when it receives 503 responses will improve queue fairness accross replicas by preventing requests from sitting in long queues. +**`max_queue_length`** (default: 100): The maximum number of requests which will be queued by the replica (beyond `max_concurrency`) before requests are rejected with HTTP error code 503. For long-running APIs, decreasing `max_queue_length` and configuring the client to retry when it receives 503 responses will improve queue fairness across replicas by preventing requests from sitting in long queues.
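As an illustration of the client-side retry behavior mentioned above, here is a minimal sketch. The endpoint URL and payload are hypothetical, and the backoff policy is an assumption rather than a Cortex requirement:

```python
import time

import requests


def post_with_retry(endpoint, payload, max_attempts=5, backoff_seconds=1.0):
    """POST to a Realtime API, retrying when the replica's queue is full (HTTP 503)."""
    response = None
    for attempt in range(1, max_attempts + 1):
        response = requests.post(endpoint, json=payload)
        if response.status_code != 503:
            return response
        # the queue on the receiving replica is full; back off before retrying
        time.sleep(backoff_seconds * attempt)
    return response


# hypothetical endpoint; retrieve the real one with `cortex get <api_name>`
# response = post_with_retry("http://<load_balancer_url>/my-api", {"text": "hello"})
```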
diff --git a/docs/workloads/realtime/traffic-splitter.md b/docs/workloads/realtime/traffic-splitter.md index ae9094127b..07afe1726d 100644 --- a/docs/workloads/realtime/traffic-splitter.md +++ b/docs/workloads/realtime/traffic-splitter.md @@ -5,11 +5,11 @@ Traffic Splitters can be used to expose multiple RealtimeAPIs as a single endpoi ## Configuration ```yaml -- name: # Traffic Splitter name (required) - kind: TrafficSplitter - networking: - endpoint: # the endpoint for the Traffic Splitter (default: ) - apis: # list of Realtime APIs to target +- name: # name of the traffic splitter (required) + kind: TrafficSplitter # must be "TrafficSplitter" for traffic splitters (required) + networking: # networking configuration (default: see below) + endpoint: # the endpoint for the traffic splitter (default: ) + apis: # list of Realtime APIs to target (required) - name: # name of a Realtime API that is already running or is included in the same configuration file (required) weight: # percentage of traffic to route to the Realtime API (all non-shadow weights must sum to 100) (required) shadow: # duplicate incoming traffic and send fire-and-forget to this api (only one shadow per traffic splitter) (default: false) @@ -17,7 +17,7 @@ Traffic Splitters can be used to expose multiple RealtimeAPIs as a single endpoi ## Example -This example showcases Cortex's Python client, but these steps can also be performed by using the Cortex CLI with YAML files. +This example showcases Cortex's Python client, but these steps can also be performed by using the Cortex CLI. ### Deploy a traffic splitter @@ -49,7 +49,7 @@ new_traffic_splitter_spec = { cx.deploy(new_traffic_splitter_spec) ``` -### Update the APIs +### Update the target APIs ```python new_traffic_splitter_spec = { diff --git a/docs/workloads/realtime/troubleshooting.md b/docs/workloads/realtime/troubleshooting.md index 7ab9bdca35..7e01e6f524 100644 --- a/docs/workloads/realtime/troubleshooting.md +++ b/docs/workloads/realtime/troubleshooting.md @@ -62,7 +62,7 @@ You can disable rolling updates for your API in your API configuration: set `max ```yaml - name: my-api - kind: RealtimeAPI # must be "RealtimeAPI" for realtime APIs (required) + kind: RealtimeAPI # ... update_strategy: max_surge: 0 diff --git a/docs/workloads/task/jobs.md b/docs/workloads/task/jobs.md index bb38732ab3..bde647cfab 100644 --- a/docs/workloads/task/jobs.md +++ b/docs/workloads/task/jobs.md @@ -12,7 +12,7 @@ cortex get POST : { "timeout": , # duration in seconds since the submission of a job before it is terminated (optional) - "config": { # arbitrary input for this specific job (written to /cortex/spec/job.json in the API containers) (optional) + "config": { # arbitrary input for this specific job (optional) "string": } } @@ -30,6 +30,8 @@ RESPONSE: } ``` +The entire job specification is written to `/cortex/spec/job.json` in the API containers. + ## Get a job's status ```bash diff --git a/pkg/lib/aws/elb.go b/pkg/lib/aws/elb.go index f19caa8790..ccd8e624d7 100644 --- a/pkg/lib/aws/elb.go +++ b/pkg/lib/aws/elb.go @@ -42,7 +42,7 @@ func IsInstanceSupportedByNLB(instanceType string) (bool, error) { return true, nil } -// returns the the first load balancer which has all of the specified tags, or nil if no load balancers match +// returns the first load balancer which has all of the specified tags, or nil if no load balancers match func (c *Client) FindLoadBalancer(tags map[string]string) (*elbv2.LoadBalancer, error) { var loadBalancer *elbv2.LoadBalancer var fnErr error