cortexlabs · 1vn · Feb 15, 2019 · Feb 5, 2019 · Feb 5, 2019 · Feb 5, 2019
diff --git a/.travis.yml b/.travis.yml
@@ -13,6 +13,7 @@ services:
 script:
   - docker build . -f images/spark-base/Dockerfile -t cortexlabs/spark-base:latest
   - docker build . -f images/tf-base/Dockerfile -t cortexlabs/tf-base:latest
+  - docker build . -f images/tf-base-gpu/Dockerfile -t cortexlabs/tf-base-gpu:latest
 
   - ./build/images.sh images/operator operator
   - ./build/images.sh images/spark spark

diff --git a/cortex.sh b/cortex.sh
@@ -143,6 +143,8 @@ export CORTEX_IMAGE_SPARK_OPERATOR="${CORTEX_IMAGE_SPARK_OPERATOR:-cortexlabs/sp
 export CORTEX_IMAGE_TF_SERVE="${CORTEX_IMAGE_TF_SERVE:-cortexlabs/tf-serve:$CORTEX_VERSION_STABLE}"
 export CORTEX_IMAGE_TF_TRAIN="${CORTEX_IMAGE_TF_TRAIN:-cortexlabs/tf-train:$CORTEX_VERSION_STABLE}"
 export CORTEX_IMAGE_TF_API="${CORTEX_IMAGE_TF_API:-cortexlabs/tf-api:$CORTEX_VERSION_STABLE}"
+export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}"
+export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}"
 
 export AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID:-""}"
 export AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY:-""}"
@@ -298,6 +300,8 @@ function setup_configmap() {
     --from-literal='IMAGE_TF_TRAIN'=$CORTEX_IMAGE_TF_TRAIN \
     --from-literal='IMAGE_TF_SERVE'=$CORTEX_IMAGE_TF_SERVE \
     --from-literal='IMAGE_TF_API'=$CORTEX_IMAGE_TF_API \
+    --from-literal='IMAGE_TF_TRAIN_GPU'=$CORTEX_IMAGE_TF_TRAIN_GPU \
+    --from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \
     -o yaml --dry-run | kubectl apply -f - >/dev/null
 }
 

diff --git a/dev/eks.sh b/dev/eks.sh
@@ -26,9 +26,13 @@ function eks_set_cluster() {
 }
 
 if [ "$1" = "start" ]; then
-  eksctl create cluster --version=1.11 --name=$K8S_NAME  --region $K8S_REGION --nodes=$K8S_NODE_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE
+  eksctl create cluster --name=$K8S_NAME  --region $K8S_REGION --nodes-max $K8S_NODES_MAX_COUNT --nodes-min $K8S_NODES_MIN_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE
+  # Add a GPU node group only when both GPU bounds are positive; unset or empty counts are treated as 0.
+  if [ "${K8S_GPU_NODES_MIN_COUNT:-0}" -gt 0 ] && [ "${K8S_GPU_NODES_MAX_COUNT:-0}" -gt 0 ]; then
+    eksctl create nodegroup --cluster=$K8S_NAME --nodes-max=$K8S_GPU_NODES_MAX_COUNT --nodes-min=$K8S_GPU_NODES_MIN_COUNT  --node-type=$K8S_GPU_NODE_INSTANCE_TYPE --node-ami=$K8S_GPU_NODE_AMI
+  fi
   eks_set_cluster
 
 elif [ "$1" = "update" ]; then
   echo "Not implemented"
 

diff --git a/dev/kops.sh b/dev/kops.sh
@@ -131,13 +131,14 @@ spec:
   image: kope.io/k8s-1.11-debian-stretch-amd64-hvm-ebs-2018-08-17
   machineType: ${K8S_NODE_INSTANCE_TYPE}
   rootVolumeSize: ${K8S_NODE_VOLUME_SIZE}
-  maxSize: ${K8S_NODE_COUNT}
-  minSize: ${K8S_NODE_COUNT}
+  maxSize: ${K8S_NODES_MAX_COUNT}
+  minSize: ${K8S_NODES_MIN_COUNT}
   nodeLabels:
     kops.k8s.io/instancegroup: nodes
   role: Node
   subnets:
   - ${K8S_ZONE}
+
 EOM
 }
 

diff --git a/dev/registry.sh b/dev/registry.sh
@@ -46,6 +46,8 @@ function create_registry() {
   aws ecr create-repository --repository-name=cortexlabs/tf-serve --region=$REGISTRY_REGION || true
   aws ecr create-repository --repository-name=cortexlabs/tf-train --region=$REGISTRY_REGION || true
   aws ecr create-repository --repository-name=cortexlabs/tf-api --region=$REGISTRY_REGION || true
+  aws ecr create-repository --repository-name=cortexlabs/tf-train-gpu --region=$REGISTRY_REGION || true
+  aws ecr create-repository --repository-name=cortexlabs/tf-serve-gpu --region=$REGISTRY_REGION || true
 }
 
 ### HELPERS ###
@@ -114,6 +116,7 @@ elif [ "$CMD" = "update" ]; then
     cache_builder $ROOT/images/spark-base spark-base
     build_base $ROOT/images/spark-base spark-base
     build_base $ROOT/images/tf-base tf-base
+    build_base $ROOT/images/tf-base-gpu tf-base-gpu
 
     cache_builder $ROOT/images/operator operator
     build_and_push $ROOT/images/operator operator latest
@@ -127,10 +130,12 @@ elif [ "$CMD" = "update" ]; then
     build_and_push $ROOT/images/argo-controller argo-controller latest
     build_and_push $ROOT/images/argo-executor argo-executor latest
     build_and_push $ROOT/images/tf-serve tf-serve latest
+    build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest
   fi
 
   build_and_push $ROOT/images/spark spark latest
   build_and_push $ROOT/images/tf-train tf-train latest
+  build_and_push $ROOT/images/tf-train-gpu tf-train-gpu latest
   build_and_push $ROOT/images/tf-api tf-api latest
 
   cleanup

diff --git a/docs/applications/resources/models.md b/docs/applications/resources/models.md
@@ -44,6 +44,7 @@ Train custom TensorFlow models at scale.
   compute:
     cpu: <string>  # CPU request (default: Null)
     mem: <string>  # memory request (default: Null)
+    gpu: <int>  # GPU request (default: Null)
 
   tags:
     <string>: <scalar>  # arbitrary key/value pairs to attach to the resource (optional)

diff --git a/images/tf-base-gpu/Dockerfile b/images/tf-base-gpu/Dockerfile
@@ -0,0 +1,55 @@
+# Adapted from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
+# Base image for GPU workloads: CUDA 9.0 libraries, cuDNN 7, the TensorRT 5 runtime, Python 3 and tensorflow-gpu.
+FROM nvidia/cuda:9.0-base-ubuntu16.04
+
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    cuda-command-line-tools-9-0 \
+    cuda-cublas-9-0 \
+    cuda-cufft-9-0 \
+    cuda-curand-9-0 \
+    cuda-cusolver-9-0 \
+    cuda-cusparse-9-0 \
+    libcudnn7=7.4.1.5-1+cuda9.0 \
+    libfreetype6-dev \
+    libhdf5-serial-dev \
+    libpng12-dev \
+    libzmq3-dev \
+    pkg-config \
+    software-properties-common \
+    unzip
+
+# Register the TensorRT apt repository, then install the TensorRT runtime from it.
+# -y keeps the repo-package install non-interactive during `docker build`.
+RUN apt-get update && \
+    apt-get install -y nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda9.0 \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda9.0 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update -qq && apt-get install -y -q \
+        build-essential \
+        curl \
+        wget \
+        python3 \
+        python3-pip \
+        python3-dev \
+        rsync \
+        zlib1g-dev \
+    && apt-get clean -qq && rm -rf /var/lib/apt/lists/*
+
+RUN pip3 --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary. Link to the interpreter's absolute path:
+# a relative "python3" target would resolve inside /usr/local/bin, where no python3 exists.
+RUN ln -s "$(which python3)" /usr/local/bin/python
+
+RUN pip3 --no-cache-dir install tensorflow-gpu==1.12
diff --git a/images/tf-serve-gpu/Dockerfile b/images/tf-serve-gpu/Dockerfile
@@ -0,0 +1,33 @@
+# TensorFlow Serving image for GPU-backed APIs, built on the CUDA 9.0 base image.
+FROM nvidia/cuda:9.0-base-ubuntu16.04
+
+ARG TF_VERSION="1.12.0"
+
+RUN apt-get update -qq && apt-get install -y -q \
+        automake \
+        build-essential \
+        curl \
+        libcurl3-dev \
+        git \
+        libtool \
+        libfreetype6-dev \
+        libpng-dev \
+        libzmq3-dev \
+        pkg-config \
+        python3-dev \
+        python3-numpy \
+        python3-pip \
+        software-properties-common \
+        swig \
+        zip \
+        zlib1g-dev \
+    && apt-get clean -qq && rm -rf /var/lib/apt/lists/*
+
+# Download, install and delete the package in a single layer so the .deb is not kept in the image.
+# -f makes curl fail on an HTTP error instead of saving the error page as the package.
+# NOTE(review): this is the stock tensorflow-model-server package - confirm it is built with CUDA support.
+RUN curl -fsSL -o tensorflow-model-server.deb https://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-${TF_VERSION}/t/tensorflow-model-server/tensorflow-model-server_${TF_VERSION}_all.deb \
+    && dpkg -i tensorflow-model-server.deb \
+    && rm tensorflow-model-server.deb
+
+ENTRYPOINT ["tensorflow_model_server"]
diff --git a/images/tf-serve/Dockerfile b/images/tf-serve/Dockerfile
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 
-FROM ubuntu:18.04
+FROM ubuntu:16.04
 
 ARG TF_VERSION="1.12.0"
 

diff --git a/images/tf-train-gpu/Dockerfile b/images/tf-train-gpu/Dockerfile
@@ -0,0 +1,18 @@
+# GPU training image: adds the workload sources and their Python requirements on top of tf-base-gpu.
+FROM cortexlabs/tf-base-gpu
+
+ENV PYTHONPATH="/src:${PYTHONPATH}"
+
+# Requirements are copied and installed before the sources are added.
+COPY pkg/workloads/lib/requirements.txt /src/lib/requirements.txt
+COPY pkg/workloads/tf_train/requirements.txt /src/tf_train/requirements.txt
+RUN pip3 install -r /src/lib/requirements.txt \
+    && pip3 install -r /src/tf_train/requirements.txt \
+    && rm -rf /root/.cache/pip*
+
+# Workload sources.
+COPY pkg/workloads/consts.py /src/
+COPY pkg/workloads/lib /src/lib
+COPY pkg/workloads/tf_train /src/tf_train
+
+ENTRYPOINT ["/usr/bin/python3", "/src/tf_train/train.py"]
diff --git a/pkg/api/userconfig/compute.go b/pkg/api/userconfig/compute.go
@@ -142,6 +142,7 @@ func (sparkCompute *SparkCompute) ID() string {
 type TFCompute struct {
 	CPU *Quantity `json:"cpu" yaml:"cpu"`
 	Mem *Quantity `json:"mem" yaml:"mem"`
+	GPU *int64    `json:"gpu" yaml:"gpu"`
 }
 
 var tfComputeFieldValidation = &cr.StructFieldValidation{
@@ -166,6 +167,13 @@ var tfComputeFieldValidation = &cr.StructFieldValidation{
 					Min: k8sresource.MustParse("0"),
 				}),
 			},
+			&cr.StructFieldValidation{
+				StructField: "GPU",
+				Int64PtrValidation: &cr.Int64PtrValidation{
+					Default:     nil,
+					GreaterThan: util.Int64Ptr(0),
+				},
+			},
 		},
 	},
 }
@@ -181,6 +189,7 @@ type APICompute struct {
 	Replicas int32     `json:"replicas" yaml:"replicas"`
 	CPU      *Quantity `json:"cpu" yaml:"cpu"`
 	Mem      *Quantity `json:"mem" yaml:"mem"`
+	GPU      *int64    `json:"gpu" yaml:"gpu"`
 }
 
 var apiComputeFieldValidation = &cr.StructFieldValidation{
@@ -212,6 +221,13 @@ var apiComputeFieldValidation = &cr.StructFieldValidation{
 					Min: k8sresource.MustParse("0"),
 				}),
 			},
+			&cr.StructFieldValidation{
+				StructField: "GPU",
+				Int64PtrValidation: &cr.Int64PtrValidation{
+					Default:     nil,
+					GreaterThan: util.Int64Ptr(0),
+				},
+			},
 		},
 	},
 }
@@ -284,6 +300,11 @@ func MaxTFCompute(tfComputes ...*TFCompute) *TFCompute {
 				aggregated.Mem = tfCompute.Mem
 			}
 		}
+		// Keep the largest GPU request seen so far. A nil GPU means no GPU was
+		// requested, so it never replaces a value that is already aggregated.
+		if tfCompute.GPU != nil && (aggregated.GPU == nil || *tfCompute.GPU > *aggregated.GPU) {
+			aggregated.GPU = tfCompute.GPU
+		}
 	}
 
 	return &aggregated
@@ -299,5 +320,18 @@ func (apiCompute *APICompute) Equal(apiCompute2 APICompute) bool {
 	if !QuantityPtrsEqual(apiCompute.Mem, apiCompute2.Mem) {
 		return false
 	}
+
+	if apiCompute.GPU != nil && apiCompute2.GPU == nil {
+		return false
+	}
+
+	if apiCompute.GPU == nil && apiCompute2.GPU != nil {
+		return false
+	}
+
+	if apiCompute.GPU != nil && apiCompute2.GPU != nil && *apiCompute.GPU != *apiCompute2.GPU {
+		return false
+	}
+
 	return true
 }
diff --git a/pkg/operator/cortexconfig/cortex_config.go b/pkg/operator/cortexconfig/cortex_config.go
@@ -24,15 +24,17 @@ import (
 )
 
 var (
-	LogGroup      string
-	Bucket        string
-	Region        string
-	Namespace     string
-	OperatorImage string
-	SparkImage    string
-	TFTrainImage  string
-	TFServeImage  string
-	TFAPIImage    string
+	LogGroup        string
+	Bucket          string
+	Region          string
+	Namespace       string
+	OperatorImage   string
+	SparkImage      string
+	TFTrainImage    string
+	TFServeImage    string
+	TFAPIImage      string
+	TFTrainImageGPU string
+	TFServeImageGPU string
 )
 
 func init() {
@@ -45,6 +47,8 @@ func init() {
 	TFTrainImage = getStr("IMAGE_TF_TRAIN")
 	TFServeImage = getStr("IMAGE_TF_SERVE")
 	TFAPIImage = getStr("IMAGE_TF_API")
+	TFTrainImageGPU = getStr("IMAGE_TF_TRAIN_GPU")
+	TFServeImageGPU = getStr("IMAGE_TF_SERVE_GPU")
 }
 
 //

diff --git a/pkg/operator/workloads/api.go b/pkg/operator/workloads/api.go
@@ -21,6 +21,7 @@ import (
 
 	appsv1b1 "k8s.io/api/apps/v1beta1"
 	corev1 "k8s.io/api/core/v1"
+	k8sresource "k8s.io/apimachinery/pkg/api/resource"
 
 	"github.com/cortexlabs/cortex/pkg/api/context"
 	s "github.com/cortexlabs/cortex/pkg/api/strings"
@@ -47,6 +48,8 @@ func apiSpec(
 
 	transformResourceList := corev1.ResourceList{}
 	tfServingResourceList := corev1.ResourceList{}
+	transformLimitsList := corev1.ResourceList{}
+	tfServingLimitsList := corev1.ResourceList{}
 
 	if apiCompute.CPU != nil {
 		q1, q2 := apiCompute.CPU.SplitInTwo()
@@ -59,6 +62,13 @@ func apiSpec(
 		tfServingResourceList[corev1.ResourceMemory] = *q2
 	}
 
+	servingImage := cc.TFServeImage
+	if apiCompute.GPU != nil {
+		servingImage = cc.TFServeImageGPU
+		tfServingResourceList["nvidia.com/gpu"] = *k8sresource.NewQuantity(*apiCompute.GPU, k8sresource.DecimalSI)
+		tfServingLimitsList["nvidia.com/gpu"] = *k8sresource.NewQuantity(*apiCompute.GPU, k8sresource.DecimalSI)
+	}
+
 	return k8s.Deployment(&k8s.DeploymentSpec{
 		Name:     internalAPIName(apiName, ctx.App.Name),
 		Replicas: ctx.APIs[apiName].Compute.Replicas,
@@ -102,11 +112,12 @@ func apiSpec(
 						VolumeMounts: k8s.DefaultVolumeMounts(),
 						Resources: corev1.ResourceRequirements{
 							Requests: transformResourceList,
+							Limits:   transformLimitsList,
 						},
 					},
 					{
 						Name:            tfServingContainerName,
-						Image:           cc.TFServeImage,
+						Image:           servingImage,
 						ImagePullPolicy: "Always",
 						Args: []string{
 							"--port=" + tfServingPortStr,
@@ -116,6 +127,7 @@ func apiSpec(
 						VolumeMounts: k8s.DefaultVolumeMounts(),
 						Resources: corev1.ResourceRequirements{
 							Requests: tfServingResourceList,
+							Limits:   tfServingLimitsList,
 						},
 					},
 				},