Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ services:
script:
- docker build . -f images/spark-base/Dockerfile -t cortexlabs/spark-base:latest
- docker build . -f images/tf-base/Dockerfile -t cortexlabs/tf-base:latest
- docker build . -f images/tf-base-gpu/Dockerfile -t cortexlabs/tf-base-gpu:latest

- ./build/images.sh images/operator operator
- ./build/images.sh images/spark spark
Expand Down
4 changes: 4 additions & 0 deletions cortex.sh
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ export CORTEX_IMAGE_SPARK_OPERATOR="${CORTEX_IMAGE_SPARK_OPERATOR:-cortexlabs/sp
export CORTEX_IMAGE_TF_SERVE="${CORTEX_IMAGE_TF_SERVE:-cortexlabs/tf-serve:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_TF_TRAIN="${CORTEX_IMAGE_TF_TRAIN:-cortexlabs/tf-train:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_TF_API="${CORTEX_IMAGE_TF_API:-cortexlabs/tf-api:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}"

export AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID:-""}"
export AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY:-""}"
Expand Down Expand Up @@ -298,6 +300,8 @@ function setup_configmap() {
--from-literal='IMAGE_TF_TRAIN'=$CORTEX_IMAGE_TF_TRAIN \
--from-literal='IMAGE_TF_SERVE'=$CORTEX_IMAGE_TF_SERVE \
--from-literal='IMAGE_TF_API'=$CORTEX_IMAGE_TF_API \
--from-literal='IMAGE_TF_TRAIN_GPU'=$CORTEX_IMAGE_TF_TRAIN_GPU \
--from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \
-o yaml --dry-run | kubectl apply -f - >/dev/null
}

Expand Down
7 changes: 5 additions & 2 deletions dev/eks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@ function eks_set_cluster() {
}

if [ "$1" = "start" ]; then
eksctl create cluster --version=1.11 --name=$K8S_NAME --region $K8S_REGION --nodes=$K8S_NODE_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE
eksctl create cluster --name=$K8S_NAME --region $K8S_REGION --nodes-max $K8S_NODES_MAX_COUNT --nodes-min $K8S_NODES_MIN_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE
if [ $K8S_GPU_NODES_MIN_COUNT -gt 0 ] && [ $K8S_GPU_NODES_MAX_COUNT -gt 0 ]; then
eksctl create nodegroup --cluster=$K8S_NAME --nodes-max=$K8S_GPU_NODES_MAX_COUNT --nodes-min=$K8S_GPU_NODES_MIN_COUNT --node-type=$K8S_GPU_NODE_INSTANCE_TYPE --node-ami=$K8S_GPU_NODE_AMI
fi
eks_set_cluster

elif [ "$1" = "update" ]; then
echo "Not implemented"

Expand Down
5 changes: 3 additions & 2 deletions dev/kops.sh
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,14 @@ spec:
image: kope.io/k8s-1.11-debian-stretch-amd64-hvm-ebs-2018-08-17
machineType: ${K8S_NODE_INSTANCE_TYPE}
rootVolumeSize: ${K8S_NODE_VOLUME_SIZE}
maxSize: ${K8S_NODE_COUNT}
minSize: ${K8S_NODE_COUNT}
maxSize: ${K8S_NODES_MAX_COUNT}
minSize: ${K8S_NODES_MIN_COUNT}
nodeLabels:
kops.k8s.io/instancegroup: nodes
role: Node
subnets:
- ${K8S_ZONE}

EOM
}

Expand Down
5 changes: 5 additions & 0 deletions dev/registry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ function create_registry() {
aws ecr create-repository --repository-name=cortexlabs/tf-serve --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/tf-train --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/tf-api --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/tf-train-gpu --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/tf-serve-gpu --region=$REGISTRY_REGION || true
}

### HELPERS ###
Expand Down Expand Up @@ -114,6 +116,7 @@ elif [ "$CMD" = "update" ]; then
cache_builder $ROOT/images/spark-base spark-base
build_base $ROOT/images/spark-base spark-base
build_base $ROOT/images/tf-base tf-base
build_base $ROOT/images/tf-base-gpu tf-base-gpu

cache_builder $ROOT/images/operator operator
build_and_push $ROOT/images/operator operator latest
Expand All @@ -127,10 +130,12 @@ elif [ "$CMD" = "update" ]; then
build_and_push $ROOT/images/argo-controller argo-controller latest
build_and_push $ROOT/images/argo-executor argo-executor latest
build_and_push $ROOT/images/tf-serve tf-serve latest
build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest
fi

build_and_push $ROOT/images/spark spark latest
build_and_push $ROOT/images/tf-train tf-train latest
build_and_push $ROOT/images/tf-train-gpu tf-train-gpu latest
build_and_push $ROOT/images/tf-api tf-api latest

cleanup
Expand Down
1 change: 1 addition & 0 deletions docs/applications/resources/models.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ Train custom TensorFlow models at scale.
compute:
cpu: <string> # CPU request (default: Null)
mem: <string> # memory request (default: Null)
gpu: <int> # GPU request (default: Null)

tags:
<string>: <scalar> # arbitrary key/value pairs to attach to the resource (optional)
Expand Down
51 changes: 51 additions & 0 deletions images/tf-base-gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Base image for GPU TensorFlow workloads: CUDA 9.0 runtime libraries, cuDNN 7,
# the TensorRT runtime (libnvinfer5), Python 3 and tensorflow-gpu.
# Adapted from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
FROM nvidia/cuda:9.0-base-ubuntu16.04

# Pick up some TF dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cuda-command-line-tools-9-0 \
    cuda-cublas-9-0 \
    cuda-cufft-9-0 \
    cuda-curand-9-0 \
    cuda-cusolver-9-0 \
    cuda-cusparse-9-0 \
    libcudnn7=7.4.1.5-1+cuda9.0 \
    libfreetype6-dev \
    libhdf5-serial-dev \
    libpng12-dev \
    libzmq3-dev \
    pkg-config \
    software-properties-common \
    unzip

# Register the TensorRT apt repository package first, then install libnvinfer
# from it. Both installs pass -y so the build never stops on an apt prompt.
RUN apt-get update && \
    apt-get install -y nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda9.0 \
    && apt-get update \
    && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda9.0 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*


# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8

RUN apt-get update -qq && apt-get install -y -q \
    build-essential \
    curl \
    wget \
    python3 \
    python3-pip \
    python3-dev \
    rsync \
    zlib1g-dev \
    && apt-get clean -qq && rm -rf /var/lib/apt/lists/*

RUN pip3 --no-cache-dir install --upgrade \
    pip \
    setuptools

# Some TF tools expect a "python" binary.
# Link to the absolute interpreter path: a bare "python3" target would be
# resolved relative to /usr/local/bin, where no python3 exists, leaving a
# dangling symlink.
RUN ln -s "$(which python3)" /usr/local/bin/python

# --no-cache-dir keeps the downloaded wheels out of the image layer.
RUN pip3 --no-cache-dir install tensorflow-gpu==1.12
28 changes: 28 additions & 0 deletions images/tf-serve-gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# TensorFlow Serving image built on the CUDA 9.0 base image.
FROM nvidia/cuda:9.0-base-ubuntu16.04

# Version of the tensorflow-model-server package to install; override with --build-arg.
ARG TF_VERSION="1.12.0"

RUN apt-get update -qq && apt-get install -y -q \
    automake \
    build-essential \
    curl \
    libcurl3-dev \
    git \
    libtool \
    libfreetype6-dev \
    libpng-dev \
    libzmq3-dev \
    pkg-config \
    python3-dev \
    python3-numpy \
    python3-pip \
    software-properties-common \
    swig \
    zip \
    zlib1g-dev \
    && apt-get clean -qq && rm -rf /var/lib/apt/lists/*

# Fetch the package over HTTPS and fail the build on an HTTP error (-f) instead
# of saving an error page as the .deb. Install and delete the archive in the
# same layer so it does not remain in the image.
# NOTE(review): confirm that this tensorflow-model-server package is built with
# GPU support; nothing in this file installs a GPU-specific server build.
RUN curl -fsSL -o tensorflow-model-server.deb https://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-${TF_VERSION}/t/tensorflow-model-server/tensorflow-model-server_${TF_VERSION}_all.deb \
    && dpkg -i tensorflow-model-server.deb \
    && rm tensorflow-model-server.deb

ENTRYPOINT ["tensorflow_model_server"]
2 changes: 1 addition & 1 deletion images/tf-serve/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.


FROM ubuntu:18.04
FROM ubuntu:16.04

ARG TF_VERSION="1.12.0"

Expand Down
15 changes: 15 additions & 0 deletions images/tf-train-gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Training image for GPU workloads, layered on the cortexlabs/tf-base-gpu image.
FROM cortexlabs/tf-base-gpu

# Put /src on the Python module search path; the workload code is copied there below.
ENV PYTHONPATH="/src:${PYTHONPATH}"

# Copy only the requirements files and install them before the sources are copied,
# presumably so that source-only changes do not invalidate the dependency layer.
# The pip cache directory is removed in the same RUN step.
COPY pkg/workloads/lib/requirements.txt /src/lib/requirements.txt
COPY pkg/workloads/tf_train/requirements.txt /src/tf_train/requirements.txt
RUN pip3 install -r /src/lib/requirements.txt && \
pip3 install -r /src/tf_train/requirements.txt && \
rm -rf /root/.cache/pip*

# Copy the shared constants, the common library and the training package into /src.
COPY pkg/workloads/consts.py /src/
COPY pkg/workloads/lib /src/lib
COPY pkg/workloads/tf_train /src/tf_train

# Run the training script with python3 when the container starts.
ENTRYPOINT ["/usr/bin/python3", "/src/tf_train/train.py"]
34 changes: 34 additions & 0 deletions pkg/api/userconfig/compute.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ func (sparkCompute *SparkCompute) ID() string {
// TFCompute holds the optional compute requests (CPU, memory, GPU) for a
// TensorFlow workload. Every field is a pointer; the GPU validation in this
// file defaults it to nil.
type TFCompute struct {
// CPU is the CPU request.
CPU *Quantity `json:"cpu" yaml:"cpu"`
// Mem is the memory request.
Mem *Quantity `json:"mem" yaml:"mem"`
// GPU is the GPU request as a whole number; the field validation requires
// a value greater than 0 when it is set.
GPU *int64 `json:"gpu" yaml:"gpu"`
}

var tfComputeFieldValidation = &cr.StructFieldValidation{
Expand All @@ -166,6 +167,13 @@ var tfComputeFieldValidation = &cr.StructFieldValidation{
Min: k8sresource.MustParse("0"),
}),
},
&cr.StructFieldValidation{
StructField: "GPU",
Int64PtrValidation: &cr.Int64PtrValidation{
Default: nil,
GreaterThan: util.Int64Ptr(0),
},
},
},
},
}
Expand All @@ -181,6 +189,7 @@ type APICompute struct {
Replicas int32 `json:"replicas" yaml:"replicas"`
CPU *Quantity `json:"cpu" yaml:"cpu"`
Mem *Quantity `json:"mem" yaml:"mem"`
GPU *int64 `json:"gpu" yaml:"gpu"`
}

var apiComputeFieldValidation = &cr.StructFieldValidation{
Expand Down Expand Up @@ -212,6 +221,13 @@ var apiComputeFieldValidation = &cr.StructFieldValidation{
Min: k8sresource.MustParse("0"),
}),
},
&cr.StructFieldValidation{
StructField: "GPU",
Int64PtrValidation: &cr.Int64PtrValidation{
Default: nil,
GreaterThan: util.Int64Ptr(0),
},
},
},
},
}
Expand Down Expand Up @@ -284,6 +300,11 @@ func MaxTFCompute(tfComputes ...*TFCompute) *TFCompute {
aggregated.Mem = tfCompute.Mem
}
}
// Keep the largest GPU request seen so far. The candidate must be compared
// against the current aggregate: comparing it against 0 is true for every
// validated value and would make the last non-nil request win instead of
// the maximum.
if tfCompute.GPU != nil {
	if aggregated.GPU == nil || *tfCompute.GPU > *aggregated.GPU {
		aggregated.GPU = tfCompute.GPU
	}
}
}

return &aggregated
Expand All @@ -299,5 +320,18 @@ func (apiCompute *APICompute) Equal(apiCompute2 APICompute) bool {
if !QuantityPtrsEqual(apiCompute.Mem, apiCompute2.Mem) {
return false
}

if apiCompute.GPU != nil && apiCompute2.GPU == nil {
return false
}

if apiCompute.GPU == nil && apiCompute2.GPU != nil {
return false
}

if apiCompute.GPU != nil && apiCompute2.GPU != nil && *apiCompute.GPU != *apiCompute2.GPU {
return false
}

return true
}
22 changes: 13 additions & 9 deletions pkg/operator/cortexconfig/cortex_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,17 @@ import (
)

var (
LogGroup string
Bucket string
Region string
Namespace string
OperatorImage string
SparkImage string
TFTrainImage string
TFServeImage string
TFAPIImage string
LogGroup string
Bucket string
Region string
Namespace string
OperatorImage string
SparkImage string
TFTrainImage string
TFServeImage string
TFAPIImage string
TFTrainImageGPU string
TFServeImageGPU string
)

func init() {
Expand All @@ -45,6 +47,8 @@ func init() {
TFTrainImage = getStr("IMAGE_TF_TRAIN")
TFServeImage = getStr("IMAGE_TF_SERVE")
TFAPIImage = getStr("IMAGE_TF_API")
TFTrainImageGPU = getStr("IMAGE_TF_TRAIN_GPU")
TFServeImageGPU = getStr("IMAGE_TF_SERVE_GPU")
}

//
Expand Down
14 changes: 13 additions & 1 deletion pkg/operator/workloads/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (

appsv1b1 "k8s.io/api/apps/v1beta1"
corev1 "k8s.io/api/core/v1"
k8sresource "k8s.io/apimachinery/pkg/api/resource"

"github.com/cortexlabs/cortex/pkg/api/context"
s "github.com/cortexlabs/cortex/pkg/api/strings"
Expand All @@ -47,6 +48,8 @@ func apiSpec(

transformResourceList := corev1.ResourceList{}
tfServingResourceList := corev1.ResourceList{}
transformLimitsList := corev1.ResourceList{}
tfServingLimitsList := corev1.ResourceList{}

if apiCompute.CPU != nil {
q1, q2 := apiCompute.CPU.SplitInTwo()
Expand All @@ -59,6 +62,13 @@ func apiSpec(
tfServingResourceList[corev1.ResourceMemory] = *q2
}

servingImage := cc.TFServeImage
if apiCompute.GPU != nil {
servingImage = cc.TFServeImageGPU
tfServingResourceList["nvidia.com/gpu"] = *k8sresource.NewQuantity(*apiCompute.GPU, k8sresource.DecimalSI)
tfServingLimitsList["nvidia.com/gpu"] = *k8sresource.NewQuantity(*apiCompute.GPU, k8sresource.DecimalSI)
}

return k8s.Deployment(&k8s.DeploymentSpec{
Name: internalAPIName(apiName, ctx.App.Name),
Replicas: ctx.APIs[apiName].Compute.Replicas,
Expand Down Expand Up @@ -102,11 +112,12 @@ func apiSpec(
VolumeMounts: k8s.DefaultVolumeMounts(),
Resources: corev1.ResourceRequirements{
Requests: transformResourceList,
Limits: transformLimitsList,
},
},
{
Name: tfServingContainerName,
Image: cc.TFServeImage,
Image: servingImage,
ImagePullPolicy: "Always",
Args: []string{
"--port=" + tfServingPortStr,
Expand All @@ -116,6 +127,7 @@ func apiSpec(
VolumeMounts: k8s.DefaultVolumeMounts(),
Resources: corev1.ResourceRequirements{
Requests: tfServingResourceList,
Limits: tfServingLimitsList,
},
},
},
Expand Down
Loading