Skip to content

Add cloudwatch statsd daemonset #419

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits on
Sep 4, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ ci-build-images:
@./build/build-image.sh images/onnx-serve-gpu onnx-serve-gpu
@./build/build-image.sh images/operator operator
@./build/build-image.sh images/fluentd fluentd
@./build/build-image.sh images/statsd statsd
@./build/build-image.sh images/cluster-autoscaler cluster-autoscaler
@./build/build-image.sh images/nvidia nvidia
@./build/build-image.sh images/metrics-server metrics-server
Expand All @@ -164,6 +165,7 @@ ci-push-images:
@./build/push-image.sh onnx-serve-gpu
@./build/push-image.sh operator
@./build/push-image.sh fluentd
@./build/push-image.sh statsd
@./build/push-image.sh cluster-autoscaler
@./build/push-image.sh nvidia
@./build/push-image.sh metrics-server
Expand Down
2 changes: 2 additions & 0 deletions cortex.sh
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ export CORTEX_NAMESPACE="${CORTEX_NAMESPACE:-cortex}"

export CORTEX_IMAGE_MANAGER="${CORTEX_IMAGE_MANAGER:-cortexlabs/manager:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_FLUENTD="${CORTEX_IMAGE_FLUENTD:-cortexlabs/fluentd:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_STATSD="${CORTEX_IMAGE_STATSD:-cortexlabs/statsd:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_OPERATOR="${CORTEX_IMAGE_OPERATOR:-cortexlabs/operator:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_TF_SERVE="${CORTEX_IMAGE_TF_SERVE:-cortexlabs/tf-serve:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_TF_API="${CORTEX_IMAGE_TF_API:-cortexlabs/tf-api:$CORTEX_VERSION_STABLE}"
Expand Down Expand Up @@ -227,6 +228,7 @@ function install_cortex() {
-e CORTEX_LOG_GROUP=$CORTEX_LOG_GROUP \
-e CORTEX_BUCKET=$CORTEX_BUCKET \
-e CORTEX_IMAGE_FLUENTD=$CORTEX_IMAGE_FLUENTD \
-e CORTEX_IMAGE_STATSD=$CORTEX_IMAGE_STATSD \
-e CORTEX_IMAGE_OPERATOR=$CORTEX_IMAGE_OPERATOR \
-e CORTEX_IMAGE_TF_SERVE=$CORTEX_IMAGE_TF_SERVE \
-e CORTEX_IMAGE_TF_API=$CORTEX_IMAGE_TF_API \
Expand Down
3 changes: 2 additions & 1 deletion dev/registry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ function ecr_login() {
function create_registry() {
aws ecr create-repository --repository-name=cortexlabs/manager --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/fluentd --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/statsd --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/istio-citadel --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/istio-pilot --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/istio-galley --region=$REGISTRY_REGION || true
Expand Down Expand Up @@ -128,6 +129,7 @@ elif [ "$cmd" = "update" ]; then
build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest

build_and_push $ROOT/images/fluentd fluentd latest
build_and_push $ROOT/images/statsd statsd latest
build_and_push $ROOT/images/cluster-autoscaler cluster-autoscaler latest
build_and_push $ROOT/images/nvidia nvidia latest
build_and_push $ROOT/images/metrics-server metrics-server latest
Expand All @@ -140,7 +142,6 @@ elif [ "$cmd" = "update" ]; then
build_and_push $ROOT/images/downloader downloader latest
build_and_push $ROOT/images/tf-api tf-api latest
build_and_push $ROOT/images/onnx-serve onnx-serve latest
build_and_push $ROOT/images/onnx-serve-gpu onnx-serve-gpu latest

cleanup
fi
1 change: 1 addition & 0 deletions docs/cluster/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ export CORTEX_NAMESPACE="cortex"
# Image paths
export CORTEX_IMAGE_MANAGER="cortexlabs/manager:master"
export CORTEX_IMAGE_FLUENTD="cortexlabs/fluentd:master"
export CORTEX_IMAGE_STATSD="cortexlabs/statsd:master"
export CORTEX_IMAGE_OPERATOR="cortexlabs/operator:master"
export CORTEX_IMAGE_TF_SERVE="cortexlabs/tf-serve:master"
export CORTEX_IMAGE_TF_API="cortexlabs/tf-api:master"
Expand Down
1 change: 1 addition & 0 deletions docs/development.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ export CORTEX_NAMESPACE="cortex"

export CORTEX_IMAGE_MANAGER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/manager:latest"
export CORTEX_IMAGE_FLUENTD="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/fluentd:latest"
export CORTEX_IMAGE_STATSD="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/statsd:latest"
export CORTEX_IMAGE_ONNX_SERVE="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/onnx-serve:latest"
export CORTEX_IMAGE_ONNX_SERVE_GPU="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/onnx-serve-gpu:latest"
export CORTEX_IMAGE_OPERATOR="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/operator:latest"
Expand Down
1 change: 1 addition & 0 deletions images/statsd/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
FROM amazon/cloudwatch-agent:1.226589.0
4 changes: 2 additions & 2 deletions manager/manifests/statsd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ spec:
spec:
containers:
- name: cloudwatch-agent
image: amazon/cloudwatch-agent:latest
image: $CORTEX_IMAGE_STATSD
imagePullPolicy: Always
ports:
# containerPort must match the listen port defined in the ConfigMap
Expand All @@ -65,7 +65,7 @@ spec:
cpu: 200m
memory: 100Mi
requests:
cpu: 200m
cpu: 100m
memory: 100Mi
# Please don't change the env
env:
Expand Down
31 changes: 19 additions & 12 deletions pkg/operator/workloads/api_workload.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,14 +330,18 @@ func tfAPISpec(
"--cache-dir=" + consts.ContextCacheDir,
"--project-dir=" + path.Join(consts.EmptyDirMountPath, "project"),
},
Env: append(k8s.AWSCredentials(), kcore.EnvVar{
Name: "HOST_IP",
ValueFrom: &kcore.EnvVarSource{
FieldRef: &kcore.ObjectFieldSelector{
FieldPath: "status.hostIP",
Env: append(
k8s.AWSCredentials(),
kcore.EnvVar{
Name: "HOST_IP",
ValueFrom: &kcore.EnvVarSource{
FieldRef: &kcore.ObjectFieldSelector{
FieldPath: "status.hostIP",
},
},
},
}), VolumeMounts: k8s.DefaultVolumeMounts(),
),
VolumeMounts: k8s.DefaultVolumeMounts(),
ReadinessProbe: &kcore.Probe{
InitialDelaySeconds: 5,
TimeoutSeconds: 5,
Expand Down Expand Up @@ -494,14 +498,17 @@ func onnxAPISpec(
"--cache-dir=" + consts.ContextCacheDir,
"--project-dir=" + path.Join(consts.EmptyDirMountPath, "project"),
},
Env: append(k8s.AWSCredentials(), kcore.EnvVar{
Name: "HOST_IP",
ValueFrom: &kcore.EnvVarSource{
FieldRef: &kcore.ObjectFieldSelector{
FieldPath: "status.hostIP",
Env: append(
k8s.AWSCredentials(),
kcore.EnvVar{
Name: "HOST_IP",
ValueFrom: &kcore.EnvVarSource{
FieldRef: &kcore.ObjectFieldSelector{
FieldPath: "status.hostIP",
},
},
},
}),
),
VolumeMounts: k8s.DefaultVolumeMounts(),
ReadinessProbe: &kcore.Probe{
InitialDelaySeconds: 5,
Expand Down
22 changes: 14 additions & 8 deletions pkg/operator/workloads/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,17 +245,23 @@ func getAPIDimensions(appName string, api *context.API) []*cloudwatch.Dimension
}

func getAPIDimensionsCounter(appName string, api *context.API) []*cloudwatch.Dimension {
return append(getAPIDimensions(appName, api), &cloudwatch.Dimension{
Name: aws.String("metric_type"),
Value: aws.String("counter"),
})
return append(
getAPIDimensions(appName, api),
&cloudwatch.Dimension{
Name: aws.String("metric_type"),
Value: aws.String("counter"),
},
)
}

func getAPIDimensionsHistogram(appName string, api *context.API) []*cloudwatch.Dimension {
return append(getAPIDimensions(appName, api), &cloudwatch.Dimension{
Name: aws.String("metric_type"),
Value: aws.String("histogram"),
})
return append(
getAPIDimensions(appName, api),
&cloudwatch.Dimension{
Name: aws.String("metric_type"),
Value: aws.String("histogram"),
},
)
}

func getRegressionMetricDef(appName string, api *context.API, period int64) []*cloudwatch.MetricDataQuery {
Expand Down