Skip to content

Commit 312401f

Browse files
authored
Release 0.3.0 (#74)
1 parent d4d6946 commit 312401f

File tree

7 files changed

+42
-44
lines changed

7 files changed

+42
-44
lines changed

.github/workflows/ci-full.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ jobs:
9999
- name: Run Perf Analyzer Job
100100
run: |
101101
kubectl apply -f ci/perf-analyzer-job.yaml
102-
kubectl wait --for=condition=complete job/perf-analyzer-job -n cms --timeout=240s || \
102+
kubectl wait --for=condition=complete job/perf-analyzer-job -n cms --timeout=300s || \
103103
(echo "Perf-analyzer job did not complete in time or failed." && exit 1)
104104
105105
POD_NAME=$(kubectl get pods -n cms -l job-name=perf-analyzer-job -o jsonpath="{.items[0].metadata.name}")

docs/advanced-monitoring.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ shows how tracing is configured for CMS SuperSONIC instance:
6363
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \
6464
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \
6565
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \
66-
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \
6766
--trace-config mode=opentelemetry \
6867
--trace-config=opentelemetry,resource=pod_name=$(hostname) \
6968
--trace-config opentelemetry,url=supersonic-opentelemetry-collector:4318/v1/traces \

docs/configuration-guide.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ Triton version must be specified in the ``triton.image`` parameter in the values
3333
- |
3434
/opt/tritonserver/bin/tritonserver \
3535
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \
36-
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \
3736
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \
3837
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \
3938
--allow-gpu-metrics=true \
@@ -90,7 +89,7 @@ Triton version must be specified in the ``triton.image`` parameter in the values
9089
<br><br>
9190

9291

93-
3. Select Resources for Triton Pods
92+
3. Select Resources for Triton Pods
9493
=============================================
9594

9695
- You can configure CPU, memory, and GPU resources for Triton pods via the ``triton.resources`` parameter in the values file:

helm/supersonic/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ apiVersion: v2
22
name: supersonic
33
description: Server infrastructure for inference-as-a-service in large scientific experiments.
44
icon: https://github.com/fastmachinelearning/SuperSONIC/blob/main/docs/img/SuperSONIC_small_512.png?raw=true
5-
version: 0.2.1
5+
version: 0.3.0
66
type: application
77
home: https://fastmachinelearning.org/SuperSONIC/
88
annotations:

values/values-anvil-cms.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
11
triton:
2-
# image: fastml/triton-torchgeo:21.02-py3-geometric # run2
3-
image: fastml/triton-torchgeo:22.07-py3-geometric # run3
2+
image: nvcr.io/nvidia/tritonserver:24.11-py3
43
command: ["/bin/sh", "-c"]
54
args:
65
- |
76
/opt/tritonserver/bin/tritonserver \
87
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \
9-
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \
108
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \
119
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \
1210
--allow-gpu-metrics=true \

values/values-geddes-cms.yaml

Lines changed: 38 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,30 @@
11
serverLoadThreshold: 20
22
serverLoadMetric: 'sum by (release) (rate(nv_inference_queue_duration_us{release=~"sonic-server"}[30s]) / (rate(nv_inference_exec_count{release=~"sonic-server"}[30s]) * 1000 + 0.001))'
33

4-
triton:
5-
# image: fastml/triton-torchgeo:21.02-py3-geometric # run2
6-
image: fastml/triton-torchgeo:22.07-py3-geometric # run3
4+
triton:
5+
image: nvcr.io/nvidia/tritonserver:24.11-py3
76
command: ["/bin/sh", "-c"]
8-
args:
7+
args:
98
- |
109
/opt/tritonserver/bin/tritonserver \
1110
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \
12-
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \
1311
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \
1412
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \
13+
--trace-config mode=opentelemetry \
14+
--trace-config=opentelemetry,resource=pod_name=$(hostname) \
15+
--trace-config opentelemetry,url=sonic-server-opentelemetry-collector:4318/v1/traces \
16+
--trace-config rate=100 \
17+
--trace-config level=TIMESTAMPS \
18+
--trace-config count=-1 \
1519
--allow-gpu-metrics=true \
1620
--log-verbose=0 \
1721
--strict-model-config=false \
1822
--exit-timeout-secs=60
19-
# --trace-config mode=opentelemetry
20-
# --trace-config=opentelemetry,resource=pod_name=$(hostname)
21-
# --trace-config opentelemetry,url=sonic-server-opentelemetry-collector:4318/v1/traces
22-
# --trace-config rate=100 # 1 in 100 requests
23-
# --trace-config level=TIMESTAMPS
24-
# --trace-config count=-1
23+
2524
resources:
26-
limits: { nvidia.com/gpu: 1, cpu: 2, memory: 4G}
27-
requests: { nvidia.com/gpu: 1, cpu: 2, memory: 4G}
28-
nodeSelector: {'cms-af-prod': 'true'}
25+
limits: { nvidia.com/gpu: 1, cpu: 2, memory: 4G }
26+
requests: { nvidia.com/gpu: 1, cpu: 2, memory: 4G }
27+
nodeSelector: { "cms-af-prod": "true" }
2928
tolerations:
3029
- key: hub.jupyter.org/dedicated
3130
operator: Equal
@@ -43,7 +42,7 @@ triton:
4342

4443
envoy:
4544
enabled: true
46-
nodeSelector: {'cms-af-prod': 'true'}
45+
nodeSelector: { "cms-af-prod": "true" }
4746
tolerations:
4847
- key: hub.jupyter.org/dedicated
4948
operator: Equal
@@ -56,30 +55,33 @@ envoy:
5655
enabled: true
5756
hostName: sonic-cms.geddes.rcac.purdue.edu
5857
ingressClassName: public
58+
rate_limiter:
59+
prometheus_based:
60+
enabled: true
61+
tracing_sampling_rate: 0.01
5962

6063
keda:
6164
enabled: true
6265
minReplicaCount: 1
63-
maxReplicaCount: 7
66+
maxReplicaCount: 11
67+
scaleUp:
68+
stabilizationWindowSeconds: 30
69+
periodSeconds: 15
70+
stepsize: 1
71+
scaleDown:
72+
stabilizationWindowSeconds: 45
73+
periodSeconds: 45
74+
stepsize: 1
6475

6576
ingress:
6677
enabled: false
6778

6879
prometheus:
69-
enabled: true
70-
server:
71-
useExistingClusterRoleName: sonic-server-prometheus-role
72-
ingress:
73-
enabled: true
74-
hosts:
75-
- prometheus-cms.geddes.rcac.purdue.edu
76-
tls:
77-
- hosts:
78-
- prometheus-cms.geddes.rcac.purdue.edu
79-
ingressClassName: public
80-
serviceAccounts:
81-
server:
82-
name: sonic-server-prometheus-sa
80+
external:
81+
enabled: true
82+
url: prometheus-af.geddes.rcac.purdue.edu
83+
port: 443
84+
scheme: https
8385

8486
grafana:
8587
enabled: true
@@ -92,7 +94,7 @@ grafana:
9294
type: prometheus
9395
access: proxy
9496
isDefault: true
95-
url: http://sonic-server-prometheus-server:9090
97+
url: https://prometheus-af.geddes.rcac.purdue.edu
9698
jsonData:
9799
timeInterval: "5s"
98100
tlsSkipVerify: true
@@ -106,7 +108,7 @@ grafana:
106108
timeInterval: "5s"
107109
tlsSkipVerify: true
108110
serviceMap:
109-
datasourceUid: 'prometheus'
111+
datasourceUid: "prometheus"
110112
nodeGraph:
111113
enabled: true
112114
ingress:
@@ -127,13 +129,14 @@ opentelemetry-collector:
127129
exporters:
128130
otlp:
129131
endpoint: http://sonic-server-tempo:4317
130-
otlphttp:
132+
otlphttp:
131133
endpoint: http://sonic-server-tempo:4318
132134
prometheusremotewrite:
133-
endpoint: http://sonic-server-prometheus-server:9090/api/v1/write
135+
endpoint: http://prometheus-server:9090/api/v1/write
136+
134137
tempo:
135138
enabled: true
136139
tempo:
137140
metricsGenerator:
138141
enabled: true
139-
remoteWriteUrl: http://sonic-server-prometheus-server:9090/api/v1/write
142+
remoteWriteUrl: http://prometheus-server:9090/api/v1/write

values/values-nautilus-cms.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ triton:
1818
--strict-model-config=false \
1919
--exit-timeout-secs=60 \
2020
--backend-config=onnxruntime,enable-global-threadpool=1
21-
# --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \
2221
2322
resources:
2423
limits: { cpu: 1, memory: 3G, nvidia.com/gpu: 1}

0 commit comments

Comments
 (0)