1
1
serverLoadThreshold : 20
2
2
serverLoadMetric : ' sum by (release) (rate(nv_inference_queue_duration_us{release=~"sonic-server"}[30s]) / (rate(nv_inference_exec_count{release=~"sonic-server"}[30s]) * 1000 + 0.001))'
3
3
4
- triton :
5
- # image: fastml/triton-torchgeo:21.02-py3-geometric # run2
6
- image : fastml/triton-torchgeo:22.07-py3-geometric # run3
4
+ triton :
5
+ image : nvcr.io/nvidia/tritonserver:24.11-py3
7
6
command : ["/bin/sh", "-c"]
8
- args :
7
+ args :
9
8
- |
10
9
/opt/tritonserver/bin/tritonserver \
11
10
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoBTag/Combined/data/models/ \
12
- --model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoEgamma/EgammaPhotonProducers/data/models/ \
13
11
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoTauTag/TrainingFiles/data/DeepTauIdSONIC/ \
14
12
--model-repository=/cvmfs/cms.cern.ch/el9_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre7/external/el9_amd64_gcc12/data/RecoMET/METPUSubtraction/data/models/ \
13
+ --trace-config mode=opentelemetry \
14
+ --trace-config=opentelemetry,resource=pod_name=$(hostname) \
15
+ --trace-config opentelemetry,url=sonic-server-opentelemetry-collector:4318/v1/traces \
16
+ --trace-config rate=100 \
17
+ --trace-config level=TIMESTAMPS \
18
+ --trace-config count=-1 \
15
19
--allow-gpu-metrics=true \
16
20
--log-verbose=0 \
17
21
--strict-model-config=false \
18
22
--exit-timeout-secs=60
19
- # --trace-config mode=opentelemetry
20
- # --trace-config=opentelemetry,resource=pod_name=$(hostname)
21
- # --trace-config opentelemetry,url=sonic-server-opentelemetry-collector:4318/v1/traces
22
- # --trace-config rate=100 # 1 in 100 requests
23
- # --trace-config level=TIMESTAMPS
24
- # --trace-config count=-1
23
+
25
24
resources :
26
- limits : { nvidia.com/gpu: 1, cpu: 2, memory: 4G}
27
- requests : { nvidia.com/gpu: 1, cpu: 2, memory: 4G}
28
- nodeSelector : {' cms-af-prod': ' true' }
25
+ limits : { nvidia.com/gpu: 1, cpu: 2, memory: 4G }
26
+ requests : { nvidia.com/gpu: 1, cpu: 2, memory: 4G }
27
+ nodeSelector : { " cms-af-prod": " true" }
29
28
tolerations :
30
29
- key : hub.jupyter.org/dedicated
31
30
operator : Equal
@@ -43,7 +42,7 @@ triton:
43
42
44
43
envoy :
45
44
enabled : true
46
- nodeSelector : {' cms-af-prod': ' true' }
45
+ nodeSelector : { " cms-af-prod": " true" }
47
46
tolerations :
48
47
- key : hub.jupyter.org/dedicated
49
48
operator : Equal
@@ -56,30 +55,33 @@ envoy:
56
55
enabled : true
57
56
hostName : sonic-cms.geddes.rcac.purdue.edu
58
57
ingressClassName : public
58
+ rate_limiter :
59
+ prometheus_based :
60
+ enabled : true
61
+ tracing_sampling_rate : 0.01
59
62
60
63
keda :
61
64
enabled : true
62
65
minReplicaCount : 1
63
- maxReplicaCount : 7
66
+ maxReplicaCount : 11
67
+ scaleUp :
68
+ stabilizationWindowSeconds : 30
69
+ periodSeconds : 15
70
+ stepsize : 1
71
+ scaleDown :
72
+ stabilizationWindowSeconds : 45
73
+ periodSeconds : 45
74
+ stepsize : 1
64
75
65
76
ingress :
66
77
enabled : false
67
78
68
79
prometheus :
69
- enabled : true
70
- server :
71
- useExistingClusterRoleName : sonic-server-prometheus-role
72
- ingress :
73
- enabled : true
74
- hosts :
75
- - prometheus-cms.geddes.rcac.purdue.edu
76
- tls :
77
- - hosts :
78
- - prometheus-cms.geddes.rcac.purdue.edu
79
- ingressClassName : public
80
- serviceAccounts :
81
- server :
82
- name : sonic-server-prometheus-sa
80
+ external :
81
+ enabled : true
82
+ url : prometheus-af.geddes.rcac.purdue.edu
83
+ port : 443
84
+ scheme : https
83
85
84
86
grafana :
85
87
enabled : true
@@ -92,7 +94,7 @@ grafana:
92
94
type : prometheus
93
95
access : proxy
94
96
isDefault : true
95
- url : http ://sonic-server- prometheus-server:9090
97
+ url : https ://prometheus-af.geddes.rcac.purdue.edu
96
98
jsonData :
97
99
timeInterval : " 5s"
98
100
tlsSkipVerify : true
@@ -106,7 +108,7 @@ grafana:
106
108
timeInterval : " 5s"
107
109
tlsSkipVerify : true
108
110
serviceMap :
109
- datasourceUid : ' prometheus'
111
+ datasourceUid : " prometheus"
110
112
nodeGraph :
111
113
enabled : true
112
114
ingress :
@@ -127,13 +129,14 @@ opentelemetry-collector:
127
129
exporters :
128
130
otlp :
129
131
endpoint : http://sonic-server-tempo:4317
130
- otlphttp :
132
+ otlphttp :
131
133
endpoint : http://sonic-server-tempo:4318
132
134
prometheusremotewrite :
133
- endpoint : http://sonic-server-prometheus-server:9090/api/v1/write
135
+ endpoint : http://prometheus-server:9090/api/v1/write
136
+
134
137
tempo :
135
138
enabled : true
136
139
tempo :
137
140
metricsGenerator :
138
141
enabled : true
139
- remoteWriteUrl : http://sonic-server- prometheus-server:9090/api/v1/write
142
+ remoteWriteUrl : http://prometheus-server:9090/api/v1/write
0 commit comments