Skip to content
This repository was archived by the owner on Sep 19, 2022. It is now read-only.

Commit 8045f0a

Browse files
committed
add total suffix in counter metrics
Signed-off-by: yeya24 <[email protected]>
1 parent ca2c9c4 commit 8045f0a

File tree

4 files changed

+14
-12
lines changed

4 files changed

+14
-12
lines changed

docs/monitoring/README.md

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,32 +56,34 @@ pytorch_operator_is_leader
5656

5757
### Report PyTorch Job metrics:
5858

59+
*Note*: If you are using the v1 release of pytorch-operator, these PyTorch metrics do not have the `_total` suffix, so you have to use metric names such as `pytorch_operator_jobs_created` to query them. See this [PR](https://github.com/kubeflow/tf-operator/pull/1055) for more information.
60+
5961
**Job Creation**
6062
```
61-
pytorch_operator_jobs_created
63+
pytorch_operator_jobs_created_total
6264
```
6365

6466
**Job Creation Rate**
6567
```
66-
sum (rate (pytorch_operator_jobs_created[60m]))
68+
sum (rate (pytorch_operator_jobs_created_total[60m]))
6769
```
6870

6971
**Job Deletion**
7072
```
71-
pytorch_operator_jobs_deleted
73+
pytorch_operator_jobs_deleted_total
7274
```
7375

7476
**Successful Job Completions**
7577
```
76-
pytorch_operator_jobs_successful
78+
pytorch_operator_jobs_successful_total
7779
```
7880

7981
**Failed Jobs**
8082
```
81-
pytorch_operator_jobs_failed
83+
pytorch_operator_jobs_failed_total
8284
```
8385

8486
**Restarted Jobs**
8587
```
86-
pytorch_operator_jobs_restarted
88+
pytorch_operator_jobs_restarted_total
8789
```

pkg/controller.v1/pytorch/controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ var (
7272
}
7373

7474
pytorchJobsDeletedCount = promauto.NewCounter(prometheus.CounterOpts{
75-
Name: "pytorch_operator_jobs_deleted",
75+
Name: "pytorch_operator_jobs_deleted_total",
7676
Help: "Counts number of PyTorch jobs deleted",
7777
})
7878
)

pkg/controller.v1/pytorch/job.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ const (
2525

2626
var (
2727
pytorchJobsCreatedCount = promauto.NewCounter(prometheus.CounterOpts{
28-
Name: "pytorch_operator_jobs_created",
28+
Name: "pytorch_operator_jobs_created_total",
2929
Help: "Counts number of PyTorch jobs created",
3030
})
3131
)
@@ -49,7 +49,7 @@ func (pc *PyTorchController) addPyTorchJob(obj interface{}) {
4949

5050
status := common.JobStatus{
5151
Conditions: []common.JobCondition{
52-
common.JobCondition{
52+
{
5353
Type: common.JobFailed,
5454
Status: v1.ConditionTrue,
5555
LastUpdateTime: metav1.Now(),

pkg/controller.v1/pytorch/status.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,15 +46,15 @@ const (
4646

4747
var (
4848
pytorchJobsSuccessCount = promauto.NewCounter(prometheus.CounterOpts{
49-
Name: "pytorch_operator_jobs_successful",
49+
Name: "pytorch_operator_jobs_successful_total",
5050
Help: "Counts number of PyTorch jobs successful",
5151
})
5252
pytorchJobsFailureCount = promauto.NewCounter(prometheus.CounterOpts{
53-
Name: "pytorch_operator_jobs_failed",
53+
Name: "pytorch_operator_jobs_failed_total",
5454
Help: "Counts number of PyTorch jobs failed",
5555
})
5656
pytorchJobsRestartCount = promauto.NewCounter(prometheus.CounterOpts{
57-
Name: "pytorch_operator_jobs_restarted",
57+
Name: "pytorch_operator_jobs_restarted_total",
5858
Help: "Counts number of PyTorch jobs restarted",
5959
})
6060
)

0 commit comments

Comments
 (0)