7
7
8
8
# Must be set before prometheus_client is imported (hence the E402-suppressed
# import that follows). NOTE(review): presumably this turns off the extra
# `*_created` series the client otherwise exports - confirm against the
# prometheus_client documentation.
environ ["PROMETHEUS_DISABLE_CREATED_SERIES" ] = "True"
9
9
10
- from prometheus_client import CollectorRegistry , Gauge , Histogram , push_to_gateway # noqa: E402
11
- from summary import Summary # noqa: E402
10
+ from prometheus_client import CollectorRegistry , Counter , Gauge , Histogram , push_to_gateway # noqa: E402
12
11
13
# Values used for the "operation_type" metric label.
OP_TYPE_READ, OP_TYPE_WRITE = "read", "write"
# Values used for the "operation_status" label of the latency histogram.
# No surrounding whitespace: a padded value would not match an exact
# `operation_status="success"` selector.
OP_STATUS_SUCCESS, OP_STATUS_FAILURE = "success", "err"

# Sent as the "ref" grouping key on every push; overridable via $REF.
REF = environ.get("REF", "main")
# Sent as the "sdk" grouping key and embedded in the push job name;
# overridable via $SDK_SERVICE.
SDK_SERVICE_NAME = environ.get("SDK_SERVICE", "py-sync-table")
17
17
18
18
19
19
class Metrics :
20
20
def __init__(self, push_gateway):
    """Create a private registry with all SDK collectors, then reset them.

    Args:
        push_gateway: Passed unchanged as the first argument of
            ``push_to_gateway`` whenever the metrics are pushed.

    Every collector is registered in ``self._registry`` and stored in
    ``self._metrics`` under its keyword name. Metric and label names are
    written without surrounding whitespace, because Prometheus only accepts
    names made of letters, digits and underscores.
    """
    self._push_gtw = push_gateway
    self._registry = CollectorRegistry()
    self._metrics = dict(
        errors_total=Counter(
            "sdk_errors_total",
            "Total number of errors encountered, categorized by error type.",
            labelnames=("operation_type", "error_type"),
            registry=self._registry,
        ),
        operations_total=Counter(
            "sdk_operations_total",
            "Total number of operations, categorized by type attempted by the SDK.",
            labelnames=("operation_type",),
            registry=self._registry,
        ),
        operations_success_total=Counter(
            "sdk_operations_success_total",
            "Total number of successful operations, categorized by type.",
            labelnames=("operation_type",),
            registry=self._registry,
        ),
        operations_failure_total=Counter(
            "sdk_operations_failure_total",
            "Total number of failed operations, categorized by type.",
            labelnames=("operation_type",),
            registry=self._registry,
        ),
        operation_latency_seconds=Histogram(
            "sdk_operation_latency_seconds",
            "Latency of operations performed by the SDK in seconds, categorized by type and status.",
            labelnames=(
                "operation_type",
                "operation_status",
            ),
            registry=self._registry,
            buckets=(
                0.001,  # 1 ms
                0.002,  # 2 ms
                0.003,  # 3 ms
                0.004,  # 4 ms
                0.005,  # 5 ms
                0.0075,  # 7.5 ms
                0.010,  # 10 ms
                0.020,  # 20 ms
                0.050,  # 50 ms
                0.100,  # 100 ms
                0.200,  # 200 ms
                0.500,  # 500 ms
                1.000,  # 1 s
            ),
        ),
        retry_attempts_total=Counter(
            "sdk_retry_attempts_total",
            "Total number of retry attempts, categorized by operation type.",
            labelnames=("operation_type",),
            registry=self._registry,
        ),
        retries_success_total=Counter(
            "sdk_retries_success_total",
            "Total number of successful retries, categorized by operation type.",
            labelnames=("operation_type",),
            registry=self._registry,
        ),
        retries_failure_total=Counter(
            "sdk_retries_failure_total",
            "Total number of failed retries, categorized by operation type.",
            labelnames=("operation_type",),
            registry=self._registry,
        ),
        pending_operations=Gauge(
            "sdk_pending_operations",
            "Current number of pending operations, categorized by type.",
            labelnames=("operation_type",),
            registry=self._registry,
        ),
    )
    # Start from a clean state; reset() also performs an initial push.
    self.reset()
@@ -81,44 +117,44 @@ def start(self, labels):
81
117
if not isinstance (labels , Iterable ):
82
118
labels = (labels ,)
83
119
84
- self .inflight .labels (* labels ).inc ()
120
+ self .pending_operations .labels (* labels ).inc ()
85
121
return time .time ()
86
122
87
123
def stop(self, labels, start_time, attempts=1, error=None):
    """Record the outcome of one finished operation.

    Args:
        labels: Label value(s) shared by all per-operation metrics: either a
            single value or an iterable of values. A bare ``str``/``bytes``
            counts as ONE value rather than an iterable of characters.
        start_time: ``time.time()`` timestamp taken when the operation
            started; the observed latency is "now" minus this value, in
            seconds.
        attempts: Amount added to the retry counters for this operation.
        error: Falsy when the operation succeeded. Otherwise its type name
            becomes the ``error_type`` label and the operation is counted
            as failed.
    """
    duration = time.time() - start_time

    # Strings are Iterable too; splatting one would pass each character as a
    # separate label value. Treat them as scalars like any other single value.
    if isinstance(labels, (str, bytes)) or not isinstance(labels, Iterable):
        labels = (labels,)
    else:
        # Materialize once: the values are unpacked several times below, which
        # would exhaust a one-shot iterator after the first use.
        labels = tuple(labels)

    # NOTE(review): the collectors are reached as attributes of self; this
    # presumably resolves through self._metrics elsewhere in the class.
    self.operations_total.labels(*labels).inc()
    self.pending_operations.labels(*labels).dec()
    self.retry_attempts_total.labels(*labels).inc(attempts)

    if error:
        self.errors_total.labels(*labels, type(error).__name__).inc()
        self.retries_failure_total.labels(*labels).inc(attempts)
        self.operations_failure_total.labels(*labels).inc()
        self.operation_latency_seconds.labels(*labels, OP_STATUS_FAILURE).observe(duration)
        return

    self.retries_success_total.labels(*labels).inc(attempts)
    self.operations_success_total.labels(*labels).inc()
    self.operation_latency_seconds.labels(*labels, OP_STATUS_SUCCESS).observe(duration)
103
143
104
144
def push(self):
    """Push every collector in this instance's registry to the push gateway.

    The job name is ``workload-<SDK_SERVICE_NAME>``. The grouping key carries
    ``ref``, ``sdk`` and ``sdk_version`` (the version reported for the
    ``ydb`` package). Names are written without surrounding whitespace so
    they are valid label names and the job name has no trailing blank.
    """
    push_to_gateway(
        self._push_gtw,
        job=f"workload-{SDK_SERVICE_NAME}",
        registry=self._registry,
        grouping_key={
            "ref": REF,
            "sdk": SDK_SERVICE_NAME,
            "sdk_version": version("ydb"),
        },
    )
114
155
115
156
def reset(self):
    """Clear every registered collector, then push the emptied state."""
    collectors = self._metrics
    for key in collectors:
        # clear() is called on each collector stored in self._metrics.
        collectors[key].clear()

    self.push()
0 commit comments