Apache Airflow
If you are using the official Apache Airflow Helm chart, make sure the StatsD exporter is enabled with the proper service annotations. If it is not, you can enable it by adding the following snippet to the values.yaml file of the Airflow deployment:
YAML
# Enable the StatsD exporter and annotate its Service so that Prometheus
# scrapes it on the port given below.
statsd:
  enabled: true
  service:
    extraAnnotations:
      prometheus.io/port: "9102"
      prometheus.io/scrape: "true"
You will also need to add the following extraMappings snippet to the statsd section of the values.yaml file of the Airflow deployment:
YAML
# StatsD-to-Prometheus mapping rules. Each entry rewrites a StatsD metric
# name into an af_agg_* metric and turns captured name components into
# labels ($1, $2, ...).
# Entries with `match_type: regex` use regex capture groups; entries without
# it rely on the exporter's default glob matching, where each `*` captures
# one dot-separated component.
statsd:
  extraMappings:
    # === Counters ===
    - match: "(.+)\\.(.+)_start$"
      match_metric_type: counter
      name: "af_agg_job_start"
      match_type: regex
      labels:
        airflow_id: "$1"
        job_name: "$2"
    - match: "(.+)\\.(.+)_end$"
      match_metric_type: counter
      name: "af_agg_job_end"
      match_type: regex
      labels:
        airflow_id: "$1"
        job_name: "$2"
    - match: "(.+)\\.operator_failures_(.+)$"
      match_metric_type: counter
      name: "af_agg_operator_failures"
      match_type: regex
      labels:
        airflow_id: "$1"
        operator_name: "$2"
    - match: "(.+)\\.operator_successes_(.+)$"
      match_metric_type: counter
      name: "af_agg_operator_successes"
      match_type: regex
      labels:
        airflow_id: "$1"
        operator_name: "$2"
    - match: "*.ti_failures"
      match_metric_type: counter
      name: "af_agg_ti_failures"
      labels:
        airflow_id: "$1"
    - match: "*.ti_successes"
      match_metric_type: counter
      name: "af_agg_ti_successes"
      labels:
        airflow_id: "$1"
    - match: "*.zombies_killed"
      match_metric_type: counter
      name: "af_agg_zombies_killed"
      labels:
        airflow_id: "$1"
    - match: "*.scheduler_heartbeat"
      match_metric_type: counter
      name: "af_agg_scheduler_heartbeat"
      labels:
        airflow_id: "$1"
    - match: "*.dag_processing.processes"
      match_metric_type: counter
      name: "af_agg_dag_processing_processes"
      labels:
        airflow_id: "$1"
    - match: "*.scheduler.tasks.killed_externally"
      match_metric_type: counter
      name: "af_agg_scheduler_tasks_killed_externally"
      labels:
        airflow_id: "$1"
    - match: "*.scheduler.tasks.running"
      match_metric_type: counter
      name: "af_agg_scheduler_tasks_running"
      labels:
        airflow_id: "$1"
    - match: "*.scheduler.tasks.starving"
      match_metric_type: counter
      name: "af_agg_scheduler_tasks_starving"
      labels:
        airflow_id: "$1"
    - match: "*.scheduler.orphaned_tasks.cleared"
      match_metric_type: counter
      name: "af_agg_scheduler_orphaned_tasks_cleared"
      labels:
        airflow_id: "$1"
    - match: "*.scheduler.orphaned_tasks.adopted"
      match_metric_type: counter
      name: "af_agg_scheduler_orphaned_tasks_adopted"
      labels:
        airflow_id: "$1"
    - match: "*.scheduler.critical_section_busy"
      match_metric_type: counter
      name: "af_agg_scheduler_critical_section_busy"
      labels:
        airflow_id: "$1"
    - match: "*.sla_email_notification_failure"
      match_metric_type: counter
      name: "af_agg_sla_email_notification_failure"
      labels:
        airflow_id: "$1"
    - match: "*.ti.start.*.*"
      match_metric_type: counter
      name: "af_agg_ti_start"
      labels:
        airflow_id: "$1"
        dag_id: "$2"
        task_id: "$3"
    - match: "*.ti.finish.*.*.*"
      match_metric_type: counter
      name: "af_agg_ti_finish"
      labels:
        airflow_id: "$1"
        dag_id: "$2"
        task_id: "$3"
        state: "$4"
    - match: "*.dag.callback_exceptions"
      match_metric_type: counter
      name: "af_agg_dag_callback_exceptions"
      labels:
        airflow_id: "$1"
    - match: "*.celery.task_timeout_error"
      match_metric_type: counter
      name: "af_agg_celery_task_timeout_error"
      labels:
        airflow_id: "$1"

    # === Gauges ===
    - match: "*.dagbag_size"
      match_metric_type: gauge
      name: "af_agg_dagbag_size"
      labels:
        airflow_id: "$1"
    - match: "*.dag_processing.import_errors"
      match_metric_type: gauge
      name: "af_agg_dag_processing_import_errors"
      labels:
        airflow_id: "$1"
    - match: "*.dag_processing.total_parse_time"
      match_metric_type: gauge
      name: "af_agg_dag_processing_total_parse_time"
      labels:
        airflow_id: "$1"
    - match: "*.dag_processing.last_runtime.*"
      match_metric_type: gauge
      name: "af_agg_dag_processing_last_runtime"
      labels:
        airflow_id: "$1"
        dag_file: "$2"
    - match: "*.dag_processing.last_run.seconds_ago.*"
      match_metric_type: gauge
      name: "af_agg_dag_processing_last_run_seconds"
      labels:
        airflow_id: "$1"
        dag_file: "$2"
    - match: "*.dag_processing.processor_timeouts"
      match_metric_type: gauge
      name: "af_agg_dag_processing_processor_timeouts"
      labels:
        airflow_id: "$1"
    - match: "*.executor.open_slots"
      match_metric_type: gauge
      name: "af_agg_executor_open_slots"
      labels:
        airflow_id: "$1"
    - match: "*.executor.queued_tasks"
      match_metric_type: gauge
      name: "af_agg_executor_queued_tasks"
      labels:
        airflow_id: "$1"
    - match: "*.executor.running_tasks"
      match_metric_type: gauge
      name: "af_agg_executor_running_tasks"
      labels:
        airflow_id: "$1"
    - match: "*.pool.open_slots.*"
      match_metric_type: gauge
      name: "af_agg_pool_open_slots"
      labels:
        airflow_id: "$1"
        pool_name: "$2"
    - match: "*.pool.queued_slots.*"
      match_metric_type: gauge
      name: "af_agg_pool_queued_slots"
      labels:
        airflow_id: "$1"
        pool_name: "$2"
    - match: "*.pool.running_slots.*"
      match_metric_type: gauge
      name: "af_agg_pool_running_slots"
      labels:
        airflow_id: "$1"
        pool_name: "$2"
    - match: "*.pool.starving_tasks.*"
      match_metric_type: gauge
      name: "af_agg_pool_starving_tasks"
      labels:
        airflow_id: "$1"
        pool_name: "$2"
    - match: "*.smart_sensor_operator.poked_tasks"
      match_metric_type: gauge
      name: "af_agg_smart_sensor_operator_poked_tasks"
      labels:
        airflow_id: "$1"
    - match: "*.smart_sensor_operator.poked_success"
      match_metric_type: gauge
      name: "af_agg_smart_sensor_operator_poked_success"
      labels:
        airflow_id: "$1"
    - match: "*.smart_sensor_operator.poked_exception"
      match_metric_type: gauge
      name: "af_agg_smart_sensor_operator_poked_exception"
      labels:
        airflow_id: "$1"
    - match: "*.smart_sensor_operator.exception_failures"
      match_metric_type: gauge
      name: "af_agg_smart_sensor_operator_exception_failures"
      labels:
        airflow_id: "$1"
    - match: "*.smart_sensor_operator.infra_failures"
      match_metric_type: gauge
      name: "af_agg_smart_sensor_operator_infra_failures"
      labels:
        airflow_id: "$1"

    # === Timers ===
    - match: "*.dagrun.dependency-check.*"
      match_metric_type: observer
      name: "af_agg_dagrun_dependency_check"
      labels:
        airflow_id: "$1"
        dag_id: "$2"
    - match: "*.dag.*.*.duration"
      match_metric_type: observer
      name: "af_agg_dag_task_duration"
      labels:
        airflow_id: "$1"
        dag_id: "$2"
        task_id: "$3"
    - match: "*.dag_processing.last_duration.*"
      match_metric_type: observer
      name: "af_agg_dag_processing_duration"
      labels:
        airflow_id: "$1"
        dag_file: "$2"
    - match: "*.dagrun.duration.success.*"
      match_metric_type: observer
      name: "af_agg_dagrun_duration_success"
      labels:
        airflow_id: "$1"
        dag_id: "$2"
    - match: "*.dagrun.duration.failed.*"
      match_metric_type: observer
      name: "af_agg_dagrun_duration_failed"
      labels:
        airflow_id: "$1"
        dag_id: "$2"
    - match: "*.dagrun.schedule_delay.*"
      match_metric_type: observer
      name: "af_agg_dagrun_schedule_delay"
      labels:
        airflow_id: "$1"
        dag_id: "$2"
    - match: "*.scheduler.critical_section_duration"
      match_metric_type: observer
      name: "af_agg_scheduler_critical_section_duration"
      labels:
        airflow_id: "$1"
    - match: "*.dagrun.*.first_task_scheduling_delay"
      match_metric_type: observer
      name: "af_agg_dagrun_first_task_scheduling_delay"
      labels:
        airflow_id: "$1"
        dag_id: "$2"
The Airflow Cluster Dashboard can be added to your Grafana instance for visualization.
If you are using the community Helm chart, you can enable metrics by setting the following in the values.yaml file of the Airflow deployment:
YAML
# Install the airflow-exporter pip package into the Airflow deployment.
airflow:
  extraPipPackages: ["airflow-exporter"]
and
YAML
# Annotate the web Service so that Prometheus scrapes the metrics path on
# the port given below.
web:
  service:
    annotations:
      prometheus.io/path: /admin/metrics
      prometheus.io/port: "8080"
      prometheus.io/scrape: "true"