From 0aba68f335a9c582424a8e605b94e29deb6649de Mon Sep 17 00:00:00 2001 From: Eric Gao Date: Wed, 22 Jun 2022 23:05:54 +0800 Subject: [PATCH] [Improvement][Metrics] Update some metrics names in grafana-demo dashboards (#10432) (#10552) --- docs/docs/en/guide/metrics/metrics.md | 13 ++++--- docs/docs/zh/guide/metrics/metrics.md | 7 ++-- .../grafana/DolphinSchedulerMaster.json | 36 +++++++++---------- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/docs/docs/en/guide/metrics/metrics.md b/docs/docs/en/guide/metrics/metrics.md index d4c6d2a239..2ba4f639d6 100644 --- a/docs/docs/en/guide/metrics/metrics.md +++ b/docs/docs/en/guide/metrics/metrics.md @@ -5,10 +5,10 @@ Currently, we only support `Prometheus Exporter` but more are coming soon. ## Quick Start -- We enable Apache DolphinScheduler export metrics in `standalone` mode to help users get hands dirty easily. +- We enable Apache DolphinScheduler to export metrics in `standalone` mode to help users get hands dirty easily. - After triggering tasks in `standalone` mode, you could access metrics list by visiting url `http://localhost:12345/dolphinscheduler/actuator/metrics`. - After triggering tasks in `standalone` mode, you could access `prometheus-format` metrics by visiting url `http://localhost:12345/dolphinscheduler/actuator/prometheus`. -- For a better experience with `Prometheus` and `Grafana`, we have prepared the out-of-the-box `Grafana` configuration for you, you could find the `Grafana` dashboard +- For a better experience with `Prometheus` and `Grafana`, we have prepared the out-of-the-box `Grafana` configurations for you, you could find the `Grafana` dashboards at `dolphinscheduler-meter/resources/grafana` and directly import these dashboards to your `Grafana` instance. - If you want to try with `docker`, you can use the following command to start the out-of-the-box `Prometheus` and `Grafana`: @@ -42,8 +42,8 @@ For example, you can get the master metrics by `curl http://localhost:5679/actua ## Naming Convention & Mapping -- Apache DolphinScheduler metrics naming follows the officially-recommended approach by [Micrometer](https://github.com/micrometer-metrics/micrometer-docs/blob/main/src/docs/concepts/naming.adoc#:~:text=Micrometer%20employs%20a%20naming%20convention,between%20one%20system%20and%20another.) -- `Micrometer` automatically maps the metrics name to suit the external metrics system you configured. Currently, we only support `Prometheus Exporter` but more are coming soon. +- Naming of Apache DolphinScheduler metrics follows the officially-recommended approach by [Micrometer](https://github.com/micrometer-metrics/micrometer-docs/blob/main/src/docs/concepts/naming.adoc) +- `Micrometer` automatically maps the metrics names to suit the external metrics system you configured. Currently, we only support `Prometheus Exporter` but more are coming soon. ### Prometheus @@ -104,11 +104,14 @@ For example, you can get the master metrics by `curl http://localhost:5679/actua - ds.worker.overload.count: (counter) the number of times the worker overloaded - ds.worker.full.submit.queue.count: (counter) the number of times the worker's submit queue being full - ### Api Server Metrics +- Currently, we have not embedded any metrics in Api Server. + ### Alert Server Related +- Currently, we have not embedded any metrics in Alert Server. + In each server, there are some default system-level metrics related to `database connection`, `JVM`, etc. We list them below for your reference: ### Database Related Metrics (Default) diff --git a/docs/docs/zh/guide/metrics/metrics.md b/docs/docs/zh/guide/metrics/metrics.md index 7fd266d978..9b3805c613 100644 --- a/docs/docs/zh/guide/metrics/metrics.md +++ b/docs/docs/zh/guide/metrics/metrics.md @@ -42,7 +42,7 @@ metrics exporter端口`server.port`是在application.yaml里定义的: master: ` ## 命名规则 & 命名映射 -- Apache DolphinScheduler指标命名遵循[Micrometer](https://github.com/micrometer-metrics/micrometer-docs/blob/main/src/docs/concepts/naming.adoc#:~:text=Micrometer%20employs%20a%20naming%20convention,between%20one%20system%20and%20another) +- Apache DolphinScheduler指标命名遵循[Micrometer](https://github.com/micrometer-metrics/micrometer-docs/blob/main/src/docs/concepts/naming.adoc) 官方推荐的命名方式。 - `Micrometer` 会根据您配置的外部指标系统自动将指标名称转化成适合您指标系统的格式。目前,我们只支持`Prometheus Exporter`,但是多样化的指标格式将会持续贡献给用户。 @@ -105,11 +105,14 @@ metrics exporter端口`server.port`是在application.yaml里定义的: master: ` - ds.worker.overload.count: (counter) worker过载次数 - ds.worker.full.submit.queue.count: (counter) worker提交队列全满次数 - ### Api Server指标 +- 目前我们尚未提供任何Api Server指标 + ### Alert Server指标 +- 目前我们尚未提供任何Alert Server指标 + 在每个server中都有一些系统层面(如数据库链接、JVM)的默认指标,为了您的检阅方便,我们也将它们列在了这里: ### 数据库相关指标(默认) diff --git a/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerMaster.json b/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerMaster.json index 4d7a1ae524..5461759c8c 100644 --- a/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerMaster.json +++ b/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerMaster.json @@ -137,7 +137,7 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "increase(dolphinscheduler_master_overload_count_total[1m])", + "expr": "increase(ds_master_overload_count_total[1m])", "legendFormat": "", "range": true, "refId": "A" @@ -226,7 +226,7 @@ "uid": "PBFA97CFB590B2093" }, "editorMode": "code", - "expr": "increase(dolphinscheduler_master_consume_command_count_total{}[1m])", + "expr": "increase(ds_master_consume_command_count_total{}[1m])", "legendFormat": "master_consume_command", "range": true, "refId": "A" @@ -505,7 +505,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(quartz_job_executed_total)", + "expr": "sum(ds_master_quartz_job_executed_total)", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -572,7 +572,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(quartz_job_executed_total{result=\"success\"}) / sum(quartz_job_executed_total) * 100", + "expr": "sum(ds_master_quartz_job_executed_total{result=\"success\"}) / sum(ds_master_quartz_job_executed_total) * 100", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -634,7 +634,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(quartz_job_executed_total{})", + "expr": "sum(ds_master_quartz_job_executed_total{})", "hide": false, "interval": "", "legendFormat": "Total", @@ -642,7 +642,7 @@ }, { "exemplar": true, - "expr": "quartz_job_executed_total{result=\"success\"}", + "expr": "ds_master_quartz_job_executed_total{result=\"success\"}", "format": "time_series", "hide": false, "interval": "", @@ -652,7 +652,7 @@ }, { "exemplar": true, - "expr": "quartz_job_executed_total{result=\"failure\"}", + "expr": "ds_master_quartz_job_executed_total{result=\"failure\"}", "format": "time_series", "hide": false, "interval": "", @@ -741,7 +741,7 @@ "targets": [ { "exemplar": true, - "expr": "rate(quartz_job_execution_seconds_sum[1m])/rate(quartz_job_execution_seconds_count[1m])", + "expr": "rate(ds_master_quartz_job_execution_time_seconds_sum[1m])/rate(ds_master_quartz_job_execution_time_seconds_count[1m])", "format": "time_series", "hide": false, "instant": false, @@ -845,7 +845,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(increase(quartz_job_executed_total[1m]))", + "expr": "sum(increase(ds_master_quartz_job_executed_total[1m]))", "hide": false, "interval": "", "legendFormat": "Total", @@ -853,7 +853,7 @@ }, { "exemplar": true, - "expr": "increase(quartz_job_executed_total{result=\"success\"}[1m])", + "expr": "increase(ds_master_quartz_job_executed_total{result=\"success\"}[1m])", "format": "time_series", "hide": false, "interval": "", @@ -863,7 +863,7 @@ }, { "exemplar": true, - "expr": "increase(quartz_job_executed_total{result=\"failure\"}[1m])", + "expr": "increase(ds_master_quartz_job_executed_total{result=\"failure\"}[1m])", "format": "time_series", "hide": false, "interval": "", @@ -937,7 +937,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(quartz_job_execution_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(ds_master_quartz_job_execution_time_seconds_bucket[5m])) by (le))", "interval": "", "legendFormat": "", "refId": "A" @@ -1052,7 +1052,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "expr": "increase(dolphinscheduler_process_instance_submit_count_total{}[1m])", + "expr": "increase(ds_workflow_instance_submit_count_total{}[1m])", "refId": "A" } ], @@ -1138,7 +1138,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "expr": "increase(dolphinscheduler_process_instance_finish_count_total{}[1m])", + "expr": "increase(ds_workflow_instance_finish_count_total{}[1m])", "refId": "A" } ], @@ -1224,7 +1224,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "expr": "increase(dolphinscheduler_process_instance_success_count_total{}[1m])", + "expr": "increase(ds_workflow_instance_success_count_total{}[1m])", "refId": "A" } ], @@ -1310,7 +1310,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "expr": "increase(dolphinscheduler_process_instance_stop_count_total{}[1m])", + "expr": "increase(ds_workflow_instance_stop_count_total{}[1m])", "refId": "A" } ], @@ -1396,7 +1396,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "expr": "increase(dolphinscheduler_process_instance_timeout_count_total{}[1m])", + "expr": "increase(ds_workflow_instance_timeout_count_total{}[1m])", "refId": "A" } ], @@ -1482,7 +1482,7 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "expr": "increase(dolphinscheduler_process_instance_failure_count_total{}[1m])", + "expr": "increase(ds_workflow_instance_failure_count_total{}[1m])", "refId": "A" } ],