From 018bb5bbcb01a1757ceec0499bf79b87d538a3db Mon Sep 17 00:00:00 2001 From: zhoujunhao <18853925545@163.com> Date: Wed, 11 Jul 2018 14:51:51 +0800 Subject: [PATCH] [ADD] add entrance health check --- builder/discover/discover.go | 14 +----- builder/exector/exector.go | 1 + builder/exector/monitor.go | 8 ++-- entrance/api/controller/controller.go | 1 + entrance/api/controller/health.go | 48 ++++++++++++++++++++ entrance/core/monitor/prometheus_exporter.go | 8 ++++ worker/discover/manager.go | 14 +----- worker/monitor/collector/collector.go | 12 ++--- 8 files changed, 70 insertions(+), 36 deletions(-) create mode 100644 entrance/api/controller/health.go diff --git a/builder/discover/discover.go b/builder/discover/discover.go index c7d779318..520ee39f0 100644 --- a/builder/discover/discover.go +++ b/builder/discover/discover.go @@ -51,7 +51,7 @@ type TaskManager struct { func NewTaskManager(c option.Config, exec exector.Manager) *TaskManager { ctx, cancel := context.WithCancel(context.Background()) healthStatus["status"] = "health" - healthStatus["info"] = "service health" + healthStatus["info"] = "builder service health" return &TaskManager{ ctx: ctx, cancel: cancel, @@ -78,8 +78,6 @@ func (t *TaskManager) Start() error { //Do do func (t *TaskManager) Do() { hostName, _ := os.Hostname() - timeoutNum := 0 - errorNum := 0 for { select { case <-t.ctx.Done(): @@ -101,20 +99,10 @@ func (t *TaskManager) Do() { } if grpc1.ErrorDesc(err) == "context timeout" { logrus.Warn(err.Error()) - timeoutNum += 1 - if timeoutNum > 10 { - healthStatus["status"] = "unusual" - healthStatus["info"] = "context timeout more than ten times" - } continue } logrus.Error(err.Error()) time.Sleep(time.Second * 2) - errorNum += 1 - if errorNum > 10 { - healthStatus["status"] = "unusual" - healthStatus["info"] = err.Error() - } continue } logrus.Debugf("Receive a task: %s", data.String()) diff --git a/builder/exector/exector.go b/builder/exector/exector.go index 4cb939be1..086f16357 100644 --- 
a/builder/exector/exector.go +++ b/builder/exector/exector.go @@ -185,6 +185,7 @@ func (e *exectorManager) buildFromImage(in []byte) { } else { i.Logger.Error("从镜像构建应用任务执行失败", map[string]string{"step": "callback", "status": "failure"}) status = "failure" + } } else { break diff --git a/builder/exector/monitor.go b/builder/exector/monitor.go index ac7206c53..4d8a5a1dd 100644 --- a/builder/exector/monitor.go +++ b/builder/exector/monitor.go @@ -2,7 +2,7 @@ package exector import ( "github.com/prometheus/client_golang/prometheus" - "github.com/goodrain/rainbond/worker/discover" + "github.com/goodrain/rainbond/builder/discover" ) // Metric name parts. @@ -57,10 +57,10 @@ func (e *Exporter) scrape(ch chan<- prometheus.Metric) { healthInfo := discover.HealthCheck() healthStatus := healthInfo["status"] var val float64 - if healthStatus == "health"{ - val = 0 - }else { + if healthStatus == "health" { val = 1 + } else { + val = 0 } ch <- prometheus.MustNewConstMetric(e.healthStatus.Desc(), prometheus.GaugeValue, val) diff --git a/entrance/api/controller/controller.go b/entrance/api/controller/controller.go index 4a3bd62a9..d04114052 100644 --- a/entrance/api/controller/controller.go +++ b/entrance/api/controller/controller.go @@ -34,6 +34,7 @@ func Register(container *restful.Container, coreManager core.Manager, readStore DomainSource{coreManager, readStore, apiStoreManager, 10000}.Register(container) NodeSource{coreManager, readStore, apiStoreManager, clientSet}.Register(container) PodSource{apiStoreManager}.Register(container) + HealthStatus{apiStoreManager}.Register(container) } //ResponseType 返回内容 diff --git a/entrance/api/controller/health.go b/entrance/api/controller/health.go new file mode 100644 index 000000000..c8db09b9d --- /dev/null +++ b/entrance/api/controller/health.go @@ -0,0 +1,48 @@ +// Copyright (C) 2014-2018 Goodrain Co., Ltd. 
+// RAINBOND, Application Management Platform + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. For any non-GPL usage of Rainbond, +// one or multiple Commercial Licenses authorized by Goodrain Co., Ltd. +// must be obtained first. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +package controller + +import ( + apistore "github.com/goodrain/rainbond/entrance/api/store" + + "github.com/emicklei/go-restful" +) + +//HealthStatus registers the GET /health route used for the entrance health check +type HealthStatus struct { + apiStoreManager *apistore.Manager +} + +//Register 注册 +func (h HealthStatus) Register(container *restful.Container) { + ws := new(restful.WebService) + ws.Path("/health"). + Doc("Get pod some info"). + Param(ws.PathParameter("pod_name", "pod name").DataType("string")). + Consumes(restful.MIME_XML, restful.MIME_JSON). 
+ Produces(restful.MIME_JSON, restful.MIME_XML) // you can specify this per route as well + + ws.Route(ws.GET("/").To(h.healthCheck)) // on the response + container.Add(ws) +} + +func (h *HealthStatus) healthCheck(request *restful.Request, response *restful.Response) { + + NewSuccessResponse(map[string]string{"status": "health", "info": ""}, nil, response) +} diff --git a/entrance/core/monitor/prometheus_exporter.go b/entrance/core/monitor/prometheus_exporter.go index 22a615890..c85b6bcef 100644 --- a/entrance/core/monitor/prometheus_exporter.go +++ b/entrance/core/monitor/prometheus_exporter.go @@ -49,6 +49,7 @@ type Exporter struct { scrapeErrors *prometheus.CounterVec lbPluginUp prometheus.Gauge coreManager core.Manager + healthStatus prometheus.Gauge } //NewExporter new a exporter @@ -77,6 +78,12 @@ func NewExporter(coreManager core.Manager) *Exporter { Name: "up", Help: "Whether the default lb plugin is up.", }), + healthStatus:prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: exporter, + Name: "entrance_health_status", + Help: "entrance component health status.", + }), coreManager: coreManager, } } @@ -113,4 +120,5 @@ func (e *Exporter) scrape(ch chan<- prometheus.Metric) { logrus.Error("core manager scrape for prometheus error.", err.Error()) e.error.Set(1) } + ch <- prometheus.MustNewConstMetric(e.healthStatus.Desc(), prometheus.GaugeValue, 1) } diff --git a/worker/discover/manager.go b/worker/discover/manager.go index 40ed52fd3..c54eba56a 100644 --- a/worker/discover/manager.go +++ b/worker/discover/manager.go @@ -56,7 +56,7 @@ func NewTaskManager(c option.Config, executor executor.Manager, statusManager *s ctx, cancel := context.WithCancel(context.Background()) handleManager := handle.NewManager(ctx, c, executor, statusManager) healthStatus["status"] = "health" - healthStatus["info"] = "service health" + healthStatus["info"] = "worker service health" return &TaskManager{ ctx: ctx, cancel: cancel, @@ -84,8 +84,6 @@ func (t 
*TaskManager) Start() error { func (t *TaskManager) Do() { logrus.Info("start receive task from mq") hostname, _ := os.Hostname() - timeoutNum := 0 - errorNum := 0 for { select { case <-t.ctx.Done(): @@ -105,20 +103,10 @@ func (t *TaskManager) Do() { return } if grpc1.ErrorDesc(err) == "context timeout" { - timeoutNum += 1 - if timeoutNum > 10 { - healthStatus["status"] = "unusual" - healthStatus["info"] = "context timeout more than ten times" - } continue } logrus.Error("receive task error.", err.Error()) time.Sleep(time.Second * 2) - errorNum += 1 - if errorNum > 10 { - healthStatus["status"] = "unusual" - healthStatus["info"] = err.Error() - } continue } logrus.Debugf("receive a task: %v", data) diff --git a/worker/monitor/collector/collector.go b/worker/monitor/collector/collector.go index d33b7a74d..d3a5d18e6 100644 --- a/worker/monitor/collector/collector.go +++ b/worker/monitor/collector/collector.go @@ -123,12 +123,12 @@ func (e *Exporter) scrape(ch chan<- prometheus.Metric) { ch <- prometheus.MustNewConstMetric(scrapeDurationDesc, prometheus.GaugeValue, time.Since(scrapeTime).Seconds(), "collect.fs") healthInfo := discover.HealthCheck() - healthStatus :=healthInfo["status"] + healthStatus := healthInfo["status"] var val float64 - if healthStatus == "health"{ - val = 0 - }else { + if healthStatus == "health" { val = 1 + } else { + val = 0 } ch <- prometheus.MustNewConstMetric(e.healthStatus.Desc(), prometheus.GaugeValue, val) } @@ -171,8 +171,8 @@ func New(statusManager *status.AppRuntimeSyncClient) *Exporter { Name: "appfs", Help: "tenant service fs used.", }, []string{"tenant_id", "service_id", "volume_type"}), - healthStatus:prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: "worker", + healthStatus: prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, Subsystem: "exporter", Name: "worker_health_status", Help: "worker component health status.",