[REV] Optimize service restart policy when status is unhealthy.

This commit is contained in:
Zhang Jiajun 2018-07-27 15:22:39 +08:00
parent 31c25685f3
commit 4a37e55f37
3 changed files with 37 additions and 6 deletions

View File

@ -82,6 +82,17 @@ func (m *ControllerSystemd) RestartService(serviceName string) error {
return nil
}
// StatusService runs a `systemctl status` pipeline for the named unit through
// bash and returns any error from executing that pipeline.
//
// The pipeline prints the second field of the third line of the status
// report, but its standard output is not captured here, so the printed value
// does not influence the result.
//
// NOTE(review): without `pipefail`, the exit status of a bash pipeline is that
// of its last command (awk), so a non-nil error most likely means the shell
// itself could not be run rather than that the unit is unhealthy — confirm
// what callers expect.
func (m *ControllerSystemd) StatusService(name string) error {
	cmd := fmt.Sprintf("systemctl status %s | head -3 | tail -1 | awk '{print $2}'", name)
	if err := exec.Command("/usr/bin/bash", "-c", cmd).Run(); err != nil {
		// Name the operation that actually failed; the previous message
		// wrongly reported a restart failure.
		logrus.Errorf("Status service %s: %v", name, err)
		return err
	}
	return nil
}
func (m *ControllerSystemd) StartList(list []*service.Service) error {
logrus.Info("Starting all services.")

View File

@ -29,6 +29,8 @@ import (
"github.com/goodrain/rainbond/node/nodem/service"
"io/ioutil"
"os/exec"
"time"
"github.com/goodrain/rainbond/node/masterserver/node"
)
type ManagerService struct {
@ -144,26 +146,27 @@ func (m *ManagerService) StartSyncService() {
return
}
unhealthyNum := 0
maxUnhealthyNum := 2
go func() {
defer w.Close()
unhealthyNum := 0
maxUnhealthyNum := 2
for {
select {
case event := <-w.Watch():
switch event.Status {
case service.Stat_healthy:
logrus.Debugf("[%s] check service %s.", event.Status, event.Name)
unhealthyNum = 0
case service.Stat_unhealthy:
logrus.Infof("[%s] check service %s %d times.", event.Status, event.Name, unhealthyNum)
logrus.Debugf("[%s] check service %s %d times.", event.Status, event.Name, unhealthyNum)
if unhealthyNum > maxUnhealthyNum {
logrus.Infof("[%s] check service %s %d times and will be restart.", event.Status, event.Name, unhealthyNum)
m.ctr.RestartService(event.Name)
unhealthyNum = 0
m.ctr.RestartService(event.Name)
}
unhealthyNum++
case service.Stat_death:
logrus.Infof("[%s] check service %s %d times.", event.Status, event.Name, unhealthyNum)
logrus.Debugf("[%s] check service %s %d times.", event.Status, event.Name, unhealthyNum)
if unhealthyNum > maxUnhealthyNum {
logrus.Infof("[%s] check service %s %d times and will be start.", event.Status, event.Name, unhealthyNum)
m.ctr.StartService(event.Name)
@ -185,6 +188,22 @@ func (m *ManagerService) StopSyncService() {
}
}
// WaitStart polls the health manager once per second until the named service
// reports node.Running or the given duration has elapsed.
//
// It returns true as soon as a Running status is observed and false once the
// deadline has passed without one. The first check happens after the first
// tick, i.e. roughly one second after the call.
func (m *ManagerService) WaitStart(name string, duration time.Duration) bool {
	deadline := time.Now().Add(duration)
	// Use a stoppable ticker so it is released when this method returns.
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		<-ticker.C
		status, err := m.healthyManager.GetCurrentServiceHealthy(name)
		// A failed lookup or a missing status is treated as "not running yet"
		// instead of dereferencing a possibly nil status.
		if err == nil && status != nil && status.Status == node.Running {
			return true
		}
		if time.Now().After(deadline) {
			return false
		}
	}
}
/*
1. reload services config from local file system
2. regenerate systemd config for all service

View File

@ -33,6 +33,7 @@ import (
//Manager Manager
type Manager interface {
GetServiceHealthy(serviceName string) (*service.HealthStatus, bool)
GetCurrentServiceHealthy(serviceName string) (*service.HealthStatus, error)
WatchServiceHealthy(serviceName string) Watcher
CloseWatch(serviceName string, id string) error
Start(hostNode *client.HostNode) error