2018-03-14 14:12:26 +08:00
|
|
|
|
// Copyright (C) 2014-2018 Goodrain Co., Ltd.
|
2017-11-08 10:23:05 +08:00
|
|
|
|
// RAINBOND, Application Management Platform
|
2018-03-14 14:33:31 +08:00
|
|
|
|
|
2017-11-08 10:23:05 +08:00
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
|
|
|
// it under the terms of the GNU General Public License as published by
|
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
|
// (at your option) any later version. For any non-GPL usage of Rainbond,
|
|
|
|
|
// one or multiple Commercial Licenses authorized by Goodrain Co., Ltd.
|
|
|
|
|
// must be obtained first.
|
2018-03-14 14:33:31 +08:00
|
|
|
|
|
2017-11-08 10:23:05 +08:00
|
|
|
|
// This program is distributed in the hope that it will be useful,
|
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
// GNU General Public License for more details.
|
2018-03-14 14:33:31 +08:00
|
|
|
|
|
2017-11-08 10:23:05 +08:00
|
|
|
|
// You should have received a copy of the GNU General Public License
|
|
|
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
2017-11-08 21:39:51 +08:00
|
|
|
|
package nodeserver
|
2017-11-08 10:23:05 +08:00
|
|
|
|
|
|
|
|
|
import (
|
2018-05-18 23:40:19 +08:00
|
|
|
|
"context"
|
2017-11-08 10:23:05 +08:00
|
|
|
|
"fmt"
|
|
|
|
|
"os"
|
|
|
|
|
"strconv"
|
2017-11-09 17:32:02 +08:00
|
|
|
|
"strings"
|
2017-11-08 10:23:05 +08:00
|
|
|
|
"sync"
|
|
|
|
|
"time"
|
|
|
|
|
|
2018-05-18 23:40:19 +08:00
|
|
|
|
"github.com/goodrain/rainbond/node/core/job"
|
|
|
|
|
|
|
|
|
|
"github.com/goodrain/rainbond/util/watch"
|
|
|
|
|
|
2017-11-08 10:23:05 +08:00
|
|
|
|
conf "github.com/goodrain/rainbond/cmd/node/option"
|
2018-04-24 16:44:59 +08:00
|
|
|
|
"github.com/goodrain/rainbond/node/api/model"
|
|
|
|
|
corejob "github.com/goodrain/rainbond/node/core/job"
|
|
|
|
|
"github.com/goodrain/rainbond/node/core/store"
|
|
|
|
|
"github.com/goodrain/rainbond/util"
|
2017-11-08 10:23:05 +08:00
|
|
|
|
"github.com/robfig/cron"
|
|
|
|
|
|
|
|
|
|
"github.com/Sirupsen/logrus"
|
|
|
|
|
client "github.com/coreos/etcd/clientv3"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// Config holds server configuration values.
//
// It is not referenced by the other code visible in this file; the meaning of
// each field is taken from its name only.
type Config struct {
	EtcdEndPoints []string
	EtcdTimeout   int

	EtcdPrefix, ClusterName, APIAddr, K8SConfPath string

	EventServerAddress   []string
	PrometheusMetricPath string
	TTL                  int64
}
|
|
|
|
|
|
2017-12-19 18:46:49 +08:00
|
|
|
|
//Jobs jobs
|
|
|
|
|
type Jobs map[string]*corejob.Job
|
|
|
|
|
|
2017-11-08 10:23:05 +08:00
|
|
|
|
//NodeServer node manager server
|
|
|
|
|
type NodeServer struct {
|
|
|
|
|
*store.Client
|
|
|
|
|
*model.HostNode
|
|
|
|
|
*cron.Cron
|
2018-05-18 23:40:19 +08:00
|
|
|
|
ctx context.Context
|
|
|
|
|
cancel context.CancelFunc
|
2017-11-08 13:36:06 +08:00
|
|
|
|
jobs Jobs // 和结点相关的任务
|
|
|
|
|
onceJobs Jobs //记录执行的单任务
|
|
|
|
|
jobLock sync.Mutex
|
2017-12-19 18:46:49 +08:00
|
|
|
|
cmds map[string]*corejob.Cmd
|
2017-11-08 10:23:05 +08:00
|
|
|
|
// 删除的 job id,用于 group 更新
|
|
|
|
|
delIDs map[string]bool
|
|
|
|
|
ttl int64
|
|
|
|
|
lID client.LeaseID // lease id
|
|
|
|
|
regLeaseID client.LeaseID // lease id
|
|
|
|
|
done chan struct{}
|
|
|
|
|
//Config
|
|
|
|
|
*conf.Conf
|
|
|
|
|
}
|
|
|
|
|
|
2017-11-16 16:32:27 +08:00
|
|
|
|
//Regist 节点注册
|
|
|
|
|
func (n *NodeServer) Regist() error {
|
2017-11-08 10:23:05 +08:00
|
|
|
|
resp, err := n.Client.Grant(n.ttl + 2)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
2017-11-09 17:32:02 +08:00
|
|
|
|
if _, err = n.HostNode.Update(); err != nil {
|
|
|
|
|
return err
|
2017-11-08 10:23:05 +08:00
|
|
|
|
}
|
|
|
|
|
if _, err = n.HostNode.Put(client.WithLease(resp.ID)); err != nil {
|
|
|
|
|
return err
|
|
|
|
|
}
|
|
|
|
|
n.lID = resp.ID
|
2018-05-18 23:40:19 +08:00
|
|
|
|
logrus.Infof("node(%s) registe success", n.HostName)
|
2017-11-08 10:23:05 +08:00
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//Run 启动
|
2018-05-18 23:40:19 +08:00
|
|
|
|
func (n *NodeServer) Run(errchan chan error) (err error) {
|
|
|
|
|
n.ctx, n.cancel = context.WithCancel(context.Background())
|
2017-11-16 16:32:27 +08:00
|
|
|
|
n.Regist()
|
2017-11-08 10:23:05 +08:00
|
|
|
|
go n.keepAlive()
|
2018-05-18 23:40:19 +08:00
|
|
|
|
go n.watchJobs(errchan)
|
2017-11-08 10:23:05 +08:00
|
|
|
|
n.Cron.Start()
|
2017-12-19 18:46:49 +08:00
|
|
|
|
if err := corejob.StartProc(); err != nil {
|
2017-11-09 17:32:02 +08:00
|
|
|
|
logrus.Warnf("[process key will not timeout]proc lease id set err: %s", err.Error())
|
|
|
|
|
}
|
2017-11-08 10:23:05 +08:00
|
|
|
|
return
|
|
|
|
|
}
|
2018-05-18 23:40:19 +08:00
|
|
|
|
|
|
|
|
|
func (n *NodeServer) watchJobs(errChan chan error) error {
|
|
|
|
|
watcher := watch.New(store.DefalutClient.Client, "")
|
|
|
|
|
watchChan, err := watcher.WatchList(n.ctx, n.Conf.JobPath, "")
|
2017-11-08 10:23:05 +08:00
|
|
|
|
if err != nil {
|
2018-05-18 23:40:19 +08:00
|
|
|
|
errChan <- err
|
2017-11-08 10:23:05 +08:00
|
|
|
|
return err
|
|
|
|
|
}
|
2018-05-18 23:40:19 +08:00
|
|
|
|
defer watchChan.Stop()
|
|
|
|
|
for event := range watchChan.ResultChan() {
|
|
|
|
|
switch event.Type {
|
|
|
|
|
case watch.Added:
|
|
|
|
|
j := new(job.Job)
|
|
|
|
|
err := j.Decode(event.GetValue())
|
|
|
|
|
if err != nil {
|
|
|
|
|
logrus.Errorf("decode job error :%s", err)
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
n.addJob(j)
|
|
|
|
|
case watch.Modified:
|
|
|
|
|
j := new(job.Job)
|
|
|
|
|
err := j.Decode(event.GetValue())
|
|
|
|
|
if err != nil {
|
|
|
|
|
logrus.Errorf("decode job error :%s", err)
|
|
|
|
|
continue
|
2017-11-08 10:23:05 +08:00
|
|
|
|
}
|
2018-05-18 23:40:19 +08:00
|
|
|
|
n.modJob(j)
|
|
|
|
|
case watch.Deleted:
|
|
|
|
|
n.delJob(event.GetKey())
|
|
|
|
|
default:
|
|
|
|
|
logrus.Errorf("watch job error:%v", event.Error)
|
|
|
|
|
errChan <- event.Error
|
2017-11-08 10:23:05 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
2018-05-18 23:40:19 +08:00
|
|
|
|
return nil
|
2017-11-08 10:23:05 +08:00
|
|
|
|
}
|
2017-12-19 18:46:49 +08:00
|
|
|
|
|
|
|
|
|
//添加job缓存
|
|
|
|
|
func (n *NodeServer) addJob(j *corejob.Job) {
|
2017-11-08 10:23:05 +08:00
|
|
|
|
if !j.IsRunOn(n.HostNode) {
|
|
|
|
|
return
|
|
|
|
|
}
|
2017-12-19 18:46:49 +08:00
|
|
|
|
//一次性任务
|
|
|
|
|
if j.Rules.Mode != corejob.Cycle {
|
|
|
|
|
n.runOnceJob(j)
|
|
|
|
|
return
|
|
|
|
|
}
|
2017-11-08 10:23:05 +08:00
|
|
|
|
n.jobLock.Lock()
|
|
|
|
|
defer n.jobLock.Unlock()
|
|
|
|
|
n.jobs[j.ID] = j
|
|
|
|
|
cmds := j.Cmds(n.HostNode)
|
|
|
|
|
if len(cmds) == 0 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
for _, cmd := range cmds {
|
|
|
|
|
n.addCmd(cmd)
|
|
|
|
|
}
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (n *NodeServer) delJob(id string) {
|
|
|
|
|
n.jobLock.Lock()
|
|
|
|
|
defer n.jobLock.Unlock()
|
|
|
|
|
n.delIDs[id] = true
|
|
|
|
|
job, ok := n.jobs[id]
|
|
|
|
|
// 之前此任务没有在当前结点执行
|
|
|
|
|
if !ok {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
cmds := job.Cmds(n.HostNode)
|
|
|
|
|
if len(cmds) == 0 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
for _, cmd := range cmds {
|
|
|
|
|
n.delCmd(cmd)
|
|
|
|
|
}
|
|
|
|
|
delete(n.jobs, id)
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2017-12-19 18:46:49 +08:00
|
|
|
|
func (n *NodeServer) modJob(job *corejob.Job) {
|
|
|
|
|
if !job.IsRunOn(n.HostNode) {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
//一次性任务
|
|
|
|
|
if job.Rules.Mode != corejob.Cycle {
|
|
|
|
|
n.runOnceJob(job)
|
|
|
|
|
return
|
|
|
|
|
}
|
2017-11-08 10:23:05 +08:00
|
|
|
|
oJob, ok := n.jobs[job.ID]
|
|
|
|
|
// 之前此任务没有在当前结点执行,直接增加任务
|
|
|
|
|
if !ok {
|
|
|
|
|
n.addJob(job)
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
prevCmds := oJob.Cmds(n.HostNode)
|
|
|
|
|
|
|
|
|
|
job.Count = oJob.Count
|
|
|
|
|
*oJob = *job
|
|
|
|
|
cmds := oJob.Cmds(n.HostNode)
|
|
|
|
|
for id, cmd := range cmds {
|
|
|
|
|
n.modCmd(cmd)
|
|
|
|
|
delete(prevCmds, id)
|
|
|
|
|
}
|
|
|
|
|
for _, cmd := range prevCmds {
|
|
|
|
|
n.delCmd(cmd)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2017-12-19 18:46:49 +08:00
|
|
|
|
func (n *NodeServer) addCmd(cmd *corejob.Cmd) {
|
|
|
|
|
n.Cron.Schedule(cmd.Rule.Schedule, cmd)
|
2017-11-08 10:23:05 +08:00
|
|
|
|
n.cmds[cmd.GetID()] = cmd
|
2017-12-19 18:46:49 +08:00
|
|
|
|
logrus.Infof("job[%s] rule[%s] timer[%s] has added", cmd.Job.ID, cmd.Rule.ID, cmd.Rule.Timer)
|
2017-11-08 10:23:05 +08:00
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
2017-12-19 18:46:49 +08:00
|
|
|
|
func (n *NodeServer) modCmd(cmd *corejob.Cmd) {
|
2017-11-08 10:23:05 +08:00
|
|
|
|
c, ok := n.cmds[cmd.GetID()]
|
|
|
|
|
if !ok {
|
|
|
|
|
n.addCmd(cmd)
|
|
|
|
|
return
|
|
|
|
|
}
|
2017-12-19 18:46:49 +08:00
|
|
|
|
sch := c.Rule.Timer
|
2017-11-08 10:23:05 +08:00
|
|
|
|
*c = *cmd
|
|
|
|
|
// 节点执行时间改变,更新 cron
|
|
|
|
|
// 否则不用更新 cron
|
2017-12-19 18:46:49 +08:00
|
|
|
|
if c.Rule.Timer != sch {
|
|
|
|
|
n.Cron.Schedule(c.Rule.Schedule, c)
|
2017-11-08 10:23:05 +08:00
|
|
|
|
}
|
2017-12-19 18:46:49 +08:00
|
|
|
|
logrus.Infof("job[%s] rule[%s] timer[%s] has updated", c.Job.ID, c.Rule.ID, c.Rule.Timer)
|
2017-11-08 10:23:05 +08:00
|
|
|
|
}
|
|
|
|
|
|
2017-12-19 18:46:49 +08:00
|
|
|
|
func (n *NodeServer) delCmd(cmd *corejob.Cmd) {
|
2017-11-08 10:23:05 +08:00
|
|
|
|
delete(n.cmds, cmd.GetID())
|
|
|
|
|
n.Cron.DelJob(cmd)
|
2017-12-19 18:46:49 +08:00
|
|
|
|
logrus.Infof("job[%s] rule[%s] timer[%s] has deleted", cmd.Job.ID, cmd.Rule.ID, cmd.Rule.Timer)
|
2017-11-08 10:23:05 +08:00
|
|
|
|
}
|
|
|
|
|
|
2017-12-19 18:46:49 +08:00
|
|
|
|
//job must be schedulered
|
|
|
|
|
func (n *NodeServer) runOnceJob(j *corejob.Job) {
|
|
|
|
|
go j.RunWithRecovery()
|
2017-11-08 10:23:05 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//Stop 停止服务
|
|
|
|
|
func (n *NodeServer) Stop(i interface{}) {
|
2018-05-18 23:40:19 +08:00
|
|
|
|
n.cancel()
|
2017-11-08 10:23:05 +08:00
|
|
|
|
n.HostNode.Down()
|
|
|
|
|
close(n.done)
|
|
|
|
|
n.HostNode.Del()
|
|
|
|
|
n.Client.Close()
|
|
|
|
|
n.Cron.Stop()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (n *NodeServer) keepAlive() {
|
|
|
|
|
duration := time.Duration(n.ttl) * time.Second
|
|
|
|
|
timer := time.NewTimer(duration)
|
|
|
|
|
for {
|
|
|
|
|
select {
|
|
|
|
|
case <-n.done:
|
2017-12-07 18:47:33 +08:00
|
|
|
|
timer.Stop()
|
2017-11-08 10:23:05 +08:00
|
|
|
|
return
|
|
|
|
|
case <-timer.C:
|
|
|
|
|
if n.lID > 0 {
|
|
|
|
|
_, err := n.Client.KeepAliveOnce(n.lID)
|
|
|
|
|
if err == nil {
|
|
|
|
|
timer.Reset(duration)
|
|
|
|
|
continue
|
|
|
|
|
}
|
2017-11-09 17:32:02 +08:00
|
|
|
|
logrus.Warnf("%s lid[%x] keepAlive err: %s, try to reset...", n.HostName, n.lID, err.Error())
|
2017-11-08 10:23:05 +08:00
|
|
|
|
n.lID = 0
|
|
|
|
|
}
|
2017-11-16 16:32:27 +08:00
|
|
|
|
if err := n.Regist(); err != nil {
|
2017-11-09 17:32:02 +08:00
|
|
|
|
logrus.Warnf("%s set lid err: %s, try to reset after %d seconds...", n.HostName, err.Error(), n.ttl)
|
2017-11-08 10:23:05 +08:00
|
|
|
|
} else {
|
2017-11-09 17:32:02 +08:00
|
|
|
|
logrus.Infof("%s set lid[%x] success", n.HostName, n.lID)
|
2017-11-08 10:23:05 +08:00
|
|
|
|
}
|
|
|
|
|
timer.Reset(duration)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//NewNodeServer new server
|
|
|
|
|
func NewNodeServer(cfg *conf.Conf) (*NodeServer, error) {
|
2017-11-09 17:32:02 +08:00
|
|
|
|
currentNode, err := GetCurrentNode(cfg)
|
2017-11-08 10:23:05 +08:00
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
2017-12-19 18:46:49 +08:00
|
|
|
|
if cfg.TTL == 0 {
|
|
|
|
|
cfg.TTL = 10
|
2017-12-07 18:47:33 +08:00
|
|
|
|
}
|
2017-11-08 10:23:05 +08:00
|
|
|
|
n := &NodeServer{
|
2017-11-09 17:32:02 +08:00
|
|
|
|
Client: store.DefalutClient,
|
|
|
|
|
HostNode: currentNode,
|
2017-11-08 13:36:06 +08:00
|
|
|
|
Cron: cron.New(),
|
|
|
|
|
jobs: make(Jobs, 8),
|
|
|
|
|
onceJobs: make(Jobs, 8),
|
2017-12-19 18:46:49 +08:00
|
|
|
|
cmds: make(map[string]*corejob.Cmd),
|
2017-11-08 13:36:06 +08:00
|
|
|
|
delIDs: make(map[string]bool, 8),
|
|
|
|
|
Conf: cfg,
|
|
|
|
|
ttl: cfg.TTL,
|
|
|
|
|
done: make(chan struct{}),
|
2017-11-08 10:23:05 +08:00
|
|
|
|
}
|
|
|
|
|
return n, nil
|
|
|
|
|
}
|
2017-11-09 17:32:02 +08:00
|
|
|
|
|
|
|
|
|
//GetCurrentNode 获取当前节点
|
|
|
|
|
func GetCurrentNode(cfg *conf.Conf) (*model.HostNode, error) {
|
|
|
|
|
uid, err := util.ReadHostID(cfg.HostIDFile)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("Get host id error:%s", err.Error())
|
|
|
|
|
}
|
|
|
|
|
res, err := store.DefalutClient.Get(cfg.NodePath + "/" + uid)
|
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, fmt.Errorf("Get host info error:%s", err.Error())
|
|
|
|
|
}
|
|
|
|
|
var node model.HostNode
|
|
|
|
|
if res.Count == 0 {
|
|
|
|
|
if cfg.HostIP == "" {
|
2017-11-13 21:54:11 +08:00
|
|
|
|
ip, err := util.LocalIP()
|
2017-11-09 17:32:02 +08:00
|
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
cfg.HostIP = ip.String()
|
|
|
|
|
}
|
|
|
|
|
node = CreateNode(cfg, uid, cfg.HostIP)
|
|
|
|
|
} else {
|
|
|
|
|
n := model.GetNodeFromKV(res.Kvs[0])
|
|
|
|
|
if n == nil {
|
|
|
|
|
return nil, fmt.Errorf("Get node info from etcd error")
|
|
|
|
|
}
|
|
|
|
|
node = *n
|
|
|
|
|
}
|
|
|
|
|
node.Role = strings.Split(cfg.NodeRule, ",")
|
|
|
|
|
if node.Labels == nil || len(node.Labels) < 1 {
|
|
|
|
|
node.Labels = map[string]string{}
|
|
|
|
|
}
|
|
|
|
|
for _, rule := range node.Role {
|
|
|
|
|
node.Labels["rainbond_node_rule_"+rule] = "true"
|
|
|
|
|
}
|
2017-11-28 14:30:19 +08:00
|
|
|
|
if node.HostName == "" {
|
|
|
|
|
hostname, _ := os.Hostname()
|
|
|
|
|
node.HostName = hostname
|
|
|
|
|
}
|
|
|
|
|
if node.ClusterNode.PID == "" {
|
|
|
|
|
node.ClusterNode.PID = strconv.Itoa(os.Getpid())
|
|
|
|
|
}
|
2017-11-09 17:32:02 +08:00
|
|
|
|
node.Labels["rainbond_node_hostname"] = node.HostName
|
|
|
|
|
node.Labels["rainbond_node_ip"] = node.InternalIP
|
2017-11-13 21:54:11 +08:00
|
|
|
|
node.UpdataCondition(model.NodeCondition{
|
|
|
|
|
Type: model.NodeInit,
|
|
|
|
|
Status: model.ConditionTrue,
|
|
|
|
|
LastHeartbeatTime: time.Now(),
|
|
|
|
|
LastTransitionTime: time.Now(),
|
|
|
|
|
})
|
2017-12-20 10:23:55 +08:00
|
|
|
|
node.Mode = cfg.RunMode
|
2017-11-09 17:32:02 +08:00
|
|
|
|
return &node, nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//CreateNode 创建节点信息
|
|
|
|
|
func CreateNode(cfg *conf.Conf, nodeID, ip string) model.HostNode {
|
|
|
|
|
HostNode := model.HostNode{
|
|
|
|
|
ID: nodeID,
|
|
|
|
|
ClusterNode: model.ClusterNode{
|
2017-11-13 21:54:11 +08:00
|
|
|
|
PID: strconv.Itoa(os.Getpid()),
|
|
|
|
|
Conditions: make([]model.NodeCondition, 0),
|
2017-11-09 17:32:02 +08:00
|
|
|
|
},
|
|
|
|
|
InternalIP: ip,
|
|
|
|
|
ExternalIP: ip,
|
2017-11-13 21:54:11 +08:00
|
|
|
|
CreateTime: time.Now(),
|
2017-11-09 17:32:02 +08:00
|
|
|
|
}
|
|
|
|
|
return HostNode
|
|
|
|
|
}
|