milvus/internal/proxynode/task_scheduler.go

// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.

package proxynode

import (
	"container/list"
	"context"
	"errors"
	"strconv"
	"sync"

	"github.com/opentracing/opentracing-go"
	oplog "github.com/opentracing/opentracing-go/log"
	"go.uber.org/zap"

	"github.com/milvus-io/milvus/internal/allocator"
	"github.com/milvus-io/milvus/internal/log"
	"github.com/milvus-io/milvus/internal/msgstream"
	"github.com/milvus-io/milvus/internal/proto/internalpb"
	"github.com/milvus-io/milvus/internal/util/trace"
)
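
// TaskQueue is the interface shared by the scheduler's three queues. A task is
// first added as "unissued", picked up by a scheduling loop, tracked as
// "active" while it executes, and finally popped once it finishes.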
type TaskQueue interface {
	utChan() <-chan int
	utEmpty() bool
	utFull() bool
	addUnissuedTask(t task) error
	FrontUnissuedTask() task
	PopUnissuedTask() task
	AddActiveTask(t task)
	PopActiveTask(ts Timestamp) task
	getTaskByReqID(reqID UniqueID) task
	TaskDoneTest(ts Timestamp) bool
	Enqueue(t task) error
}
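
// BaseTaskQueue is the common implementation backing the dd/dm/dq queues.
// Unissued tasks wait in a FIFO list; issued tasks are kept in activeTasks,
// keyed by their end timestamp, until the scheduler pops them after execution.
// utBufChan carries one token per unissued task so the scheduling loops can
// block until work arrives.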
type BaseTaskQueue struct {
	unissuedTasks *list.List
	activeTasks   map[Timestamp]task
	utLock        sync.RWMutex
	atLock        sync.RWMutex

	// maxTaskNum should keep still
	maxTaskNum int64

	utBufChan chan int // to block scheduler

	sched *TaskScheduler
}

func (queue *BaseTaskQueue) utChan() <-chan int {
	return queue.utBufChan
}

func (queue *BaseTaskQueue) utEmpty() bool {
	queue.utLock.RLock()
	defer queue.utLock.RUnlock()
	return queue.unissuedTasks.Len() == 0
}

func (queue *BaseTaskQueue) utFull() bool {
	return int64(queue.unissuedTasks.Len()) >= queue.maxTaskNum
}

func (queue *BaseTaskQueue) addUnissuedTask(t task) error {
	queue.utLock.Lock()
	defer queue.utLock.Unlock()

	if queue.utFull() {
		return errors.New("task queue is full")
	}
	queue.unissuedTasks.PushBack(t)
	queue.utBufChan <- 1
	return nil
}

func (queue *BaseTaskQueue) FrontUnissuedTask() task {
	queue.utLock.RLock()
	defer queue.utLock.RUnlock()

	if queue.unissuedTasks.Len() <= 0 {
		log.Warn("FrontUnissuedTask: the unissued task list is empty")
		return nil
	}
	return queue.unissuedTasks.Front().Value.(task)
}

func (queue *BaseTaskQueue) PopUnissuedTask() task {
	queue.utLock.Lock()
	defer queue.utLock.Unlock()

	if queue.unissuedTasks.Len() <= 0 {
		log.Warn("PopUnissuedTask: the unissued task list is empty")
		return nil
	}
	ft := queue.unissuedTasks.Front()
	queue.unissuedTasks.Remove(ft)
	return ft.Value.(task)
}

func (queue *BaseTaskQueue) AddActiveTask(t task) {
	queue.atLock.Lock()
	defer queue.atLock.Unlock()

	ts := t.EndTs()
	if _, ok := queue.activeTasks[ts]; ok {
		log.Debug("proxynode: task with same timestamp already in active task list, overwriting it", zap.Uint64("ts", ts))
	}
	queue.activeTasks[ts] = t
}

func (queue *BaseTaskQueue) PopActiveTask(ts Timestamp) task {
	queue.atLock.Lock()
	defer queue.atLock.Unlock()

	t, ok := queue.activeTasks[ts]
	if ok {
		delete(queue.activeTasks, ts)
		return t
	}
	log.Debug("proxynode: task with timestamp not found in active task list", zap.Uint64("ts", ts))
	return nil
}

func (queue *BaseTaskQueue) getTaskByReqID(reqID UniqueID) task {
	queue.utLock.RLock()
	defer queue.utLock.RUnlock()
	for e := queue.unissuedTasks.Front(); e != nil; e = e.Next() {
		if e.Value.(task).ID() == reqID {
			return e.Value.(task)
		}
	}

	queue.atLock.RLock()
	defer queue.atLock.RUnlock()
	for ats := range queue.activeTasks {
		if queue.activeTasks[ats].ID() == reqID {
			return queue.activeTasks[ats]
		}
	}

	return nil
}

func (queue *BaseTaskQueue) TaskDoneTest(ts Timestamp) bool {
	queue.utLock.RLock()
	defer queue.utLock.RUnlock()
	for e := queue.unissuedTasks.Front(); e != nil; e = e.Next() {
		if e.Value.(task).EndTs() < ts {
			return false
		}
	}

	queue.atLock.RLock()
	defer queue.atLock.RUnlock()
	for ats := range queue.activeTasks {
		if ats < ts {
			return false
		}
	}

	return true
}

func (queue *BaseTaskQueue) Enqueue(t task) error {
	err := t.OnEnqueue()
	if err != nil {
		return err
	}

	ts, err := queue.sched.tsoAllocator.AllocOne()
	if err != nil {
		return err
	}
	t.SetTs(ts)

	reqID, err := queue.sched.idAllocator.AllocOne()
	if err != nil {
		return err
	}
	t.SetID(reqID)

	return queue.addUnissuedTask(t)
}
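
// DdTaskQueue holds data-definition (DDL) tasks such as create/drop collection.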
type DdTaskQueue struct {
	BaseTaskQueue
	lock sync.Mutex
}
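
// DmTaskQueue holds data-manipulation (DML) tasks such as insert.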
type DmTaskQueue struct {
	BaseTaskQueue
}
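
// getPChanStatistics reports the minimum and maximum timestamps of the active
// DML tasks for the given physical channel; invalid marks a result that could
// not be derived from the currently active tasks.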
func (queue *DmTaskQueue) getPChanStatistics(pchan pChan) (pChanStatistics, error) {
	queue.atLock.RLock()
	defer queue.atLock.RUnlock()

	stats := pChanStatistics{
		minTs:   ^uint64(0),
		maxTs:   0,
		invalid: true,
	}

	for _, t := range queue.activeTasks {
		dmlT, ok := t.(dmlTask)
		if !ok {
			continue
		}
		stat, err := dmlT.getStatistics(pchan)
		if err != nil {
			return pChanStatistics{invalid: true}, nil
		}
		if stat.minTs < stats.minTs {
			stats.minTs = stat.minTs
		}
		if stat.maxTs > stats.maxTs {
			stats.maxTs = stat.maxTs
		}
		stats.invalid = false
	}

	return stats, nil
}
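
// DqTaskQueue holds data-query tasks such as search and retrieve.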
type DqTaskQueue struct {
	BaseTaskQueue
}
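
// Enqueue serializes DDL enqueueing: holding the extra mutex across timestamp
// allocation and insertion keeps DDL tasks in the queue in timestamp order.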
func (queue *DdTaskQueue) Enqueue(t task) error {
	queue.lock.Lock()
	defer queue.lock.Unlock()
	return queue.BaseTaskQueue.Enqueue(t)
}

func NewDdTaskQueue(sched *TaskScheduler) *DdTaskQueue {
	return &DdTaskQueue{
		BaseTaskQueue: BaseTaskQueue{
			unissuedTasks: list.New(),
			activeTasks:   make(map[Timestamp]task),
			maxTaskNum:    1024,
			utBufChan:     make(chan int, 1024),
			sched:         sched,
		},
	}
}

func NewDmTaskQueue(sched *TaskScheduler) *DmTaskQueue {
	return &DmTaskQueue{
		BaseTaskQueue: BaseTaskQueue{
			unissuedTasks: list.New(),
			activeTasks:   make(map[Timestamp]task),
			maxTaskNum:    1024,
			utBufChan:     make(chan int, 1024),
			sched:         sched,
		},
	}
}

func NewDqTaskQueue(sched *TaskScheduler) *DqTaskQueue {
	return &DqTaskQueue{
		BaseTaskQueue: BaseTaskQueue{
			unissuedTasks: list.New(),
			activeTasks:   make(map[Timestamp]task),
			maxTaskNum:    1024,
			utBufChan:     make(chan int, 1024),
			sched:         sched,
		},
	}
}
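
// TaskScheduler owns the three task queues and the goroutines that drain them.
// DDL, DML, and query tasks each get their own queue and scheduling loop; a
// fourth loop collects search/retrieve results coming back from query nodes.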
type TaskScheduler struct {
	DdQueue TaskQueue
	DmQueue *DmTaskQueue
	DqQueue TaskQueue

	idAllocator  *allocator.IDAllocator
	tsoAllocator *TimestampAllocator

	wg     sync.WaitGroup
	ctx    context.Context
	cancel context.CancelFunc

	msFactory msgstream.Factory
}
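
// NewTaskScheduler wires the three queues to the given allocators and message
// stream factory. A minimal usage sketch (the surrounding ProxyNode is assumed
// to provide ctx, the allocators, and the factory):
//
//	sched, err := NewTaskScheduler(ctx, idAllocator, tsoAllocator, msFactory)
//	if err != nil {
//		return err
//	}
//	if err := sched.Start(); err != nil {
//		return err
//	}
//	defer sched.Close()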
func NewTaskScheduler(ctx context.Context,
	idAllocator *allocator.IDAllocator,
	tsoAllocator *TimestampAllocator,
	factory msgstream.Factory) (*TaskScheduler, error) {
	ctx1, cancel := context.WithCancel(ctx)
	s := &TaskScheduler{
		idAllocator:  idAllocator,
		tsoAllocator: tsoAllocator,
		ctx:          ctx1,
		cancel:       cancel,
		msFactory:    factory,
	}
	s.DdQueue = NewDdTaskQueue(s)
	s.DmQueue = NewDmTaskQueue(s)
	s.DqQueue = NewDqTaskQueue(s)

	return s, nil
}

func (sched *TaskScheduler) scheduleDdTask() task {
	return sched.DdQueue.PopUnissuedTask()
}

func (sched *TaskScheduler) scheduleDmTask() task {
	return sched.DmQueue.PopUnissuedTask()
}

func (sched *TaskScheduler) scheduleDqTask() task {
	return sched.DqQueue.PopUnissuedTask()
}

func (sched *TaskScheduler) getTaskByReqID(reqID UniqueID) task {
	if t := sched.DdQueue.getTaskByReqID(reqID); t != nil {
		return t
	}
	if t := sched.DmQueue.getTaskByReqID(reqID); t != nil {
		return t
	}
	if t := sched.DqQueue.getTaskByReqID(reqID); t != nil {
		return t
	}
	return nil
}
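
// processTask drives a task through PreExecute, Execute, and PostExecute,
// keeping it in the queue's active list while it runs and notifying the
// waiting caller of the final error through t.Notify.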
func (sched *TaskScheduler) processTask(t task, q TaskQueue) {
	span, ctx := trace.StartSpanFromContext(t.TraceCtx(),
		opentracing.Tags{
			"Type": t.Name(),
			"ID":   t.ID(),
		})
	defer span.Finish()

	span.LogFields(oplog.Int64("scheduler process PreExecute", t.ID()))
	err := t.PreExecute(ctx)

	// The deferred Notify reports whatever err holds when this function
	// returns, including the PostExecute error assigned below.
	defer func() {
		t.Notify(err)
	}()
	if err != nil {
		trace.LogError(span, err)
		return
	}

	span.LogFields(oplog.Int64("scheduler process AddActiveTask", t.ID()))
	q.AddActiveTask(t)
	defer func() {
		span.LogFields(oplog.Int64("scheduler process PopActiveTask", t.ID()))
		q.PopActiveTask(t.EndTs())
	}()

	span.LogFields(oplog.Int64("scheduler process Execute", t.ID()))
	err = t.Execute(ctx)
	if err != nil {
		trace.LogError(span, err)
		return
	}

	span.LogFields(oplog.Int64("scheduler process PostExecute", t.ID()))
	err = t.PostExecute(ctx)
}
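
// definitionLoop runs DDL tasks one at a time so that definition changes are
// applied serially; manipulationLoop and queryLoop below hand their tasks to
// separate goroutines, since DML and query tasks may proceed concurrently.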
func (sched *TaskScheduler) definitionLoop() {
	defer sched.wg.Done()
	for {
		select {
		case <-sched.ctx.Done():
			return
		case <-sched.DdQueue.utChan():
			if !sched.DdQueue.utEmpty() {
				t := sched.scheduleDdTask()
				sched.processTask(t, sched.DdQueue)
			}
		}
	}
}

func (sched *TaskScheduler) manipulationLoop() {
	defer sched.wg.Done()
	for {
		select {
		case <-sched.ctx.Done():
			return
		case <-sched.DmQueue.utChan():
			if !sched.DmQueue.utEmpty() {
				t := sched.scheduleDmTask()
				go sched.processTask(t, sched.DmQueue)
			}
		}
	}
}

func (sched *TaskScheduler) queryLoop() {
	defer sched.wg.Done()
	for {
		select {
		case <-sched.ctx.Done():
			return
		case <-sched.DqQueue.utChan():
			if !sched.DqQueue.utEmpty() {
				t := sched.scheduleDqTask()
				go sched.processTask(t, sched.DqQueue)
			} else {
				log.Debug("query queue is empty ...")
			}
		}
	}
}
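
// searchResultBuf accumulates the partial search results of a single request.
// A result set is ready to reduce only after every virtual channel the request
// used has reported back and the sealed segments seen cover the global sealed
// segment list.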
type searchResultBuf struct {
	usedVChans                  map[interface{}]struct{} // set of vChan
	receivedVChansSet           map[interface{}]struct{} // set of vChan
	receivedSealedSegmentIDsSet map[interface{}]struct{} // set of UniqueID
	receivedGlobalSegmentIDsSet map[interface{}]struct{} // set of UniqueID
	resultBuf                   []*internalpb.SearchResults
}

func newSearchResultBuf() *searchResultBuf {
	return &searchResultBuf{
		usedVChans:                  make(map[interface{}]struct{}),
		receivedVChansSet:           make(map[interface{}]struct{}),
		receivedSealedSegmentIDsSet: make(map[interface{}]struct{}),
		receivedGlobalSegmentIDsSet: make(map[interface{}]struct{}),
		resultBuf:                   make([]*internalpb.SearchResults, 0),
	}
}
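
// setContain reports whether every key of m2 is also a key of m1, i.e. m2 ⊆ m1.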
func setContain(m1, m2 map[interface{}]struct{}) bool {
	if len(m1) < len(m2) {
		return false
	}
	for k2 := range m2 {
		if _, ok := m1[k2]; !ok {
			return false
		}
	}
	return true
}

func (sr *searchResultBuf) readyToReduce() bool {
	if !setContain(sr.receivedVChansSet, sr.usedVChans) {
		return false
	}
	return setContain(sr.receivedSealedSegmentIDsSet, sr.receivedGlobalSegmentIDsSet)
}

func (sr *searchResultBuf) addPartialResult(result *internalpb.SearchResults) {
	sr.resultBuf = append(sr.resultBuf, result)
	for _, vchan := range result.ChannelIDsSearched {
		sr.receivedVChansSet[vchan] = struct{}{}
	}
	for _, sealedSegment := range result.SealedSegmentIDsSearched {
		sr.receivedSealedSegmentIDsSet[sealedSegment] = struct{}{}
	}
	for _, globalSegment := range result.GlobalSealedSegmentIDs {
		sr.receivedGlobalSegmentIDsSet[globalSegment] = struct{}{}
	}
}
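
// queryResultLoop consumes search and retrieve results published by query
// nodes, groups them by request ID, and hands the collected set to the waiting
// SearchTask or RetrieveTask once all expected partial results have arrived.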
func (sched *TaskScheduler) queryResultLoop() {
	defer sched.wg.Done()

	queryResultMsgStream, err := sched.msFactory.NewQueryMsgStream(sched.ctx)
	if err != nil {
		log.Error("proxynode failed to create query result msgstream", zap.Error(err))
		return
	}
	queryResultMsgStream.AsConsumer(Params.SearchResultChannelNames, Params.ProxySubName)
	log.Debug("proxynode", zap.Strings("search result channel names", Params.SearchResultChannelNames))
	log.Debug("proxynode", zap.String("proxySubName", Params.ProxySubName))

	queryNodeNum := Params.QueryNodeNum

	queryResultMsgStream.Start()
	defer queryResultMsgStream.Close()

	queryResultBuf := make(map[UniqueID]*searchResultBuf)
	retrieveResultBuf := make(map[UniqueID][]*internalpb.RetrieveResults)

	for {
		select {
		case msgPack, ok := <-queryResultMsgStream.Chan():
			if !ok {
				log.Debug("buf chan closed")
				return
			}
			if msgPack == nil {
				continue
			}
			for _, tsMsg := range msgPack.Msgs {
				sp, ctx := trace.StartSpanFromContext(tsMsg.TraceCtx())
				tsMsg.SetTraceCtx(ctx)
				if searchResultMsg, srOk := tsMsg.(*msgstream.SearchResultMsg); srOk {
					reqID := searchResultMsg.Base.MsgID
					reqIDStr := strconv.FormatInt(reqID, 10)
					t := sched.getTaskByReqID(reqID)
					if t == nil {
						log.Debug("proxynode: SearchResult task not found", zap.String("reqID", reqIDStr))
						delete(queryResultBuf, reqID)
						continue
					}

					st, ok := t.(*SearchTask)
					if !ok {
						delete(queryResultBuf, reqID)
						continue
					}

					resultBuf, ok := queryResultBuf[reqID]
					if !ok {
						resultBuf = newSearchResultBuf()
						vchans, err := st.getVChannels()
						if err != nil {
							delete(queryResultBuf, reqID)
							continue
						}
						for _, vchan := range vchans {
							resultBuf.usedVChans[vchan] = struct{}{}
						}
						queryResultBuf[reqID] = resultBuf
					}
					resultBuf.addPartialResult(&searchResultMsg.SearchResults)

					colName := st.query.CollectionName
					log.Debug("Getcollection", zap.String("collection name", colName), zap.String("reqID", reqIDStr), zap.Int("answer cnt", len(resultBuf.resultBuf)))

					if resultBuf.readyToReduce() {
						st.resultBuf <- resultBuf.resultBuf
					}
					sp.Finish()
				}
				if retrieveResultMsg, rtOk := tsMsg.(*msgstream.RetrieveResultMsg); rtOk {
					reqID := retrieveResultMsg.Base.MsgID
					reqIDStr := strconv.FormatInt(reqID, 10)
					t := sched.getTaskByReqID(reqID)
					if t == nil {
						log.Debug("proxynode: RetrieveResult task not found", zap.String("reqID", reqIDStr))
						delete(retrieveResultBuf, reqID)
						continue
					}

					rt, ok := t.(*RetrieveTask)
					if !ok {
						delete(retrieveResultBuf, reqID)
						continue
					}

					retrieveResultBuf[reqID] = append(retrieveResultBuf[reqID], &retrieveResultMsg.RetrieveResults)

					colName := rt.retrieve.CollectionName
					log.Debug("Getcollection", zap.String("collection name", colName), zap.String("reqID", reqIDStr), zap.Int("answer cnt", len(retrieveResultBuf[reqID])))

					if len(retrieveResultBuf[reqID]) == queryNodeNum {
						rt.resultBuf <- retrieveResultBuf[reqID]
						delete(retrieveResultBuf, reqID)
					}
					sp.Finish()
				}
			}
		case <-sched.ctx.Done():
			log.Debug("proxynode server is closed ...")
			return
		}
	}
}
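
// Start launches the scheduling loops and the result-collection loop. It does
// not block; the goroutines exit when the scheduler's context is canceled.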
func (sched *TaskScheduler) Start() error {
	sched.wg.Add(1)
	go sched.definitionLoop()

	sched.wg.Add(1)
	go sched.manipulationLoop()

	sched.wg.Add(1)
	go sched.queryLoop()

	sched.wg.Add(1)
	go sched.queryResultLoop()

	return nil
}
func (sched *TaskScheduler) Close() {
	sched.cancel()
	sched.wg.Wait()
}
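
// TaskDoneTest reports whether every DDL and DML task with an end timestamp
// earlier than ts has finished; the query queue is currently skipped.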
func (sched *TaskScheduler) TaskDoneTest(ts Timestamp) bool {
	ddTaskDone := sched.DdQueue.TaskDoneTest(ts)
	dmTaskDone := sched.DmQueue.TaskDoneTest(ts)
	// dqTaskDone := sched.DqQueue.TaskDoneTest(ts)
	return ddTaskDone && dmTaskDone
}

func (sched *TaskScheduler) getPChanStatistics(pchan pChan) (pChanStatistics, error) {
	return sched.DmQueue.getPChanStatistics(pchan)
}