milvus/internal/querycoordv2/observers/collection_observer.go
2024-08-02 17:20:15 +08:00

352 lines
12 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package observers
import (
"context"
"fmt"
"sync"
"time"
"github.com/samber/lo"
"go.opentelemetry.io/otel/trace"
"go.uber.org/zap"
"github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/querycoordv2/checkers"
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/utils"
"github.com/milvus-io/milvus/pkg/common"
"github.com/milvus-io/milvus/pkg/eventlog"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
type CollectionObserver struct {
cancel context.CancelFunc
wg sync.WaitGroup
dist *meta.DistributionManager
meta *meta.Meta
targetMgr meta.TargetManagerInterface
targetObserver *TargetObserver
checkerController *checkers.CheckerController
partitionLoadedCount map[int64]int
loadTasks *typeutil.ConcurrentMap[string, LoadTask]
stopOnce sync.Once
}
type LoadTask struct {
LoadType querypb.LoadType
CollectionID int64
PartitionIDs []int64
}
func NewCollectionObserver(
dist *meta.DistributionManager,
meta *meta.Meta,
targetMgr meta.TargetManagerInterface,
targetObserver *TargetObserver,
checherController *checkers.CheckerController,
) *CollectionObserver {
ob := &CollectionObserver{
dist: dist,
meta: meta,
targetMgr: targetMgr,
targetObserver: targetObserver,
checkerController: checherController,
partitionLoadedCount: make(map[int64]int),
loadTasks: typeutil.NewConcurrentMap[string, LoadTask](),
}
// Add load task for collection recovery
collections := meta.GetAllCollections()
for _, collection := range collections {
ob.LoadCollection(context.Background(), collection.GetCollectionID())
}
return ob
}
func (ob *CollectionObserver) Start() {
ctx, cancel := context.WithCancel(context.Background())
ob.cancel = cancel
observePeriod := Params.QueryCoordCfg.CollectionObserverInterval.GetAsDuration(time.Millisecond)
ob.wg.Add(1)
go func() {
defer ob.wg.Done()
ticker := time.NewTicker(observePeriod)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
log.Info("CollectionObserver stopped")
return
case <-ticker.C:
ob.Observe(ctx)
}
}
}()
}
func (ob *CollectionObserver) Stop() {
ob.stopOnce.Do(func() {
if ob.cancel != nil {
ob.cancel()
}
ob.wg.Wait()
})
}
func (ob *CollectionObserver) LoadCollection(ctx context.Context, collectionID int64) {
span := trace.SpanFromContext(ctx)
traceID := span.SpanContext().TraceID()
key := traceID.String()
if !traceID.IsValid() {
key = fmt.Sprintf("LoadCollection_%d", collectionID)
}
ob.loadTasks.Insert(key, LoadTask{LoadType: querypb.LoadType_LoadCollection, CollectionID: collectionID})
ob.checkerController.Check()
}
func (ob *CollectionObserver) LoadPartitions(ctx context.Context, collectionID int64, partitionIDs []int64) {
span := trace.SpanFromContext(ctx)
traceID := span.SpanContext().TraceID()
key := traceID.String()
if !traceID.IsValid() {
key = fmt.Sprintf("LoadPartition_%d_%v", collectionID, partitionIDs)
}
ob.loadTasks.Insert(key, LoadTask{LoadType: querypb.LoadType_LoadPartition, CollectionID: collectionID, PartitionIDs: partitionIDs})
ob.checkerController.Check()
}
func (ob *CollectionObserver) Observe(ctx context.Context) {
ob.observeTimeout()
ob.observeLoadStatus(ctx)
}
func (ob *CollectionObserver) observeTimeout() {
ob.loadTasks.Range(func(traceID string, task LoadTask) bool {
collection := ob.meta.CollectionManager.GetCollection(task.CollectionID)
// collection released
if collection == nil {
log.Info("Load Collection Task canceled, collection removed from meta", zap.Int64("collectionID", task.CollectionID), zap.String("traceID", traceID))
ob.loadTasks.Remove(traceID)
return true
}
switch task.LoadType {
case querypb.LoadType_LoadCollection:
if collection.GetStatus() == querypb.LoadStatus_Loading &&
time.Now().After(collection.UpdatedAt.Add(Params.QueryCoordCfg.LoadTimeoutSeconds.GetAsDuration(time.Second))) {
log.Info("load collection timeout, cancel it",
zap.Int64("collectionID", collection.GetCollectionID()),
zap.Duration("loadTime", time.Since(collection.CreatedAt)))
ob.meta.CollectionManager.RemoveCollection(collection.GetCollectionID())
ob.meta.ReplicaManager.RemoveCollection(collection.GetCollectionID())
ob.targetMgr.RemoveCollection(collection.GetCollectionID())
ob.loadTasks.Remove(traceID)
}
case querypb.LoadType_LoadPartition:
partitionIDs := typeutil.NewSet(task.PartitionIDs...)
partitions := ob.meta.GetPartitionsByCollection(task.CollectionID)
partitions = lo.Filter(partitions, func(partition *meta.Partition, _ int) bool {
return partitionIDs.Contain(partition.GetPartitionID())
})
// all partition released
if len(partitions) == 0 {
log.Info("Load Partitions Task canceled, collection removed from meta",
zap.Int64("collectionID", task.CollectionID),
zap.Int64s("partitionIDs", task.PartitionIDs),
zap.String("traceID", traceID))
ob.loadTasks.Remove(traceID)
return true
}
working := false
for _, partition := range partitions {
if time.Now().Before(partition.UpdatedAt.Add(Params.QueryCoordCfg.LoadTimeoutSeconds.GetAsDuration(time.Second))) {
working = true
break
}
}
// only all partitions timeout means task timeout
if !working {
log.Info("load partitions timeout, cancel it",
zap.Int64("collectionID", task.CollectionID),
zap.Int64s("partitionIDs", task.PartitionIDs))
for _, partition := range partitions {
ob.meta.CollectionManager.RemovePartition(partition.CollectionID, partition.GetPartitionID())
ob.targetMgr.RemovePartition(partition.GetCollectionID(), partition.GetPartitionID())
}
// all partition timeout, remove collection
if len(ob.meta.CollectionManager.GetPartitionsByCollection(task.CollectionID)) == 0 {
log.Info("collection timeout due to all partition removed", zap.Int64("collection", task.CollectionID))
ob.meta.CollectionManager.RemoveCollection(task.CollectionID)
ob.meta.ReplicaManager.RemoveCollection(task.CollectionID)
ob.targetMgr.RemoveCollection(task.CollectionID)
}
}
}
return true
})
}
func (ob *CollectionObserver) readyToObserve(collectionID int64) bool {
metaExist := (ob.meta.GetCollection(collectionID) != nil)
targetExist := ob.targetMgr.IsNextTargetExist(collectionID) || ob.targetMgr.IsCurrentTargetExist(collectionID, common.AllPartitionsID)
return metaExist && targetExist
}
func (ob *CollectionObserver) observeLoadStatus(ctx context.Context) {
loading := false
ob.loadTasks.Range(func(traceID string, task LoadTask) bool {
loading = true
collection := ob.meta.CollectionManager.GetCollection(task.CollectionID)
if collection == nil {
return true
}
var partitions []*meta.Partition
switch task.LoadType {
case querypb.LoadType_LoadCollection:
partitions = ob.meta.GetPartitionsByCollection(task.CollectionID)
case querypb.LoadType_LoadPartition:
partitionIDs := typeutil.NewSet[int64](task.PartitionIDs...)
partitions = ob.meta.GetPartitionsByCollection(task.CollectionID)
partitions = lo.Filter(partitions, func(partition *meta.Partition, _ int) bool {
return partitionIDs.Contain(partition.GetPartitionID())
})
}
loaded := true
for _, partition := range partitions {
if partition.LoadPercentage == 100 {
continue
}
if ob.readyToObserve(partition.CollectionID) {
replicaNum := ob.meta.GetReplicaNumber(partition.GetCollectionID())
ob.observePartitionLoadStatus(ctx, partition, replicaNum)
}
partition = ob.meta.GetPartition(partition.PartitionID)
if partition != nil && partition.LoadPercentage != 100 {
loaded = false
}
}
// all partition loaded, finish task
if len(partitions) > 0 && loaded {
log.Info("Load task finish",
zap.String("traceID", traceID),
zap.Int64("collectionID", task.CollectionID),
zap.Int64s("partitionIDs", task.PartitionIDs),
zap.Stringer("loadType", task.LoadType))
ob.loadTasks.Remove(traceID)
}
return true
})
// trigger check logic when loading collections/partitions
if loading {
ob.checkerController.Check()
}
}
func (ob *CollectionObserver) observePartitionLoadStatus(ctx context.Context, partition *meta.Partition, replicaNum int32) {
log := log.Ctx(ctx).WithRateGroup("qcv2.observePartitionLoadStatus", 1, 60).With(
zap.Int64("collectionID", partition.GetCollectionID()),
zap.Int64("partitionID", partition.GetPartitionID()),
)
segmentTargets := ob.targetMgr.GetSealedSegmentsByPartition(partition.GetCollectionID(), partition.GetPartitionID(), meta.NextTarget)
channelTargets := ob.targetMgr.GetDmChannelsByCollection(partition.GetCollectionID(), meta.NextTarget)
targetNum := len(segmentTargets) + len(channelTargets)
if targetNum == 0 {
log.Info("segments and channels in target are both empty, waiting for new target content")
return
}
log.RatedInfo(10, "partition targets",
zap.Int("segmentTargetNum", len(segmentTargets)),
zap.Int("channelTargetNum", len(channelTargets)),
zap.Int("totalTargetNum", targetNum),
zap.Int32("replicaNum", replicaNum),
)
loadedCount := 0
loadPercentage := int32(0)
for _, channel := range channelTargets {
views := ob.dist.LeaderViewManager.GetByFilter(meta.WithChannelName2LeaderView(channel.GetChannelName()))
nodes := lo.Map(views, func(v *meta.LeaderView, _ int) int64 { return v.ID })
group := utils.GroupNodesByReplica(ob.meta.ReplicaManager, partition.GetCollectionID(), nodes)
loadedCount += len(group)
}
subChannelCount := loadedCount
for _, segment := range segmentTargets {
views := ob.dist.LeaderViewManager.GetByFilter(meta.WithSegment2LeaderView(segment.GetID(), false))
nodes := lo.Map(views, func(view *meta.LeaderView, _ int) int64 { return view.ID })
group := utils.GroupNodesByReplica(ob.meta.ReplicaManager, partition.GetCollectionID(), nodes)
loadedCount += len(group)
}
if loadedCount > 0 {
log.Info("partition load progress",
zap.Int("subChannelCount", subChannelCount),
zap.Int("loadSegmentCount", loadedCount-subChannelCount))
}
loadPercentage = int32(loadedCount * 100 / (targetNum * int(replicaNum)))
if loadedCount <= ob.partitionLoadedCount[partition.GetPartitionID()] && loadPercentage != 100 {
ob.partitionLoadedCount[partition.GetPartitionID()] = loadedCount
return
}
ob.partitionLoadedCount[partition.GetPartitionID()] = loadedCount
if loadPercentage == 100 {
if !ob.targetObserver.Check(ctx, partition.GetCollectionID(), partition.PartitionID) {
log.Warn("failed to manual check current target, skip update load status")
return
}
delete(ob.partitionLoadedCount, partition.GetPartitionID())
}
collectionPercentage, err := ob.meta.CollectionManager.UpdateLoadPercent(partition.PartitionID, loadPercentage)
if err != nil {
log.Warn("failed to update load percentage")
}
log.Info("load status updated",
zap.Int32("partitionLoadPercentage", loadPercentage),
zap.Int32("collectionLoadPercentage", collectionPercentage),
)
eventlog.Record(eventlog.NewRawEvt(eventlog.Level_Info, fmt.Sprintf("collection %d load percentage update: %d", partition.CollectionID, loadPercentage)))
}