2022-10-11 11:39:22 +08:00
|
|
|
// Licensed to the LF AI & Data foundation under one
|
|
|
|
// or more contributor license agreements. See the NOTICE file
|
|
|
|
// distributed with this work for additional information
|
|
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
|
|
// to you under the Apache License, Version 2.0 (the
|
|
|
|
// "License"); you may not use this file except in compliance
|
|
|
|
// with the License. You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
2022-09-15 18:48:32 +08:00
|
|
|
package observers
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"go.uber.org/zap"
|
|
|
|
|
|
|
|
"github.com/milvus-io/milvus/internal/log"
|
|
|
|
"github.com/milvus-io/milvus/internal/metrics"
|
|
|
|
"github.com/milvus-io/milvus/internal/proto/querypb"
|
|
|
|
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
|
|
|
|
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
|
|
|
|
"github.com/milvus-io/milvus/internal/querycoordv2/utils"
|
|
|
|
)
|
|
|
|
|
|
|
|
type CollectionObserver struct {
|
|
|
|
stopCh chan struct{}
|
|
|
|
|
|
|
|
dist *meta.DistributionManager
|
|
|
|
meta *meta.Meta
|
|
|
|
targetMgr *meta.TargetManager
|
|
|
|
}
|
|
|
|
|
|
|
|
func NewCollectionObserver(
|
|
|
|
dist *meta.DistributionManager,
|
|
|
|
meta *meta.Meta,
|
|
|
|
targetMgr *meta.TargetManager,
|
|
|
|
) *CollectionObserver {
|
|
|
|
return &CollectionObserver{
|
|
|
|
stopCh: make(chan struct{}),
|
|
|
|
dist: dist,
|
|
|
|
meta: meta,
|
|
|
|
targetMgr: targetMgr,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ob *CollectionObserver) Start(ctx context.Context) {
|
|
|
|
const observePeriod = time.Second
|
|
|
|
go func() {
|
|
|
|
ticker := time.NewTicker(observePeriod)
|
|
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
log.Info("CollectionObserver stopped due to context canceled")
|
|
|
|
return
|
|
|
|
|
|
|
|
case <-ob.stopCh:
|
|
|
|
log.Info("CollectionObserver stopped")
|
|
|
|
return
|
|
|
|
|
|
|
|
case <-ticker.C:
|
|
|
|
ob.Observe()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ob *CollectionObserver) Stop() {
|
|
|
|
close(ob.stopCh)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ob *CollectionObserver) Observe() {
|
|
|
|
ob.observeTimeout()
|
|
|
|
ob.observeLoadStatus()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ob *CollectionObserver) observeTimeout() {
|
|
|
|
collections := ob.meta.CollectionManager.GetAllCollections()
|
|
|
|
for _, collection := range collections {
|
|
|
|
if collection.GetStatus() != querypb.LoadStatus_Loading ||
|
2022-09-30 19:48:55 +08:00
|
|
|
time.Now().Before(collection.UpdatedAt.Add(Params.QueryCoordCfg.LoadTimeoutSeconds)) {
|
2022-09-15 18:48:32 +08:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
log.Info("load collection timeout, cancel it",
|
|
|
|
zap.Int64("collectionID", collection.GetCollectionID()),
|
|
|
|
zap.Duration("loadTime", time.Since(collection.CreatedAt)))
|
|
|
|
ob.meta.CollectionManager.RemoveCollection(collection.GetCollectionID())
|
|
|
|
ob.meta.ReplicaManager.RemoveCollection(collection.GetCollectionID())
|
|
|
|
ob.targetMgr.RemoveCollection(collection.GetCollectionID())
|
|
|
|
}
|
|
|
|
|
|
|
|
partitions := utils.GroupPartitionsByCollection(
|
|
|
|
ob.meta.CollectionManager.GetAllPartitions())
|
|
|
|
if len(partitions) > 0 {
|
|
|
|
log.Info("observes partitions timeout", zap.Int("partitionNum", len(partitions)))
|
|
|
|
}
|
|
|
|
for collection, partitions := range partitions {
|
|
|
|
log := log.With(
|
|
|
|
zap.Int64("collectionID", collection),
|
|
|
|
)
|
|
|
|
for _, partition := range partitions {
|
|
|
|
if partition.GetStatus() != querypb.LoadStatus_Loading ||
|
|
|
|
time.Now().Before(partition.CreatedAt.Add(Params.QueryCoordCfg.LoadTimeoutSeconds)) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
log.Info("load partition timeout, cancel all partitions",
|
|
|
|
zap.Int64("partitionID", partition.GetPartitionID()),
|
|
|
|
zap.Duration("loadTime", time.Since(partition.CreatedAt)))
|
|
|
|
// TODO(yah01): Now, releasing part of partitions is not allowed
|
|
|
|
ob.meta.CollectionManager.RemoveCollection(partition.GetCollectionID())
|
|
|
|
ob.meta.ReplicaManager.RemoveCollection(partition.GetCollectionID())
|
|
|
|
ob.targetMgr.RemoveCollection(partition.GetCollectionID())
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ob *CollectionObserver) observeLoadStatus() {
|
|
|
|
collections := ob.meta.CollectionManager.GetAllCollections()
|
|
|
|
for _, collection := range collections {
|
|
|
|
if collection.LoadPercentage == 100 {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
ob.observeCollectionLoadStatus(collection)
|
|
|
|
}
|
|
|
|
|
|
|
|
partitions := ob.meta.CollectionManager.GetAllPartitions()
|
|
|
|
if len(partitions) > 0 {
|
|
|
|
log.Info("observe partitions status", zap.Int("partitionNum", len(partitions)))
|
|
|
|
}
|
|
|
|
for _, partition := range partitions {
|
|
|
|
if partition.LoadPercentage == 100 {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
ob.observePartitionLoadStatus(partition)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ob *CollectionObserver) observeCollectionLoadStatus(collection *meta.Collection) {
|
|
|
|
log := log.With(zap.Int64("collectionID", collection.GetCollectionID()))
|
|
|
|
|
|
|
|
segmentTargets := ob.targetMgr.GetSegmentsByCollection(collection.GetCollectionID())
|
|
|
|
channelTargets := ob.targetMgr.GetDmChannelsByCollection(collection.GetCollectionID())
|
|
|
|
targetNum := len(segmentTargets) + len(channelTargets)
|
|
|
|
log.Info("collection targets",
|
|
|
|
zap.Int("segment-target-num", len(segmentTargets)),
|
|
|
|
zap.Int("channel-target-num", len(channelTargets)),
|
|
|
|
zap.Int("total-target-num", targetNum))
|
|
|
|
if targetNum == 0 {
|
|
|
|
log.Info("collection released, skip it")
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
loadedCount := 0
|
|
|
|
for _, channel := range channelTargets {
|
|
|
|
group := utils.GroupNodesByReplica(ob.meta.ReplicaManager,
|
|
|
|
collection.GetCollectionID(),
|
|
|
|
ob.dist.LeaderViewManager.GetChannelDist(channel.GetChannelName()))
|
|
|
|
if len(group) >= int(collection.GetReplicaNumber()) {
|
|
|
|
loadedCount++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
subChannelCount := loadedCount
|
|
|
|
for _, segment := range segmentTargets {
|
|
|
|
group := utils.GroupNodesByReplica(ob.meta.ReplicaManager,
|
|
|
|
collection.GetCollectionID(),
|
|
|
|
ob.dist.LeaderViewManager.GetSealedSegmentDist(segment.GetID()))
|
|
|
|
if len(group) >= int(collection.GetReplicaNumber()) {
|
|
|
|
loadedCount++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if loadedCount > 0 {
|
|
|
|
log.Info("collection load progress",
|
|
|
|
zap.Int("sub-channel-count", subChannelCount),
|
|
|
|
zap.Int("load-segment-count", loadedCount-subChannelCount),
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
|
|
|
updated := collection.Clone()
|
|
|
|
updated.LoadPercentage = int32(loadedCount * 100 / targetNum)
|
2022-09-30 19:48:55 +08:00
|
|
|
if updated.LoadPercentage <= collection.LoadPercentage {
|
|
|
|
return
|
|
|
|
}
|
2022-09-15 18:48:32 +08:00
|
|
|
if loadedCount >= len(segmentTargets)+len(channelTargets) {
|
|
|
|
updated.Status = querypb.LoadStatus_Loaded
|
|
|
|
ob.meta.CollectionManager.UpdateCollection(updated)
|
|
|
|
|
|
|
|
elapsed := time.Since(updated.CreatedAt)
|
|
|
|
metrics.QueryCoordLoadLatency.WithLabelValues().Observe(float64(elapsed.Milliseconds()))
|
|
|
|
} else {
|
|
|
|
ob.meta.CollectionManager.UpdateCollectionInMemory(updated)
|
|
|
|
}
|
2022-09-30 19:48:55 +08:00
|
|
|
log.Info("collection load status updated",
|
|
|
|
zap.Int32("loadPercentage", updated.LoadPercentage),
|
|
|
|
zap.Int32("collectionStatus", int32(updated.GetStatus())))
|
2022-09-15 18:48:32 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
func (ob *CollectionObserver) observePartitionLoadStatus(partition *meta.Partition) {
|
|
|
|
log := log.With(
|
|
|
|
zap.Int64("collectionID", partition.GetCollectionID()),
|
|
|
|
zap.Int64("partitionID", partition.GetPartitionID()),
|
|
|
|
)
|
|
|
|
|
|
|
|
segmentTargets := ob.targetMgr.GetSegmentsByCollection(partition.GetCollectionID(), partition.GetPartitionID())
|
|
|
|
channelTargets := ob.targetMgr.GetDmChannelsByCollection(partition.GetCollectionID())
|
|
|
|
targetNum := len(segmentTargets) + len(channelTargets)
|
|
|
|
log.Info("partition targets",
|
|
|
|
zap.Int("segment-target-num", len(segmentTargets)),
|
|
|
|
zap.Int("channel-target-num", len(channelTargets)),
|
|
|
|
zap.Int("total-target-num", targetNum))
|
|
|
|
if targetNum == 0 {
|
|
|
|
log.Info("partition released, skip it")
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
loadedCount := 0
|
|
|
|
for _, channel := range channelTargets {
|
|
|
|
group := utils.GroupNodesByReplica(ob.meta.ReplicaManager,
|
|
|
|
partition.GetCollectionID(),
|
|
|
|
ob.dist.LeaderViewManager.GetChannelDist(channel.GetChannelName()))
|
|
|
|
if len(group) >= int(partition.GetReplicaNumber()) {
|
|
|
|
loadedCount++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
subChannelCount := loadedCount
|
|
|
|
for _, segment := range segmentTargets {
|
|
|
|
group := utils.GroupNodesByReplica(ob.meta.ReplicaManager,
|
|
|
|
partition.GetCollectionID(),
|
|
|
|
ob.dist.LeaderViewManager.GetSealedSegmentDist(segment.GetID()))
|
|
|
|
if len(group) >= int(partition.GetReplicaNumber()) {
|
|
|
|
loadedCount++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if loadedCount > 0 {
|
|
|
|
log.Info("partition load progress",
|
|
|
|
zap.Int("sub-channel-count", subChannelCount),
|
|
|
|
zap.Int("load-segment-count", loadedCount-subChannelCount))
|
|
|
|
}
|
|
|
|
|
2022-09-30 19:48:55 +08:00
|
|
|
updated := partition.Clone()
|
|
|
|
updated.LoadPercentage = int32(loadedCount * 100 / targetNum)
|
|
|
|
if updated.LoadPercentage <= partition.LoadPercentage {
|
|
|
|
return
|
|
|
|
}
|
2022-09-15 18:48:32 +08:00
|
|
|
if loadedCount >= len(segmentTargets)+len(channelTargets) {
|
2022-09-30 19:48:55 +08:00
|
|
|
updated.Status = querypb.LoadStatus_Loaded
|
|
|
|
ob.meta.CollectionManager.UpdatePartition(updated)
|
2022-09-15 18:48:32 +08:00
|
|
|
|
2022-09-30 19:48:55 +08:00
|
|
|
elapsed := time.Since(updated.CreatedAt)
|
2022-09-15 18:48:32 +08:00
|
|
|
metrics.QueryCoordLoadLatency.WithLabelValues().Observe(float64(elapsed.Milliseconds()))
|
|
|
|
} else {
|
2022-09-30 19:48:55 +08:00
|
|
|
ob.meta.CollectionManager.UpdatePartitionInMemory(updated)
|
2022-09-15 18:48:32 +08:00
|
|
|
}
|
|
|
|
log.Info("partition load status updated",
|
2022-09-30 19:48:55 +08:00
|
|
|
zap.Int32("loadPercentage", updated.LoadPercentage),
|
|
|
|
zap.Int32("partitionStatus", int32(updated.GetStatus())))
|
2022-09-15 18:48:32 +08:00
|
|
|
}
|