mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-12-02 11:59:00 +08:00
2c9bb4dfa3
issue: #33744 This PR includes the following changes: 1. Added a new task type to the task scheduler in datacoord: stats task, which sorts segments by primary key. 2. Implemented segment sorting in indexnode. 3. Added a new field `FieldStatsLog` to SegmentInfo to store token index information. --------- Signed-off-by: Cai Zhang <cai.zhang@zilliz.com>
238 lines
7.8 KiB
Go
238 lines
7.8 KiB
Go
// Licensed to the LF AI & Data foundation under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package indexnode
|
|
|
|
import (
|
|
"context"
|
|
"testing"
|
|
|
|
"github.com/stretchr/testify/suite"
|
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
|
"github.com/milvus-io/milvus/internal/proto/etcdpb"
|
|
"github.com/milvus-io/milvus/internal/proto/indexpb"
|
|
"github.com/milvus-io/milvus/internal/proto/workerpb"
|
|
"github.com/milvus-io/milvus/internal/storage"
|
|
"github.com/milvus-io/milvus/internal/util/dependency"
|
|
"github.com/milvus-io/milvus/pkg/common"
|
|
"github.com/milvus-io/milvus/pkg/util/metautil"
|
|
"github.com/milvus-io/milvus/pkg/util/metric"
|
|
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
|
"github.com/milvus-io/milvus/pkg/util/timerecord"
|
|
)
|
|
|
|
type IndexBuildTaskSuite struct {
|
|
suite.Suite
|
|
schema *schemapb.CollectionSchema
|
|
collectionID int64
|
|
partitionID int64
|
|
segmentID int64
|
|
dataPath string
|
|
|
|
numRows int
|
|
dim int
|
|
}
|
|
|
|
func (suite *IndexBuildTaskSuite) SetupSuite() {
|
|
paramtable.Init()
|
|
suite.collectionID = 1000
|
|
suite.partitionID = 1001
|
|
suite.segmentID = 1002
|
|
suite.dataPath = "/tmp/milvus/data/1000/1001/1002/3/1"
|
|
suite.numRows = 100
|
|
suite.dim = 128
|
|
}
|
|
|
|
func (suite *IndexBuildTaskSuite) SetupTest() {
|
|
suite.schema = &schemapb.CollectionSchema{
|
|
Name: "test",
|
|
Description: "test",
|
|
AutoID: false,
|
|
Fields: []*schemapb.FieldSchema{
|
|
{FieldID: common.RowIDField, Name: common.RowIDFieldName, DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
|
|
{FieldID: common.TimeStampField, Name: common.TimeStampFieldName, DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
|
|
{FieldID: 100, Name: "pk", DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
|
|
{FieldID: 101, Name: "ts", DataType: schemapb.DataType_Int64},
|
|
{FieldID: 102, Name: "vec", DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "128"}}},
|
|
},
|
|
}
|
|
}
|
|
|
|
func (suite *IndexBuildTaskSuite) serializeData() ([]*storage.Blob, error) {
|
|
insertCodec := storage.NewInsertCodecWithSchema(&etcdpb.CollectionMeta{
|
|
Schema: suite.schema,
|
|
})
|
|
return insertCodec.Serialize(suite.partitionID, suite.segmentID, &storage.InsertData{
|
|
Data: map[storage.FieldID]storage.FieldData{
|
|
0: &storage.Int64FieldData{Data: generateLongs(suite.numRows)},
|
|
1: &storage.Int64FieldData{Data: generateLongs(suite.numRows)},
|
|
100: &storage.Int64FieldData{Data: generateLongs(suite.numRows)},
|
|
101: &storage.Int64FieldData{Data: generateLongs(suite.numRows)},
|
|
102: &storage.FloatVectorFieldData{Data: generateFloats(suite.numRows * suite.dim), Dim: suite.dim},
|
|
},
|
|
Infos: []storage.BlobInfo{{Length: suite.numRows}},
|
|
})
|
|
}
|
|
|
|
func (suite *IndexBuildTaskSuite) TestBuildMemoryIndex() {
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
req := &workerpb.CreateJobRequest{
|
|
BuildID: 1,
|
|
IndexVersion: 1,
|
|
DataPaths: []string{suite.dataPath},
|
|
IndexID: 0,
|
|
IndexName: "",
|
|
IndexParams: []*commonpb.KeyValuePair{{Key: common.IndexTypeKey, Value: "FLAT"}, {Key: common.MetricTypeKey, Value: metric.L2}},
|
|
TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "128"}},
|
|
NumRows: int64(suite.numRows),
|
|
StorageConfig: &indexpb.StorageConfig{
|
|
RootPath: "/tmp/milvus/data",
|
|
StorageType: "local",
|
|
},
|
|
CollectionID: 1,
|
|
PartitionID: 2,
|
|
SegmentID: 3,
|
|
FieldID: 102,
|
|
FieldName: "vec",
|
|
FieldType: schemapb.DataType_FloatVector,
|
|
}
|
|
|
|
cm, err := NewChunkMgrFactory().NewChunkManager(ctx, req.GetStorageConfig())
|
|
suite.NoError(err)
|
|
blobs, err := suite.serializeData()
|
|
suite.NoError(err)
|
|
err = cm.Write(ctx, suite.dataPath, blobs[0].Value)
|
|
suite.NoError(err)
|
|
|
|
t := newIndexBuildTask(ctx, cancel, req, cm, NewIndexNode(context.Background(), dependency.NewDefaultFactory(true)))
|
|
|
|
err = t.PreExecute(context.Background())
|
|
suite.NoError(err)
|
|
err = t.Execute(context.Background())
|
|
suite.NoError(err)
|
|
err = t.PostExecute(context.Background())
|
|
suite.NoError(err)
|
|
}
|
|
|
|
func TestIndexBuildTask(t *testing.T) {
|
|
suite.Run(t, new(IndexBuildTaskSuite))
|
|
}
|
|
|
|
type AnalyzeTaskSuite struct {
|
|
suite.Suite
|
|
schema *schemapb.CollectionSchema
|
|
collectionID int64
|
|
partitionID int64
|
|
segmentID int64
|
|
fieldID int64
|
|
taskID int64
|
|
}
|
|
|
|
func (suite *AnalyzeTaskSuite) SetupSuite() {
|
|
paramtable.Init()
|
|
suite.collectionID = 1000
|
|
suite.partitionID = 1001
|
|
suite.segmentID = 1002
|
|
suite.fieldID = 102
|
|
suite.taskID = 1004
|
|
}
|
|
|
|
func (suite *AnalyzeTaskSuite) SetupTest() {
|
|
suite.schema = &schemapb.CollectionSchema{
|
|
Name: "test",
|
|
Description: "test",
|
|
AutoID: false,
|
|
Fields: []*schemapb.FieldSchema{
|
|
{FieldID: common.RowIDField, Name: common.RowIDFieldName, DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
|
|
{FieldID: common.TimeStampField, Name: common.TimeStampFieldName, DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
|
|
{FieldID: 100, Name: "pk", DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
|
|
{FieldID: 101, Name: "ts", DataType: schemapb.DataType_Int64},
|
|
{FieldID: 102, Name: "vec", DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "1"}}},
|
|
},
|
|
}
|
|
}
|
|
|
|
func (suite *AnalyzeTaskSuite) serializeData() ([]*storage.Blob, error) {
|
|
insertCodec := storage.NewInsertCodecWithSchema(&etcdpb.CollectionMeta{
|
|
Schema: suite.schema,
|
|
})
|
|
return insertCodec.Serialize(suite.partitionID, suite.segmentID, &storage.InsertData{
|
|
Data: map[storage.FieldID]storage.FieldData{
|
|
0: &storage.Int64FieldData{Data: []int64{0, 1, 2}},
|
|
1: &storage.Int64FieldData{Data: []int64{1, 2, 3}},
|
|
100: &storage.Int64FieldData{Data: []int64{0, 1, 2}},
|
|
101: &storage.Int64FieldData{Data: []int64{0, 1, 2}},
|
|
102: &storage.FloatVectorFieldData{Data: []float32{1, 2, 3}, Dim: 1},
|
|
},
|
|
Infos: []storage.BlobInfo{{Length: 3}},
|
|
})
|
|
}
|
|
|
|
func (suite *AnalyzeTaskSuite) TestAnalyze() {
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
req := &workerpb.AnalyzeRequest{
|
|
ClusterID: "test",
|
|
TaskID: 1,
|
|
CollectionID: suite.collectionID,
|
|
PartitionID: suite.partitionID,
|
|
FieldID: suite.fieldID,
|
|
FieldName: "vec",
|
|
FieldType: schemapb.DataType_FloatVector,
|
|
SegmentStats: map[int64]*indexpb.SegmentStats{
|
|
suite.segmentID: {
|
|
ID: suite.segmentID,
|
|
NumRows: 1024,
|
|
LogIDs: []int64{1},
|
|
},
|
|
},
|
|
Version: 1,
|
|
StorageConfig: &indexpb.StorageConfig{
|
|
RootPath: "/tmp/milvus/data",
|
|
StorageType: "local",
|
|
},
|
|
Dim: 1,
|
|
}
|
|
|
|
cm, err := NewChunkMgrFactory().NewChunkManager(ctx, req.GetStorageConfig())
|
|
suite.NoError(err)
|
|
blobs, err := suite.serializeData()
|
|
suite.NoError(err)
|
|
dataPath := metautil.BuildInsertLogPath(cm.RootPath(), suite.collectionID, suite.partitionID, suite.segmentID,
|
|
suite.fieldID, 1)
|
|
|
|
err = cm.Write(ctx, dataPath, blobs[0].Value)
|
|
suite.NoError(err)
|
|
|
|
t := &analyzeTask{
|
|
ident: "",
|
|
cancel: cancel,
|
|
ctx: ctx,
|
|
req: req,
|
|
tr: timerecord.NewTimeRecorder("test-indexBuildTask"),
|
|
queueDur: 0,
|
|
node: NewIndexNode(context.Background(), dependency.NewDefaultFactory(true)),
|
|
}
|
|
|
|
err = t.PreExecute(context.Background())
|
|
suite.NoError(err)
|
|
}
|
|
|
|
func TestAnalyzeTaskSuite(t *testing.T) {
|
|
suite.Run(t, new(AnalyzeTaskSuite))
|
|
}
|