milvus/internal/indexnode/task_test.go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package indexnode

import (
	"context"
	"testing"

	"github.com/apache/arrow/go/v12/arrow"
	"github.com/apache/arrow/go/v12/arrow/array"
	"github.com/apache/arrow/go/v12/arrow/memory"
	"github.com/stretchr/testify/suite"

	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	milvus_storage "github.com/milvus-io/milvus-storage/go/storage"
	"github.com/milvus-io/milvus-storage/go/storage/options"
	"github.com/milvus-io/milvus-storage/go/storage/schema"
	"github.com/milvus-io/milvus/internal/proto/etcdpb"
	"github.com/milvus-io/milvus/internal/proto/indexpb"
	"github.com/milvus-io/milvus/internal/storage"
	"github.com/milvus-io/milvus/internal/util/dependency"
	"github.com/milvus-io/milvus/internal/util/typeutil"
	"github.com/milvus-io/milvus/pkg/common"
	"github.com/milvus-io/milvus/pkg/util/metautil"
	"github.com/milvus-io/milvus/pkg/util/metric"
	"github.com/milvus-io/milvus/pkg/util/paramtable"
	"github.com/milvus-io/milvus/pkg/util/timerecord"
)

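// IndexBuildTaskSuite covers the storage-v1 index build path: it serializes
// insert data into binlog blobs, writes them through a local chunk manager,
// and drives an index build task end to end.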
type IndexBuildTaskSuite struct {
	suite.Suite
	schema       *schemapb.CollectionSchema
	collectionID int64
	partitionID  int64
	segmentID    int64
	dataPath     string
	numRows      int
	dim          int
}

func (suite *IndexBuildTaskSuite) SetupSuite() {
	paramtable.Init()
	suite.collectionID = 1000
	suite.partitionID = 1001
	suite.segmentID = 1002
	suite.dataPath = "/tmp/milvus/data/1000/1001/1002/3/1"
	suite.numRows = 100
	suite.dim = 128
}

func (suite *IndexBuildTaskSuite) SetupTest() {
	suite.schema = &schemapb.CollectionSchema{
		Name:        "test",
		Description: "test",
		AutoID:      false,
		Fields: []*schemapb.FieldSchema{
			{FieldID: common.RowIDField, Name: common.RowIDFieldName, DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
			{FieldID: common.TimeStampField, Name: common.TimeStampFieldName, DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
			{FieldID: 100, Name: "pk", DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
			{FieldID: 101, Name: "ts", DataType: schemapb.DataType_Int64},
			{FieldID: 102, Name: "vec", DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "128"}}},
		},
	}
}

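// serializeData encodes the generated rows into insert binlog blobs, the
// on-disk format the index build task reads back through the chunk manager.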
func (suite *IndexBuildTaskSuite) serializeData() ([]*storage.Blob, error) {
	insertCodec := storage.NewInsertCodecWithSchema(&etcdpb.CollectionMeta{
		Schema: suite.schema,
	})
	return insertCodec.Serialize(suite.partitionID, suite.segmentID, &storage.InsertData{
		Data: map[storage.FieldID]storage.FieldData{
			0:   &storage.Int64FieldData{Data: generateLongs(suite.numRows)},
			1:   &storage.Int64FieldData{Data: generateLongs(suite.numRows)},
			100: &storage.Int64FieldData{Data: generateLongs(suite.numRows)},
			101: &storage.Int64FieldData{Data: generateLongs(suite.numRows)},
			102: &storage.FloatVectorFieldData{Data: generateFloats(suite.numRows * suite.dim), Dim: suite.dim},
		},
		Infos: []storage.BlobInfo{{Length: suite.numRows}},
	})
}

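// TestBuildMemoryIndex writes the serialized binlog to the expected data path,
// then runs the task through its full PreExecute/Execute/PostExecute lifecycle.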
func (suite *IndexBuildTaskSuite) TestBuildMemoryIndex() {
	ctx, cancel := context.WithCancel(context.Background())
	req := &indexpb.CreateJobRequest{
		BuildID:      1,
		IndexVersion: 1,
		DataPaths:    []string{suite.dataPath},
		IndexID:      0,
		IndexName:    "",
		IndexParams:  []*commonpb.KeyValuePair{{Key: common.IndexTypeKey, Value: "FLAT"}, {Key: common.MetricTypeKey, Value: metric.L2}},
		TypeParams:   []*commonpb.KeyValuePair{{Key: "dim", Value: "128"}},
		NumRows:      int64(suite.numRows),
		StorageConfig: &indexpb.StorageConfig{
			RootPath:    "/tmp/milvus/data",
			StorageType: "local",
		},
		CollectionID: 1,
		PartitionID:  2,
		SegmentID:    3,
		FieldID:      102,
		FieldName:    "vec",
		FieldType:    schemapb.DataType_FloatVector,
	}

	cm, err := NewChunkMgrFactory().NewChunkManager(ctx, req.GetStorageConfig())
	suite.NoError(err)

	// Stage the serialized insert binlog where the task expects to read it.
	blobs, err := suite.serializeData()
	suite.NoError(err)
	err = cm.Write(ctx, suite.dataPath, blobs[0].Value)
	suite.NoError(err)

	t := newIndexBuildTask(ctx, cancel, req, cm, NewIndexNode(context.Background(), dependency.NewDefaultFactory(true)))

	err = t.PreExecute(context.Background())
	suite.NoError(err)
	err = t.Execute(context.Background())
	suite.NoError(err)
	err = t.PostExecute(context.Background())
	suite.NoError(err)
}

func TestIndexBuildTask(t *testing.T) {
	suite.Run(t, new(IndexBuildTaskSuite))
}

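// IndexBuildTaskV2Suite covers the storage-v2 path: data is written through a
// milvus-storage Space instead of binlog files, and the index is built by
// newIndexBuildTaskV2 against a specific space version.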
type IndexBuildTaskV2Suite struct {
	suite.Suite
	schema      *schemapb.CollectionSchema
	arrowSchema *arrow.Schema
	space       *milvus_storage.Space
}

func (suite *IndexBuildTaskV2Suite) SetupSuite() {
	paramtable.Init()
}

func (suite *IndexBuildTaskV2Suite) SetupTest() {
	suite.schema = &schemapb.CollectionSchema{
		Name:        "test",
		Description: "test",
		AutoID:      false,
		Fields: []*schemapb.FieldSchema{
			{FieldID: 1, Name: "pk", DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
			{FieldID: 2, Name: "ts", DataType: schemapb.DataType_Int64},
			{FieldID: 3, Name: "vec", DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "1"}}},
		},
	}

	var err error
	suite.arrowSchema, err = typeutil.ConvertToArrowSchema(suite.schema.Fields)
	suite.NoError(err)

	tmpDir := suite.T().TempDir()
	opt := options.NewSpaceOptionBuilder().
		SetSchema(schema.NewSchema(
			suite.arrowSchema,
			&schema.SchemaOptions{
				PrimaryColumn: "pk",
				VectorColumn:  "vec",
				VersionColumn: "ts",
			})).
		Build()
	suite.space, err = milvus_storage.Open("file://"+tmpDir, opt)
	suite.NoError(err)

	b := array.NewRecordBuilder(memory.DefaultAllocator, suite.arrowSchema)
	defer b.Release()
	b.Field(0).(*array.Int64Builder).AppendValues([]int64{1}, nil)
	b.Field(1).(*array.Int64Builder).AppendValues([]int64{1}, nil)
	// One dim-1 float32 vector occupies 4 bytes in the fixed-size binary column.
	fb := b.Field(2).(*array.FixedSizeBinaryBuilder)
	fb.Reserve(1)
	fb.Append([]byte{1, 2, 3, 4})

	rec := b.NewRecord()
	defer rec.Release()
	reader, err := array.NewRecordReader(suite.arrowSchema, []arrow.Record{rec})
	suite.NoError(err)
	err = suite.space.Write(reader, &options.DefaultWriteOptions)
	suite.NoError(err)
}

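// TestBuildIndex builds a FLAT index from the space written in SetupTest,
// pinning the read to the space's current version.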
func (suite *IndexBuildTaskV2Suite) TestBuildIndex() {
	req := &indexpb.CreateJobRequest{
		BuildID:      1,
		IndexVersion: 1,
		IndexID:      0,
		IndexName:    "",
		IndexParams:  []*commonpb.KeyValuePair{{Key: common.IndexTypeKey, Value: "FLAT"}, {Key: common.MetricTypeKey, Value: metric.L2}, {Key: common.DimKey, Value: "1"}},
		TypeParams:   []*commonpb.KeyValuePair{{Key: "dim", Value: "1"}},
		NumRows:      10,
		StorageConfig: &indexpb.StorageConfig{
			RootPath:    "/tmp/milvus/data",
			StorageType: "local",
		},
		CollectionID:   1,
		PartitionID:    1,
		SegmentID:      1,
		FieldID:        3,
		FieldName:      "vec",
		FieldType:      schemapb.DataType_FloatVector,
		StorePath:      "file://" + suite.space.Path(),
		StoreVersion:   suite.space.GetCurrentVersion(),
		IndexStorePath: "file://" + suite.space.Path(),
		Dim:            4,
		OptionalScalarFields: []*indexpb.OptionalFieldInfo{
			{FieldID: 1, FieldName: "pk", FieldType: 5, DataIds: []int64{0}},
		},
	}

	task := newIndexBuildTaskV2(context.Background(), nil, req, NewIndexNode(context.Background(), dependency.NewDefaultFactory(true)))

	var err error
	err = task.PreExecute(context.Background())
	suite.NoError(err)
	err = task.Execute(context.Background())
	suite.NoError(err)
	err = task.PostExecute(context.Background())
	suite.NoError(err)
}

func TestIndexBuildTaskV2Suite(t *testing.T) {
	suite.Run(t, new(IndexBuildTaskV2Suite))
}

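// AnalyzeTaskSuite covers the data-analyze task: it stages an insert binlog
// under the standard metautil insert-log path and verifies that the task's
// PreExecute accepts the request.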
type AnalyzeTaskSuite struct {
	suite.Suite
	schema       *schemapb.CollectionSchema
	collectionID int64
	partitionID  int64
	segmentID    int64
	fieldID      int64
	taskID       int64
}

func (suite *AnalyzeTaskSuite) SetupSuite() {
	paramtable.Init()
	suite.collectionID = 1000
	suite.partitionID = 1001
	suite.segmentID = 1002
	suite.fieldID = 102
	suite.taskID = 1004
}

func (suite *AnalyzeTaskSuite) SetupTest() {
	suite.schema = &schemapb.CollectionSchema{
		Name:        "test",
		Description: "test",
		AutoID:      false,
		Fields: []*schemapb.FieldSchema{
			{FieldID: common.RowIDField, Name: common.RowIDFieldName, DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
			{FieldID: common.TimeStampField, Name: common.TimeStampFieldName, DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
			{FieldID: 100, Name: "pk", DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
			{FieldID: 101, Name: "ts", DataType: schemapb.DataType_Int64},
			{FieldID: 102, Name: "vec", DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "1"}}},
		},
	}
}

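// serializeData encodes three fixed rows (dim-1 float vectors) into insert
// binlog blobs for the analyze task to read.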
func (suite *AnalyzeTaskSuite) serializeData() ([]*storage.Blob, error) {
	insertCodec := storage.NewInsertCodecWithSchema(&etcdpb.CollectionMeta{
		Schema: suite.schema,
	})
	return insertCodec.Serialize(suite.partitionID, suite.segmentID, &storage.InsertData{
		Data: map[storage.FieldID]storage.FieldData{
			0:   &storage.Int64FieldData{Data: []int64{0, 1, 2}},
			1:   &storage.Int64FieldData{Data: []int64{1, 2, 3}},
			100: &storage.Int64FieldData{Data: []int64{0, 1, 2}},
			101: &storage.Int64FieldData{Data: []int64{0, 1, 2}},
			102: &storage.FloatVectorFieldData{Data: []float32{1, 2, 3}, Dim: 1},
		},
		Infos: []storage.BlobInfo{{Length: 3}},
	})
}

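// TestAnalyze stages the insert log at the path the task derives from the
// request's segment stats, then checks that PreExecute succeeds.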
func (suite *AnalyzeTaskSuite) TestAnalyze() {
	ctx, cancel := context.WithCancel(context.Background())
	req := &indexpb.AnalyzeRequest{
		ClusterID:    "test",
		TaskID:       1,
		CollectionID: suite.collectionID,
		PartitionID:  suite.partitionID,
		FieldID:      suite.fieldID,
		FieldName:    "vec",
		FieldType:    schemapb.DataType_FloatVector,
		SegmentStats: map[int64]*indexpb.SegmentStats{
			suite.segmentID: {
				ID:      suite.segmentID,
				NumRows: 1024,
				LogIDs:  []int64{1},
			},
		},
		Version: 1,
		StorageConfig: &indexpb.StorageConfig{
			RootPath:    "/tmp/milvus/data",
			StorageType: "local",
		},
		Dim: 1,
	}

	cm, err := NewChunkMgrFactory().NewChunkManager(ctx, req.GetStorageConfig())
	suite.NoError(err)

	blobs, err := suite.serializeData()
	suite.NoError(err)
	// Write the blob to the canonical insert-log path for (collection, partition, segment, field, logID=1).
	dataPath := metautil.BuildInsertLogPath(cm.RootPath(), suite.collectionID, suite.partitionID, suite.segmentID,
		suite.fieldID, 1)
	err = cm.Write(ctx, dataPath, blobs[0].Value)
	suite.NoError(err)

	t := &analyzeTask{
		ident:    "",
		cancel:   cancel,
		ctx:      ctx,
		req:      req,
		tr:       timerecord.NewTimeRecorder("test-analyzeTask"),
		queueDur: 0,
		node:     NewIndexNode(context.Background(), dependency.NewDefaultFactory(true)),
	}

	err = t.PreExecute(context.Background())
	suite.NoError(err)
}

func TestAnalyzeTaskSuite(t *testing.T) {
	suite.Run(t, new(AnalyzeTaskSuite))
}