milvus/internal/storage/serde_test.go
Ted Xu dc5ea6f17c
feat: adding binlog streaming writer (#31537)
See #31679

---------

Signed-off-by: Ted Xu <ted.xu@zilliz.com>
2024-04-11 10:33:20 +08:00

276 lines
8.9 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package storage
import (
"bytes"
"context"
"io"
"reflect"
"testing"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/parquet/file"
"github.com/apache/arrow/go/v12/parquet/pqarrow"
"github.com/stretchr/testify/assert"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/common"
)
// TestBinlogDeserializeReader checks NewBinlogDeserializeReader against a nil
// blob list and against blobs produced by generateTestData.
func TestBinlogDeserializeReader(t *testing.T) {
	t.Run("test empty data", func(t *testing.T) {
		// A reader built from nil blobs must report io.EOF on the first Next.
		emptyReader, err := NewBinlogDeserializeReader(nil, common.RowIDField)
		assert.NoError(t, err)
		defer emptyReader.Close()

		assert.Equal(t, io.EOF, emptyReader.Next())

		// Disabled variant that builds the reader from zero generated rows:
		// blobs := generateTestData(t, 0)
		// reader, err = NewBinlogDeserializeReader(blobs, common.RowIDField)
		// assert.NoError(t, err)
		// err = reader.Next()
		// assert.Equal(t, io.EOF, err)
	})

	t.Run("test deserialize", func(t *testing.T) {
		rowCount := 3
		data, err := generateTestData(rowCount)
		assert.NoError(t, err)

		dataReader, err := NewBinlogDeserializeReader(data, common.RowIDField)
		assert.NoError(t, err)
		defer dataReader.Close()

		// Rows are validated with 1-based ordinals, as assertTestData is
		// called with 1..rowCount.
		for ordinal := 1; ordinal <= rowCount; ordinal++ {
			assert.NoError(t, dataReader.Next())
			assertTestData(t, ordinal, dataReader.Value())
		}

		// One more Next after the last row must signal the end of the stream.
		assert.Equal(t, io.EOF, dataReader.Next())
	})
}
// TestBinlogStreamWriter writes a single boolean column through the
// single-field record writer into an in-memory buffer, then reads the buffer
// back as Parquet and checks that exactly one batch with the written number
// of rows comes out.
func TestBinlogStreamWriter(t *testing.T) {
	t.Run("test write", func(t *testing.T) {
		// The row count is derived from the values so the two cannot drift.
		values := []bool{true, false, true}
		size := len(values)

		field := arrow.Field{Name: "bool", Type: arrow.FixedWidthTypes.Boolean}
		var w bytes.Buffer
		rw, err := newSingleFieldRecordWriter(1, field, &w)
		// Stop early on failure: rw would be unusable and the calls below
		// would panic instead of reporting the real error.
		if !assert.NoError(t, err) {
			return
		}

		builder := array.NewBooleanBuilder(memory.DefaultAllocator)
		// Builders are reference counted as well; release it once done.
		defer builder.Release()
		builder.AppendValues(values, nil)
		arr := builder.NewArray()
		defer arr.Release()

		ar := array.NewRecord(
			arrow.NewSchema(
				[]arrow.Field{field},
				nil,
			),
			[]arrow.Array{arr},
			int64(size),
		)
		r := newSimpleArrowRecord(ar, map[FieldID]schemapb.DataType{1: schemapb.DataType_Bool}, map[FieldID]int{1: 0})
		defer r.Release()

		err = rw.Write(r)
		assert.NoError(t, err)
		rw.Close()

		// Read the bytes back through the Parquet/Arrow readers.
		reader, err := file.NewParquetReader(bytes.NewReader(w.Bytes()))
		if !assert.NoError(t, err) {
			return
		}
		arrowReader, err := pqarrow.NewFileReader(reader, pqarrow.ArrowReadProperties{BatchSize: 1024}, memory.DefaultAllocator)
		if !assert.NoError(t, err) {
			return
		}
		rr, err := arrowReader.GetRecordReader(context.Background(), nil, nil)
		if !assert.NoError(t, err) {
			return
		}
		defer rr.Release()

		if !assert.True(t, rr.Next()) {
			return
		}
		// The current record is owned by the record reader, which releases it
		// on the following Next/Release call; releasing it here too would
		// over-release it, so it is only inspected.
		rec := rr.Record()
		assert.Equal(t, int64(size), rec.NumRows())

		// All rows fit in one batch, so there must be no second record.
		assert.False(t, rr.Next())
	})
}
// TestBinlogSerializeWriter round-trips generated test data: it reads the
// generated blobs, copies every value through a BinlogSerializeWriter, and
// then deserializes the blobs produced by the stream writers to verify that
// the written data matches the original rows.
func TestBinlogSerializeWriter(t *testing.T) {
	t.Run("test empty data", func(t *testing.T) {
		reader, err := NewBinlogDeserializeReader(nil, common.RowIDField)
		assert.NoError(t, err)
		defer reader.Close()
		err = reader.Next()
		assert.Equal(t, io.EOF, err)
	})

	t.Run("test serialize", func(t *testing.T) {
		size := 3
		blobs, err := generateTestData(size)
		assert.NoError(t, err)
		reader, err := NewBinlogDeserializeReader(blobs, common.RowIDField)
		assert.NoError(t, err)
		defer reader.Close()

		schema := generateTestSchema()
		// Copy write the generated data
		writers := NewBinlogStreamWriters(0, 0, 0, schema.Fields)
		writer, err := NewBinlogSerializeWriter(schema, 0, 0, writers, 1024)
		assert.NoError(t, err)

		for i := 1; i <= size; i++ {
			err = reader.Next()
			assert.NoError(t, err)

			value := reader.Value()
			assertTestData(t, i, value)
			// NOTE(review): the result of Write is not checked here; if it
			// returns an error it should be asserted — confirm its signature.
			writer.Write(value)
		}

		err = reader.Next()
		assert.Equal(t, io.EOF, err)

		err = writer.Close()
		assert.NoError(t, err)
		assert.True(t, writer.WrittenMemorySize() >= 429)

		// Collect one finalized blob per stream writer.
		newblobs := make([]*Blob, 0, len(writers))
		for _, w := range writers {
			blob, err := w.Finalize()
			assert.NoError(t, err)
			assert.NotNil(t, blob)
			newblobs = append(newblobs, blob)
		}

		// Read from the written data. This must use newblobs: reading the
		// original blobs again would pass even if the writer produced garbage.
		// assert.Equal(t, blobs[0].Value, newblobs[0].Value)
		written, err := NewBinlogDeserializeReader(newblobs, common.RowIDField)
		assert.NoError(t, err)
		defer written.Close()
		for i := 1; i <= size; i++ {
			err = written.Next()
			assert.NoError(t, err, i)

			value := written.Value()
			assertTestData(t, i, value)
		}

		// The written data must contain exactly the rows that were copied.
		err = written.Next()
		assert.Equal(t, io.EOF, err)
	})
}
// TestSerDe is a table-driven test over serdeMap: for each data type it
// serializes one value into a fresh Arrow builder, builds the array, and
// checks what deserialize returns for index 0 (the value and the ok flag).
// The "negative" cases feed a value that is expected not to round-trip and
// therefore expect (nil, false).
func TestSerDe(t *testing.T) {
	type args struct {
		dt schemapb.DataType
		v  any
	}
	tests := []struct {
		name  string
		args  args
		want  any
		want1 bool
	}{
		{"test bool", args{dt: schemapb.DataType_Bool, v: true}, true, true},
		{"test bool negative", args{dt: schemapb.DataType_Bool, v: nil}, nil, false},
		{"test int8", args{dt: schemapb.DataType_Int8, v: int8(1)}, int8(1), true},
		{"test int8 negative", args{dt: schemapb.DataType_Int8, v: nil}, nil, false},
		{"test int16", args{dt: schemapb.DataType_Int16, v: int16(1)}, int16(1), true},
		{"test int16 negative", args{dt: schemapb.DataType_Int16, v: nil}, nil, false},
		{"test int32", args{dt: schemapb.DataType_Int32, v: int32(1)}, int32(1), true},
		{"test int32 negative", args{dt: schemapb.DataType_Int32, v: nil}, nil, false},
		{"test int64", args{dt: schemapb.DataType_Int64, v: int64(1)}, int64(1), true},
		{"test int64 negative", args{dt: schemapb.DataType_Int64, v: nil}, nil, false},
		{"test float32", args{dt: schemapb.DataType_Float, v: float32(1)}, float32(1), true},
		{"test float32 negative", args{dt: schemapb.DataType_Float, v: nil}, nil, false},
		{"test float64", args{dt: schemapb.DataType_Double, v: float64(1)}, float64(1), true},
		{"test float64 negative", args{dt: schemapb.DataType_Double, v: nil}, nil, false},
		{"test string", args{dt: schemapb.DataType_String, v: "test"}, "test", true},
		{"test string negative", args{dt: schemapb.DataType_String, v: nil}, nil, false},
		{"test varchar", args{dt: schemapb.DataType_VarChar, v: "test"}, "test", true},
		{"test varchar negative", args{dt: schemapb.DataType_VarChar, v: nil}, nil, false},
		{"test array negative", args{dt: schemapb.DataType_Array, v: "{}"}, nil, false},
		{"test array negative null", args{dt: schemapb.DataType_Array, v: nil}, nil, false},
		{"test json", args{dt: schemapb.DataType_JSON, v: []byte("{}")}, []byte("{}"), true},
		{"test json negative", args{dt: schemapb.DataType_JSON, v: nil}, nil, false},
		{"test float vector", args{dt: schemapb.DataType_FloatVector, v: []float32{1.0}}, []float32{1.0}, true},
		{"test float vector negative", args{dt: schemapb.DataType_FloatVector, v: nil}, nil, false},
		{"test bool vector", args{dt: schemapb.DataType_BinaryVector, v: []byte{0xff}}, []byte{0xff}, true},
		{"test float16 vector", args{dt: schemapb.DataType_Float16Vector, v: []byte{0xff, 0xff}}, []byte{0xff, 0xff}, true},
		{"test bfloat16 vector", args{dt: schemapb.DataType_BFloat16Vector, v: []byte{0xff, 0xff}}, []byte{0xff, 0xff}, true},
		{"test bfloat16 vector negative", args{dt: schemapb.DataType_BFloat16Vector, v: nil}, nil, false},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Look the serde entry up once instead of once per call.
			entry := serdeMap[tt.args.dt]

			builder := array.NewBuilder(memory.DefaultAllocator, entry.arrowType(1))
			// Builders and arrays are reference counted; release both when
			// the sub-test finishes (after the comparisons below).
			defer builder.Release()

			// The result of serialize is intentionally not asserted: the
			// negative cases rely on deserialize reporting the failure.
			entry.serialize(builder, tt.args.v)
			// assert.True(t, ok)
			a := builder.NewArray()
			defer a.Release()

			got, got1 := entry.deserialize(a, 0)
			if !reflect.DeepEqual(got, tt.want) {
				t.Errorf("deserialize() got = %v, want %v", got, tt.want)
			}
			if got1 != tt.want1 {
				t.Errorf("deserialize() got1 = %v, want %v", got1, tt.want1)
			}
		})
	}
}
// BenchmarkDeserializeReader measures a full pass over generated test data
// with the binlog deserialize reader: construct the reader, iterate every
// row, observe io.EOF, and close the reader. Data generation happens before
// the timer is reset and is not measured.
func BenchmarkDeserializeReader(b *testing.B) {
	size := 1000000
	blobs, err := generateTestData(size)
	assert.NoError(b, err)
	b.ResetTimer()

	for n := 0; n < b.N; n++ {
		reader, err := NewBinlogDeserializeReader(blobs, common.RowIDField)
		assert.NoError(b, err)

		for row := 0; row < size; row++ {
			err = reader.Next()
			_ = reader.Value()
			assert.NoError(b, err)
		}
		err = reader.Next()
		assert.Equal(b, io.EOF, err)

		// Close per iteration. A defer here would only run when the benchmark
		// function returns, keeping all b.N readers alive at once.
		reader.Close()
	}
}
// BenchmarkBinlogIterator measures a full pass over generated test data with
// the insert binlog iterator: construct the iterator, consume every row,
// check that it is exhausted, and dispose of it. Data generation happens
// before the timer is reset and is not measured.
func BenchmarkBinlogIterator(b *testing.B) {
	size := 1000000
	blobs, err := generateTestData(size)
	assert.NoError(b, err)
	b.ResetTimer()

	for n := 0; n < b.N; n++ {
		itr, err := NewInsertBinlogIterator(blobs, common.RowIDField, schemapb.DataType_Int64)
		assert.NoError(b, err)

		for row := 0; row < size; row++ {
			assert.True(b, itr.HasNext())
			_, err = itr.Next()
			assert.NoError(b, err)
		}
		assert.False(b, itr.HasNext())

		// Dispose per iteration. A defer here would only run when the
		// benchmark function returns, keeping all b.N iterators alive at once.
		itr.Dispose()
	}
}