// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package importutil

import (
	"context"
	"fmt"
	"io"
	"math/rand"
	"os"
	"testing"

	"github.com/apache/arrow/go/v12/arrow"
	"github.com/apache/arrow/go/v12/arrow/array"
	"github.com/apache/arrow/go/v12/arrow/memory"
	"github.com/apache/arrow/go/v12/parquet"
	"github.com/apache/arrow/go/v12/parquet/pqarrow"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/mock"

	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/internal/mocks"
	"github.com/milvus-io/milvus/pkg/common"
	"github.com/milvus-io/milvus/pkg/util/paramtable"
)

// parquetSampleSchema() returns a schema containing all supported data types, with an int64 primary key
func parquetSampleSchema() *schemapb.CollectionSchema {
	schema := &schemapb.CollectionSchema{
		Name:               "schema",
		Description:        "schema",
		AutoID:             true,
		EnableDynamicField: true,
		Fields: []*schemapb.FieldSchema{
			{
				FieldID:      102,
				Name:         "FieldBool",
				IsPrimaryKey: false,
				Description:  "bool",
				DataType:     schemapb.DataType_Bool,
			},
			{
				FieldID:      103,
				Name:         "FieldInt8",
				IsPrimaryKey: false,
				Description:  "int8",
				DataType:     schemapb.DataType_Int8,
			},
			{
				FieldID:      104,
				Name:         "FieldInt16",
				IsPrimaryKey: false,
				Description:  "int16",
				DataType:     schemapb.DataType_Int16,
			},
			{
				FieldID:      105,
				Name:         "FieldInt32",
				IsPrimaryKey: false,
				Description:  "int32",
				DataType:     schemapb.DataType_Int32,
			},
			{
				FieldID:      106,
				Name:         "FieldInt64",
				IsPrimaryKey: true,
				AutoID:       false,
				Description:  "int64",
				DataType:     schemapb.DataType_Int64,
			},
			{
				FieldID:      107,
				Name:         "FieldFloat",
				IsPrimaryKey: false,
				Description:  "float",
				DataType:     schemapb.DataType_Float,
			},
			{
				FieldID:      108,
				Name:         "FieldDouble",
				IsPrimaryKey: false,
				Description:  "double",
				DataType:     schemapb.DataType_Double,
			},
			{
				FieldID:      109,
				Name:         "FieldString",
				IsPrimaryKey: false,
				Description:  "string",
				DataType:     schemapb.DataType_VarChar,
				TypeParams: []*commonpb.KeyValuePair{
					{Key: common.MaxLengthKey, Value: "128"},
				},
			},
			{
				FieldID:      110,
				Name:         "FieldBinaryVector",
				IsPrimaryKey: false,
				Description:  "binary_vector",
				DataType:     schemapb.DataType_BinaryVector,
				TypeParams: []*commonpb.KeyValuePair{
					{Key: common.DimKey, Value: "32"},
				},
			},
			{
				FieldID:      111,
				Name:         "FieldFloatVector",
				IsPrimaryKey: false,
				Description:  "float_vector",
				DataType:     schemapb.DataType_FloatVector,
				TypeParams: []*commonpb.KeyValuePair{
					{Key: common.DimKey, Value: "4"},
				},
			},
			{
				FieldID:      112,
				Name:         "FieldJSON",
				IsPrimaryKey: false,
				Description:  "json",
				DataType:     schemapb.DataType_JSON,
			},
			{
				FieldID:      113,
				Name:         "FieldArrayBool",
				IsPrimaryKey: false,
				Description:  "bool array",
				DataType:     schemapb.DataType_Array,
				ElementType:  schemapb.DataType_Bool,
			},
			{
				FieldID:      114,
				Name:         "FieldArrayInt8",
				IsPrimaryKey: false,
				Description:  "int8 array",
				DataType:     schemapb.DataType_Array,
				ElementType:  schemapb.DataType_Int8,
			},
			{
				FieldID:      115,
				Name:         "FieldArrayInt16",
				IsPrimaryKey: false,
				Description:  "int16 array",
				DataType:     schemapb.DataType_Array,
				ElementType:  schemapb.DataType_Int16,
			},
			{
				FieldID:      116,
				Name:         "FieldArrayInt32",
				IsPrimaryKey: false,
				Description:  "int32 array",
				DataType:     schemapb.DataType_Array,
				ElementType:  schemapb.DataType_Int32,
			},
			{
				FieldID:      117,
				Name:         "FieldArrayInt64",
				IsPrimaryKey: false,
				Description:  "int64 array",
				DataType:     schemapb.DataType_Array,
				ElementType:  schemapb.DataType_Int64,
			},
			{
				FieldID:      118,
				Name:         "FieldArrayFloat",
				IsPrimaryKey: false,
				Description:  "float array",
				DataType:     schemapb.DataType_Array,
				ElementType:  schemapb.DataType_Float,
			},
			{
				FieldID:      119,
				Name:         "FieldArrayDouble",
				IsPrimaryKey: false,
				Description:  "double array",
				DataType:     schemapb.DataType_Array,
				ElementType:  schemapb.DataType_Double,
			},
			{
				FieldID:      120,
				Name:         "FieldArrayString",
				IsPrimaryKey: false,
				Description:  "string array",
				DataType:     schemapb.DataType_Array,
				ElementType:  schemapb.DataType_VarChar,
			},
			{
				FieldID:      121,
				Name:         "$meta",
				IsPrimaryKey: false,
				Description:  "dynamic field",
				DataType:     schemapb.DataType_JSON,
				IsDynamic:    true,
			},
			{
				FieldID:      122,
				Name:         "FieldBinaryVector2",
				IsPrimaryKey: false,
				Description:  "binary_vector2",
				DataType:     schemapb.DataType_BinaryVector,
				TypeParams: []*commonpb.KeyValuePair{
					{Key: common.DimKey, Value: "64"},
				},
			},
		},
	}
	return schema
}

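// milvusDataTypeToArrowType maps a Milvus data type to the Arrow type used when generating test data.
// For BinaryVector, isBinary selects arrow.Binary instead of a list of uint8 values.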
func milvusDataTypeToArrowType(dataType schemapb.DataType, isBinary bool) arrow.DataType {
	switch dataType {
	case schemapb.DataType_Bool:
		return &arrow.BooleanType{}
	case schemapb.DataType_Int8:
		return &arrow.Int8Type{}
	case schemapb.DataType_Int16:
		return &arrow.Int16Type{}
	case schemapb.DataType_Int32:
		return &arrow.Int32Type{}
	case schemapb.DataType_Int64:
		return &arrow.Int64Type{}
	case schemapb.DataType_Float:
		return &arrow.Float32Type{}
	case schemapb.DataType_Double:
		return &arrow.Float64Type{}
	case schemapb.DataType_VarChar, schemapb.DataType_String:
		return &arrow.StringType{}
	case schemapb.DataType_Array:
		return &arrow.ListType{}
	case schemapb.DataType_JSON:
		return &arrow.StringType{}
	case schemapb.DataType_FloatVector:
		return arrow.ListOfField(arrow.Field{
			Name:     "item",
			Type:     &arrow.Float32Type{},
			Nullable: true,
			Metadata: arrow.Metadata{},
		})
	case schemapb.DataType_BinaryVector:
		if isBinary {
			return &arrow.BinaryType{}
		}
		return arrow.ListOfField(arrow.Field{
			Name:     "item",
			Type:     &arrow.Uint8Type{},
			Nullable: true,
			Metadata: arrow.Metadata{},
		})
	case schemapb.DataType_Float16Vector:
		return arrow.ListOfField(arrow.Field{
			Name:     "item",
			Type:     &arrow.Float16Type{},
			Nullable: true,
			Metadata: arrow.Metadata{},
		})
	default:
		panic("unsupported data type")
	}
}

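// convertMilvusSchemaToArrowSchema builds an arrow.Schema from a Milvus collection schema.
// Array fields become Arrow lists of their element type; "FieldBinaryVector2" is mapped to arrow.Binary.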
func convertMilvusSchemaToArrowSchema(schema *schemapb.CollectionSchema) *arrow.Schema {
	fields := make([]arrow.Field, 0)
	for _, field := range schema.GetFields() {
		if field.GetDataType() == schemapb.DataType_Array {
			fields = append(fields, arrow.Field{
				Name: field.GetName(),
				Type: arrow.ListOfField(arrow.Field{
					Name:     "item",
					Type:     milvusDataTypeToArrowType(field.GetElementType(), false),
					Nullable: true,
					Metadata: arrow.Metadata{},
				}),
				Nullable: true,
				Metadata: arrow.Metadata{},
			})
			continue
		}
		fields = append(fields, arrow.Field{
			Name:     field.GetName(),
			Type:     milvusDataTypeToArrowType(field.GetDataType(), field.Name == "FieldBinaryVector2"),
			Nullable: true,
			Metadata: arrow.Metadata{},
		})
	}
	return arrow.NewSchema(fields, nil)
}

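// buildArrayData generates a single Arrow column of test data for the given Milvus data type.
// dim is used for vector fields, rows is the number of records, and isBinary selects the
// arrow.Binary layout used by "FieldBinaryVector2".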
func buildArrayData(dataType, elementType schemapb.DataType, dim, rows int, isBinary bool) arrow.Array {
	mem := memory.NewGoAllocator()
	switch dataType {
	case schemapb.DataType_Bool:
		builder := array.NewBooleanBuilder(mem)
		for i := 0; i < rows; i++ {
			builder.Append(i%2 == 0)
		}
		return builder.NewBooleanArray()
	case schemapb.DataType_Int8:
		builder := array.NewInt8Builder(mem)
		for i := 0; i < rows; i++ {
			builder.Append(int8(i))
		}
		return builder.NewInt8Array()
	case schemapb.DataType_Int16:
		builder := array.NewInt16Builder(mem)
		for i := 0; i < rows; i++ {
			builder.Append(int16(i))
		}
		return builder.NewInt16Array()
	case schemapb.DataType_Int32:
		builder := array.NewInt32Builder(mem)
		for i := 0; i < rows; i++ {
			builder.Append(int32(i))
		}
		return builder.NewInt32Array()
	case schemapb.DataType_Int64:
		builder := array.NewInt64Builder(mem)
		for i := 0; i < rows; i++ {
			builder.Append(int64(i))
		}
		return builder.NewInt64Array()
	case schemapb.DataType_Float:
		builder := array.NewFloat32Builder(mem)
		for i := 0; i < rows; i++ {
			builder.Append(float32(i) * 0.1)
		}
		return builder.NewFloat32Array()
	case schemapb.DataType_Double:
		builder := array.NewFloat64Builder(mem)
		for i := 0; i < rows; i++ {
			builder.Append(float64(i) * 0.02)
		}
		return builder.NewFloat64Array()
	case schemapb.DataType_VarChar, schemapb.DataType_String:
		builder := array.NewStringBuilder(mem)
		for i := 0; i < rows; i++ {
			builder.Append(randomString(10))
		}
		return builder.NewStringArray()
	case schemapb.DataType_FloatVector:
		builder := array.NewListBuilder(mem, &arrow.Float32Type{})
		offsets := make([]int32, 0, rows)
		valid := make([]bool, 0, rows)
		for i := 0; i < dim*rows; i++ {
			builder.ValueBuilder().(*array.Float32Builder).Append(float32(i))
		}
		for i := 0; i < rows; i++ {
			offsets = append(offsets, int32(i*dim))
			valid = append(valid, true)
		}
		builder.AppendValues(offsets, valid)
		return builder.NewListArray()
	case schemapb.DataType_BinaryVector:
		if isBinary {
			builder := array.NewBinaryBuilder(mem, &arrow.BinaryType{})
			for i := 0; i < rows; i++ {
				element := make([]byte, dim/8)
				for j := 0; j < dim/8; j++ {
					element[j] = randomString(1)[0]
				}
				builder.Append(element)
			}
			return builder.NewBinaryArray()
		}
		builder := array.NewListBuilder(mem, &arrow.Uint8Type{})
		offsets := make([]int32, 0, rows)
		valid := make([]bool, 0)
		for i := 0; i < dim*rows/8; i++ {
			builder.ValueBuilder().(*array.Uint8Builder).Append(uint8(i))
		}
		for i := 0; i < rows; i++ {
			offsets = append(offsets, int32(dim*i/8))
			valid = append(valid, true)
		}
		builder.AppendValues(offsets, valid)
		return builder.NewListArray()
	case schemapb.DataType_JSON:
		builder := array.NewStringBuilder(mem)
		for i := 0; i < rows; i++ {
			builder.Append(fmt.Sprintf("{\"a\": \"%s\", \"b\": %d}", randomString(3), i))
		}
		return builder.NewStringArray()
	case schemapb.DataType_Array:
		offsets := make([]int32, 0, rows)
		valid := make([]bool, 0, rows)
		index := 0
		for i := 0; i < rows; i++ {
			index += i % 10
			offsets = append(offsets, int32(index))
			valid = append(valid, true)
		}
		switch elementType {
		case schemapb.DataType_Bool:
			builder := array.NewListBuilder(mem, &arrow.BooleanType{})
			valueBuilder := builder.ValueBuilder().(*array.BooleanBuilder)
			for i := 0; i < index; i++ {
				valueBuilder.Append(i%2 == 0)
			}
			builder.AppendValues(offsets, valid)
			return builder.NewListArray()
		case schemapb.DataType_Int8:
			builder := array.NewListBuilder(mem, &arrow.Int8Type{})
			valueBuilder := builder.ValueBuilder().(*array.Int8Builder)
			for i := 0; i < index; i++ {
				valueBuilder.Append(int8(i))
			}
			builder.AppendValues(offsets, valid)
			return builder.NewListArray()
		case schemapb.DataType_Int16:
			builder := array.NewListBuilder(mem, &arrow.Int16Type{})
			valueBuilder := builder.ValueBuilder().(*array.Int16Builder)
			for i := 0; i < index; i++ {
				valueBuilder.Append(int16(i))
			}
			builder.AppendValues(offsets, valid)
			return builder.NewListArray()
		case schemapb.DataType_Int32:
			builder := array.NewListBuilder(mem, &arrow.Int32Type{})
			valueBuilder := builder.ValueBuilder().(*array.Int32Builder)
			for i := 0; i < index; i++ {
				valueBuilder.Append(int32(i))
			}
			builder.AppendValues(offsets, valid)
			return builder.NewListArray()
		case schemapb.DataType_Int64:
			builder := array.NewListBuilder(mem, &arrow.Int64Type{})
			valueBuilder := builder.ValueBuilder().(*array.Int64Builder)
			for i := 0; i < index; i++ {
				valueBuilder.Append(int64(i))
			}
			builder.AppendValues(offsets, valid)
			return builder.NewListArray()
		case schemapb.DataType_Float:
			builder := array.NewListBuilder(mem, &arrow.Float32Type{})
			valueBuilder := builder.ValueBuilder().(*array.Float32Builder)
			for i := 0; i < index; i++ {
				valueBuilder.Append(float32(i) * 0.1)
			}
			builder.AppendValues(offsets, valid)
			return builder.NewListArray()
		case schemapb.DataType_Double:
			builder := array.NewListBuilder(mem, &arrow.Float64Type{})
			valueBuilder := builder.ValueBuilder().(*array.Float64Builder)
			for i := 0; i < index; i++ {
				valueBuilder.Append(float64(i) * 0.02)
			}
			builder.AppendValues(offsets, valid)
			return builder.NewListArray()
		case schemapb.DataType_VarChar, schemapb.DataType_String:
			builder := array.NewListBuilder(mem, &arrow.StringType{})
			valueBuilder := builder.ValueBuilder().(*array.StringBuilder)
			for i := 0; i < index; i++ {
				valueBuilder.Append(randomString(5) + "-" + fmt.Sprintf("%d", i))
			}
			builder.AppendValues(offsets, valid)
			return builder.NewListArray()
		}
	}
	return nil
}

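// writeParquet writes numRows rows of generated sample data for milvusSchema to w in 1000-row record batches.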
func writeParquet(w io.Writer, milvusSchema *schemapb.CollectionSchema, numRows int) error {
	schema := convertMilvusSchemaToArrowSchema(milvusSchema)
	fw, err := pqarrow.NewFileWriter(schema, w, parquet.NewWriterProperties(parquet.WithMaxRowGroupLength(1000)), pqarrow.DefaultWriterProps())
	if err != nil {
		return err
	}
	defer fw.Close()

	batch := 1000
	for i := 0; i <= numRows/batch; i++ {
		columns := make([]arrow.Array, 0, len(milvusSchema.Fields))
		for _, field := range milvusSchema.Fields {
			dim, _ := getFieldDimension(field)
			columnData := buildArrayData(field.DataType, field.ElementType, dim, batch, field.Name == "FieldBinaryVector2")
			columns = append(columns, columnData)
		}
		recordBatch := array.NewRecord(schema, columns, int64(batch))
		err = fw.Write(recordBatch)
		if err != nil {
			return err
		}
	}

	return nil
}

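// writeLessFieldParquet is like writeParquet, but first removes the "FieldInt64" primary key field
// so the written file has fewer columns than the collection schema expects.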
func writeLessFieldParquet(w io.Writer, milvusSchema *schemapb.CollectionSchema, numRows int) error {
	for i, field := range milvusSchema.Fields {
		if field.GetName() == "FieldInt64" {
			milvusSchema.Fields = append(milvusSchema.Fields[:i], milvusSchema.Fields[i+1:]...)
			break
		}
	}
	schema := convertMilvusSchemaToArrowSchema(milvusSchema)
	fw, err := pqarrow.NewFileWriter(schema, w, parquet.NewWriterProperties(), pqarrow.DefaultWriterProps())
	if err != nil {
		return err
	}
	defer fw.Close()

	batch := 1000
	for i := 0; i <= numRows/batch; i++ {
		columns := make([]arrow.Array, 0, len(milvusSchema.Fields))
		for _, field := range milvusSchema.Fields {
			dim, _ := getFieldDimension(field)
			columnData := buildArrayData(field.DataType, field.ElementType, dim, batch, field.Name == "FieldBinaryVector2")
			columns = append(columns, columnData)
		}
		recordBatch := array.NewRecord(schema, columns, int64(batch))
		err = fw.Write(recordBatch)
		if err != nil {
			return err
		}
	}
	return nil
}

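// writeMoreFieldParquet is like writeParquet, but first appends an extra "FieldMore" column
// so the written file has more columns than the collection schema expects.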
func writeMoreFieldParquet(w io.Writer, milvusSchema *schemapb.CollectionSchema, numRows int) error {
	milvusSchema.Fields = append(milvusSchema.Fields, &schemapb.FieldSchema{
		FieldID:  200,
		Name:     "FieldMore",
		DataType: schemapb.DataType_Int64,
	})
	schema := convertMilvusSchemaToArrowSchema(milvusSchema)
	fw, err := pqarrow.NewFileWriter(schema, w, parquet.NewWriterProperties(), pqarrow.DefaultWriterProps())
	if err != nil {
		return err
	}
	defer fw.Close()

	batch := 1000
	for i := 0; i <= numRows/batch; i++ {
		columns := make([]arrow.Array, 0, len(milvusSchema.Fields)+1)
		for _, field := range milvusSchema.Fields {
			dim, _ := getFieldDimension(field)
			columnData := buildArrayData(field.DataType, field.ElementType, dim, batch, field.Name == "FieldBinaryVector2")
			columns = append(columns, columnData)
		}
		recordBatch := array.NewRecord(schema, columns, int64(batch))
		err = fw.Write(recordBatch)
		if err != nil {
			return err
		}
	}

	return nil
}

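// randomString returns a random string of the given length built from upper- and lower-case ASCII letters.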
func randomString(length int) string {
	letterRunes := []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
	b := make([]rune, length)
	for i := range b {
		b[i] = letterRunes[rand.Intn(len(letterRunes))]
	}
	return string(b)
}

func TestParquetParser(t *testing.T) {
	paramtable.Init()
	filePath := "/tmp/parser.parquet"
	ctx := context.Background()
	schema := parquetSampleSchema()
	idAllocator := newIDAllocator(ctx, t, nil)
	defer os.Remove(filePath)

	writeFile := func() {
		wf, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE, 0o666)
		assert.NoError(t, err)
		err = writeParquet(wf, schema, 100)
		assert.NoError(t, err)
	}
	writeFile()

	t.Run("read file", func(t *testing.T) {
		cm := createLocalChunkManager(t)
		flushFunc := func(fields BlockData, shardID int, partID int64) error {
			return nil
		}
		collectionInfo, err := NewCollectionInfo(schema, 2, []int64{1})
		assert.NoError(t, err)

		updateProgress := func(percent int64) {
			assert.Greater(t, percent, int64(0))
		}

		// parquet schema sizePerRecord = 5296
		parquetParser, err := NewParquetParser(ctx, collectionInfo, idAllocator, 102400, cm, filePath, flushFunc, updateProgress)
		assert.NoError(t, err)
		defer parquetParser.Close()
		err = parquetParser.Parse()
		assert.NoError(t, err)
	})
}

func TestParquetReader_Error(t *testing.T) {
	paramtable.Init()
	filePath := "/tmp/par_err.parquet"
	ctx := context.Background()
	schema := parquetSampleSchema()
	idAllocator := newIDAllocator(ctx, t, nil)
	defer os.Remove(filePath)

	writeFile := func() {
		wf, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE, 0o666)
		assert.NoError(t, err)
		err = writeParquet(wf, schema, 100)
		assert.NoError(t, err)
	}
	writeFile()

	t.Run("field not exist", func(t *testing.T) {
		schema.Fields = append(schema.Fields, &schemapb.FieldSchema{
			FieldID:     200,
			Name:        "invalid",
			Description: "invalid field",
			DataType:    schemapb.DataType_JSON,
		})

		cm := createLocalChunkManager(t)
		flushFunc := func(fields BlockData, shardID int, partID int64) error {
			return nil
		}
		collectionInfo, err := NewCollectionInfo(schema, 2, []int64{1})
		assert.NoError(t, err)

		parquetParser, err := NewParquetParser(ctx, collectionInfo, idAllocator, 10240, cm, filePath, flushFunc, nil)
		assert.NoError(t, err)
		defer parquetParser.Close()
		err = parquetParser.Parse()
		assert.Error(t, err)

		// reset schema
		schema = parquetSampleSchema()
	})

	t.Run("schema mismatch", func(t *testing.T) {
		schema.Fields[0].DataType = schemapb.DataType_JSON
		cm := createLocalChunkManager(t)
		flushFunc := func(fields BlockData, shardID int, partID int64) error {
			return nil
		}
		collectionInfo, err := NewCollectionInfo(schema, 2, []int64{1})
		assert.NoError(t, err)

		parquetParser, err := NewParquetParser(ctx, collectionInfo, idAllocator, 10240, cm, filePath, flushFunc, nil)
		assert.NoError(t, err)
		defer parquetParser.Close()
		err = parquetParser.Parse()
		assert.Error(t, err)

		// reset schema
		schema = parquetSampleSchema()
	})

	t.Run("list data mismatch", func(t *testing.T) {
		schema.Fields[11].DataType = schemapb.DataType_Bool
		schema.Fields[11].ElementType = schemapb.DataType_None
		cm := createLocalChunkManager(t)
		flushFunc := func(fields BlockData, shardID int, partID int64) error {
			return nil
		}
		collectionInfo, err := NewCollectionInfo(schema, 2, []int64{1})
		assert.NoError(t, err)

		parquetParser, err := NewParquetParser(ctx, collectionInfo, idAllocator, 10240, cm, filePath, flushFunc, nil)
		assert.NoError(t, err)
		defer parquetParser.Close()
		err = parquetParser.Parse()
		assert.Error(t, err)

		// reset schema
		schema = parquetSampleSchema()
	})

	t.Run("data not match", func(t *testing.T) {
		cm := createLocalChunkManager(t)
		flushFunc := func(fields BlockData, shardID int, partID int64) error {
			return nil
		}
		collectionInfo, err := NewCollectionInfo(schema, 2, []int64{1})
		assert.NoError(t, err)

		parquetParser, err := NewParquetParser(ctx, collectionInfo, idAllocator, 10240, cm, filePath, flushFunc, nil)
		assert.NoError(t, err)
		defer parquetParser.Close()

		err = parquetParser.createReaders()
		assert.NoError(t, err)
		t.Run("read not bool field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldInt8"]
			columnReader.dataType = schemapb.DataType_Bool
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not int8 field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldBool"]
			columnReader.dataType = schemapb.DataType_Int8
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not int16 field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldBool"]
			columnReader.dataType = schemapb.DataType_Int16
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not int32 field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldBool"]
			columnReader.dataType = schemapb.DataType_Int32
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not int64 field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldBool"]
			columnReader.dataType = schemapb.DataType_Int64
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not float field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldBool"]
			columnReader.dataType = schemapb.DataType_Float
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not double field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldBool"]
			columnReader.dataType = schemapb.DataType_Double
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not string field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldBool"]
			columnReader.dataType = schemapb.DataType_VarChar
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not array field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldBool"]
			columnReader.dataType = schemapb.DataType_Array
			columnReader.elementType = schemapb.DataType_Bool
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not array field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldBool"]
			columnReader.dataType = schemapb.DataType_Array
			columnReader.elementType = schemapb.DataType_Int64
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not array field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldBool"]
			columnReader.dataType = schemapb.DataType_Array
			columnReader.elementType = schemapb.DataType_VarChar
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not bool array field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayString"]
			columnReader.dataType = schemapb.DataType_Array
			columnReader.elementType = schemapb.DataType_Bool
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not int8 array field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayString"]
			columnReader.dataType = schemapb.DataType_Array
			columnReader.elementType = schemapb.DataType_Int8
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not int16 array field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayString"]
			columnReader.dataType = schemapb.DataType_Array
			columnReader.elementType = schemapb.DataType_Int16
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not int32 array field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayString"]
			columnReader.dataType = schemapb.DataType_Array
			columnReader.elementType = schemapb.DataType_Int32
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not int64 array field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayString"]
			columnReader.dataType = schemapb.DataType_Array
			columnReader.elementType = schemapb.DataType_Int64
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not float array field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayString"]
			columnReader.dataType = schemapb.DataType_Array
			columnReader.elementType = schemapb.DataType_Float
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not double array field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayString"]
			columnReader.dataType = schemapb.DataType_Array
			columnReader.elementType = schemapb.DataType_Double
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not string array field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayBool"]
			columnReader.dataType = schemapb.DataType_Array
			columnReader.elementType = schemapb.DataType_VarChar
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not float vector field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayBool"]
			columnReader.dataType = schemapb.DataType_FloatVector
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read irregular float vector field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayFloat"]
			columnReader.dataType = schemapb.DataType_FloatVector
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read irregular float vector field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayDouble"]
			columnReader.dataType = schemapb.DataType_FloatVector
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not binary vector field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldBool"]
			columnReader.dataType = schemapb.DataType_BinaryVector
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not binary vector field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayBool"]
			columnReader.dataType = schemapb.DataType_BinaryVector
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read irregular binary vector field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayInt64"]
			columnReader.dataType = schemapb.DataType_BinaryVector
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read not json field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldBool"]
			columnReader.dataType = schemapb.DataType_JSON
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read illegal json field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldString"]
			columnReader.dataType = schemapb.DataType_JSON
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read unknown field", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldString"]
			columnReader.dataType = schemapb.DataType_None
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})

		t.Run("read unsupported array", func(t *testing.T) {
			columnReader := parquetParser.columnMap["FieldArrayString"]
			columnReader.dataType = schemapb.DataType_Array
			columnReader.elementType = schemapb.DataType_JSON
			data, err := parquetParser.readData(columnReader, 1024)
			assert.Error(t, err)
			assert.Nil(t, data)
		})
	})

	t.Run("flush failed", func(t *testing.T) {
		cm := createLocalChunkManager(t)
		flushFunc := func(fields BlockData, shardID int, partID int64) error {
			return fmt.Errorf("mock error")
		}
		collectionInfo, err := NewCollectionInfo(schema, 2, []int64{1})
		assert.NoError(t, err)

		updateProgress := func(percent int64) {
			assert.Greater(t, percent, int64(0))
		}

		// parquet schema sizePerRecord = 5296
		parquetParser, err := NewParquetParser(ctx, collectionInfo, idAllocator, 102400, cm, filePath, flushFunc, updateProgress)
		assert.NoError(t, err)
		defer parquetParser.Close()
		err = parquetParser.Parse()
		assert.Error(t, err)
	})
}

func TestNewParquetParser(t *testing.T) {
	paramtable.Init()
	ctx := context.Background()
	t.Run("nil collectionInfo", func(t *testing.T) {
		parquetParser, err := NewParquetParser(ctx, nil, nil, 10240, nil, "", nil, nil)
		assert.Error(t, err)
		assert.Nil(t, parquetParser)
	})

	t.Run("nil idAlloc", func(t *testing.T) {
		collectionInfo, err := NewCollectionInfo(parquetSampleSchema(), 2, []int64{1})
		assert.NoError(t, err)

		parquetParser, err := NewParquetParser(ctx, collectionInfo, nil, 10240, nil, "", nil, nil)
		assert.Error(t, err)
		assert.Nil(t, parquetParser)
	})

	t.Run("nil chunk manager", func(t *testing.T) {
		collectionInfo, err := NewCollectionInfo(parquetSampleSchema(), 2, []int64{1})
		assert.NoError(t, err)

		idAllocator := newIDAllocator(ctx, t, nil)

		parquetParser, err := NewParquetParser(ctx, collectionInfo, idAllocator, 10240, nil, "", nil, nil)
		assert.Error(t, err)
		assert.Nil(t, parquetParser)
	})

	t.Run("nil flush func", func(t *testing.T) {
		collectionInfo, err := NewCollectionInfo(parquetSampleSchema(), 2, []int64{1})
		assert.NoError(t, err)

		idAllocator := newIDAllocator(ctx, t, nil)
		cm := createLocalChunkManager(t)

		parquetParser, err := NewParquetParser(ctx, collectionInfo, idAllocator, 10240, cm, "", nil, nil)
		assert.Error(t, err)
		assert.Nil(t, parquetParser)
	})

	t.Run("chunk manager reader fail", func(t *testing.T) {
		collectionInfo, err := NewCollectionInfo(parquetSampleSchema(), 2, []int64{1})
		assert.NoError(t, err)

		idAllocator := newIDAllocator(ctx, t, nil)
		cm := mocks.NewChunkManager(t)
		cm.EXPECT().Reader(mock.Anything, mock.Anything).Return(nil, fmt.Errorf("mock error"))
		flushFunc := func(fields BlockData, shardID int, partID int64) error {
			return nil
		}

		parquetParser, err := NewParquetParser(ctx, collectionInfo, idAllocator, 10240, cm, "", flushFunc, nil)
		assert.Error(t, err)
		assert.Nil(t, parquetParser)
	})
}

func Test_convertArrowSchemaToDataType(t *testing.T) {
	type testcase struct {
		arrowField arrow.Field
		dataType   schemapb.DataType
		isArray    bool
	}
	testcases := []testcase{
		{arrow.Field{Type: &arrow.BooleanType{}}, schemapb.DataType_Bool, false},
		{arrow.Field{Type: arrow.ListOfField(arrow.Field{Type: &arrow.BooleanType{}})}, schemapb.DataType_Bool, true},

		{arrow.Field{Type: &arrow.Int8Type{}}, schemapb.DataType_Int8, false},
		{arrow.Field{Type: arrow.ListOfField(arrow.Field{Type: &arrow.Int8Type{}})}, schemapb.DataType_Int8, true},

		{arrow.Field{Type: &arrow.Int16Type{}}, schemapb.DataType_Int16, false},
		{arrow.Field{Type: arrow.ListOfField(arrow.Field{Type: &arrow.Int16Type{}})}, schemapb.DataType_Int16, true},

		{arrow.Field{Type: &arrow.Int32Type{}}, schemapb.DataType_Int32, false},
		{arrow.Field{Type: arrow.ListOfField(arrow.Field{Type: &arrow.Int32Type{}})}, schemapb.DataType_Int32, true},

		{arrow.Field{Type: &arrow.Int64Type{}}, schemapb.DataType_Int64, false},
		{arrow.Field{Type: arrow.ListOfField(arrow.Field{Type: &arrow.Int64Type{}})}, schemapb.DataType_Int64, true},

		{arrow.Field{Type: &arrow.Float32Type{}}, schemapb.DataType_Float, false},
		{arrow.Field{Type: arrow.ListOfField(arrow.Field{Type: &arrow.Float32Type{}})}, schemapb.DataType_Float, true},

		{arrow.Field{Type: &arrow.Float64Type{}}, schemapb.DataType_Double, false},
		{arrow.Field{Type: arrow.ListOfField(arrow.Field{Type: &arrow.Float64Type{}})}, schemapb.DataType_Double, true},

		{arrow.Field{Type: &arrow.StringType{}}, schemapb.DataType_VarChar, false},
		{arrow.Field{Type: arrow.ListOfField(arrow.Field{Type: &arrow.StringType{}})}, schemapb.DataType_VarChar, true},

		{arrow.Field{Type: &arrow.BinaryType{}}, schemapb.DataType_BinaryVector, false},
		{arrow.Field{Type: arrow.ListOfField(arrow.Field{Type: &arrow.Uint8Type{}})}, schemapb.DataType_BinaryVector, true},
		{arrow.Field{Type: &arrow.Uint8Type{}}, schemapb.DataType_None, false},

		{arrow.Field{Type: &arrow.Float16Type{}}, schemapb.DataType_None, false},
		{arrow.Field{Type: arrow.ListOfField(arrow.Field{Type: &arrow.Float16Type{}})}, schemapb.DataType_Float16Vector, true},

		{arrow.Field{Type: &arrow.DayTimeIntervalType{}}, schemapb.DataType_None, false},
	}

	for _, tt := range testcases {
		arrowType, isList := convertArrowSchemaToDataType(tt.arrowField, false)
		assert.Equal(t, tt.isArray, isList)
		assert.Equal(t, tt.dataType, arrowType)
	}
}

func Test_isConvertible(t *testing.T) {
	type testcase struct {
		arrowType schemapb.DataType
		dataType  schemapb.DataType
		isArray   bool
		expect    bool
	}
	testcases := []testcase{
		{schemapb.DataType_Bool, schemapb.DataType_Bool, false, true},
		{schemapb.DataType_Bool, schemapb.DataType_Bool, true, true},
		{schemapb.DataType_Bool, schemapb.DataType_Int8, false, false},
		{schemapb.DataType_Bool, schemapb.DataType_Int8, true, false},
		{schemapb.DataType_Bool, schemapb.DataType_String, false, false},
		{schemapb.DataType_Bool, schemapb.DataType_String, true, false},

		{schemapb.DataType_Int8, schemapb.DataType_Bool, false, false},
		{schemapb.DataType_Int8, schemapb.DataType_String, false, false},
		{schemapb.DataType_Int8, schemapb.DataType_JSON, false, false},
		{schemapb.DataType_Int8, schemapb.DataType_Int8, false, true},
		{schemapb.DataType_Int8, schemapb.DataType_Int8, true, true},
		{schemapb.DataType_Int8, schemapb.DataType_Int16, false, true},
		{schemapb.DataType_Int8, schemapb.DataType_Int32, false, true},
		{schemapb.DataType_Int8, schemapb.DataType_Int64, false, true},
		{schemapb.DataType_Int8, schemapb.DataType_Float, false, true},
		{schemapb.DataType_Int8, schemapb.DataType_Double, false, true},
		{schemapb.DataType_Int8, schemapb.DataType_FloatVector, false, false},

		{schemapb.DataType_Int16, schemapb.DataType_Bool, false, false},
		{schemapb.DataType_Int16, schemapb.DataType_String, false, false},
		{schemapb.DataType_Int16, schemapb.DataType_JSON, false, false},
		{schemapb.DataType_Int16, schemapb.DataType_Int8, false, false},
		{schemapb.DataType_Int16, schemapb.DataType_Int16, false, true},
		{schemapb.DataType_Int16, schemapb.DataType_Int32, false, true},
		{schemapb.DataType_Int16, schemapb.DataType_Int64, false, true},
		{schemapb.DataType_Int16, schemapb.DataType_Float, false, true},
		{schemapb.DataType_Int16, schemapb.DataType_Double, false, true},
		{schemapb.DataType_Int16, schemapb.DataType_FloatVector, false, false},

		{schemapb.DataType_Int32, schemapb.DataType_Bool, false, false},
		{schemapb.DataType_Int32, schemapb.DataType_String, false, false},
		{schemapb.DataType_Int32, schemapb.DataType_JSON, false, false},
		{schemapb.DataType_Int32, schemapb.DataType_Int8, false, false},
		{schemapb.DataType_Int32, schemapb.DataType_Int16, false, false},
		{schemapb.DataType_Int32, schemapb.DataType_Int32, false, true},
		{schemapb.DataType_Int32, schemapb.DataType_Int64, false, true},
		{schemapb.DataType_Int32, schemapb.DataType_Float, false, true},
		{schemapb.DataType_Int32, schemapb.DataType_Double, false, true},
		{schemapb.DataType_Int32, schemapb.DataType_FloatVector, false, false},

		{schemapb.DataType_Int64, schemapb.DataType_Bool, false, false},
		{schemapb.DataType_Int64, schemapb.DataType_String, false, false},
		{schemapb.DataType_Int64, schemapb.DataType_JSON, false, false},
		{schemapb.DataType_Int64, schemapb.DataType_Int8, false, false},
		{schemapb.DataType_Int64, schemapb.DataType_Int16, false, false},
		{schemapb.DataType_Int64, schemapb.DataType_Int32, false, false},
		{schemapb.DataType_Int64, schemapb.DataType_Int64, false, true},
		{schemapb.DataType_Int64, schemapb.DataType_Float, false, true},
		{schemapb.DataType_Int64, schemapb.DataType_Double, false, true},
		{schemapb.DataType_Int64, schemapb.DataType_FloatVector, false, false},

		{schemapb.DataType_Float, schemapb.DataType_Bool, false, false},
		{schemapb.DataType_Float, schemapb.DataType_String, false, false},
		{schemapb.DataType_Float, schemapb.DataType_JSON, false, false},
		{schemapb.DataType_Float, schemapb.DataType_Int8, false, false},
		{schemapb.DataType_Float, schemapb.DataType_Int16, false, false},
		{schemapb.DataType_Float, schemapb.DataType_Int32, false, false},
		{schemapb.DataType_Float, schemapb.DataType_Int64, false, false},
		{schemapb.DataType_Float, schemapb.DataType_Float, false, true},
		{schemapb.DataType_Float, schemapb.DataType_Double, false, true},
		{schemapb.DataType_Float, schemapb.DataType_FloatVector, true, true},

		{schemapb.DataType_Double, schemapb.DataType_Bool, false, false},
		{schemapb.DataType_Double, schemapb.DataType_String, false, false},
		{schemapb.DataType_Double, schemapb.DataType_JSON, false, false},
		{schemapb.DataType_Double, schemapb.DataType_Int8, false, false},
		{schemapb.DataType_Double, schemapb.DataType_Int16, false, false},
		{schemapb.DataType_Double, schemapb.DataType_Int32, false, false},
		{schemapb.DataType_Double, schemapb.DataType_Int64, false, false},
		{schemapb.DataType_Double, schemapb.DataType_Float, false, false},
		{schemapb.DataType_Double, schemapb.DataType_Double, false, true},
		{schemapb.DataType_Double, schemapb.DataType_FloatVector, true, true},

		{schemapb.DataType_VarChar, schemapb.DataType_VarChar, false, true},
		{schemapb.DataType_VarChar, schemapb.DataType_JSON, false, true},
		{schemapb.DataType_VarChar, schemapb.DataType_Bool, false, false},
		{schemapb.DataType_VarChar, schemapb.DataType_Int64, false, false},
		{schemapb.DataType_VarChar, schemapb.DataType_Float, false, false},
		{schemapb.DataType_VarChar, schemapb.DataType_FloatVector, false, false},

		{schemapb.DataType_Float16Vector, schemapb.DataType_Float16Vector, true, true},
		{schemapb.DataType_Float16Vector, schemapb.DataType_Float16Vector, false, true},
		{schemapb.DataType_BinaryVector, schemapb.DataType_BinaryVector, true, true},
		{schemapb.DataType_BinaryVector, schemapb.DataType_BinaryVector, false, true},

		{schemapb.DataType_JSON, schemapb.DataType_JSON, false, true},
		{schemapb.DataType_JSON, schemapb.DataType_VarChar, false, false},

		{schemapb.DataType_Array, schemapb.DataType_Array, false, false},
	}
	for _, tt := range testcases {
		assert.Equal(t, tt.expect, isConvertible(tt.arrowType, tt.dataType, tt.isArray))
	}
}

func TestCalcRowCountPerBlock(t *testing.T) {
	t.Run("dim not valid", func(t *testing.T) {
		schema := &schemapb.CollectionSchema{
			Name:        "dim_invalid",
			Description: "dim not valid",
			Fields: []*schemapb.FieldSchema{
				{
					FieldID:      100,
					Name:         "pk",
					IsPrimaryKey: true,
					Description:  "pk",
					DataType:     schemapb.DataType_Int64,
					AutoID:       true,
				},
				{
					FieldID:     101,
					Name:        "vector",
					Description: "vector",
					DataType:    schemapb.DataType_FloatVector,
					TypeParams: []*commonpb.KeyValuePair{
						{
							Key:   "dim",
							Value: "invalid",
						},
					},
				},
			},
			EnableDynamicField: false,
		}

		collectionInfo, err := NewCollectionInfo(schema, 2, []int64{1})
		assert.NoError(t, err)

		p := &ParquetParser{
			collectionInfo: collectionInfo,
		}

		_, err = p.calcRowCountPerBlock()
		assert.Error(t, err)

		err = p.consume()
		assert.Error(t, err)
	})

	t.Run("nil schema", func(t *testing.T) {
		collectionInfo := &CollectionInfo{
			Schema: &schemapb.CollectionSchema{
				Name:               "nil_schema",
				Description:        "",
				AutoID:             false,
				Fields:             nil,
				EnableDynamicField: false,
			},
			ShardNum: 2,
		}
		p := &ParquetParser{
			collectionInfo: collectionInfo,
		}

		_, err := p.calcRowCountPerBlock()
		assert.Error(t, err)
	})

	t.Run("normal case", func(t *testing.T) {
		collectionInfo, err := NewCollectionInfo(parquetSampleSchema(), 2, []int64{1})
		assert.NoError(t, err)

		p := &ParquetParser{
			collectionInfo: collectionInfo,
			blockSize:      10,
		}

		_, err = p.calcRowCountPerBlock()
		assert.NoError(t, err)
	})
}

func TestParquetParser_LessField(t *testing.T) {
	paramtable.Init()
	filePath := "/tmp/less_field.parquet"
	ctx := context.Background()
	schema := parquetSampleSchema()
	idAllocator := newIDAllocator(ctx, t, nil)
	defer os.Remove(filePath)

	writeFile := func() {
		wf, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE, 0o666)
		assert.NoError(t, err)
		err = writeLessFieldParquet(wf, schema, 100)
		assert.NoError(t, err)
	}
	writeFile()

	schema = parquetSampleSchema()

	t.Run("read file", func(t *testing.T) {
		cm := createLocalChunkManager(t)
		flushFunc := func(fields BlockData, shardID int, partID int64) error {
			return nil
		}
		collectionInfo, err := NewCollectionInfo(schema, 2, []int64{1})
		assert.NoError(t, err)

		updateProgress := func(percent int64) {
			assert.Greater(t, percent, int64(0))
		}

		// parquet schema sizePerRecord = 5296
		parquetParser, err := NewParquetParser(ctx, collectionInfo, idAllocator, 102400, cm, filePath, flushFunc, updateProgress)
		assert.NoError(t, err)
		defer parquetParser.Close()
		err = parquetParser.Parse()
		assert.Error(t, err)
	})
}

func TestParquetParser_MoreField(t *testing.T) {
	paramtable.Init()
	filePath := "/tmp/more_field.parquet"
	ctx := context.Background()
	schema := parquetSampleSchema()
	idAllocator := newIDAllocator(ctx, t, nil)
	defer os.Remove(filePath)

	writeFile := func() {
		wf, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE, 0o666)
		assert.NoError(t, err)
		err = writeMoreFieldParquet(wf, schema, 100)
		assert.NoError(t, err)
	}
	writeFile()

	schema = parquetSampleSchema()

	t.Run("read file", func(t *testing.T) {
		cm := createLocalChunkManager(t)
		flushFunc := func(fields BlockData, shardID int, partID int64) error {
			return nil
		}
		collectionInfo, err := NewCollectionInfo(schema, 2, []int64{1})
		assert.NoError(t, err)

		updateProgress := func(percent int64) {
			assert.Greater(t, percent, int64(0))
		}

		// parquet schema sizePerRecord = 5296
		parquetParser, err := NewParquetParser(ctx, collectionInfo, idAllocator, 102400, cm, filePath, flushFunc, updateProgress)
		assert.NoError(t, err)
		defer parquetParser.Close()
		err = parquetParser.Parse()
		assert.Error(t, err)
	})
}