// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package importutil

import (
	"context"
	"encoding/json"
	"fmt"
	"path"
	"runtime/debug"
	"strconv"
	"strings"

	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"

	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/internal/allocator"
	"github.com/milvus-io/milvus/internal/storage"
	"github.com/milvus-io/milvus/pkg/common"
	"github.com/milvus-io/milvus/pkg/log"
	"github.com/milvus-io/milvus/pkg/util/merr"
	"github.com/milvus-io/milvus/pkg/util/typeutil"
)

type (
	BlockData map[storage.FieldID]storage.FieldData // a map of field ID to field data
	ShardData map[int64]BlockData                   // a map of partition ID to block data
)
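
// Illustrative sketch: a ShardData value nests partition ID -> field ID -> column
// data, so one column of a block is addressed as shards[partitionID][fieldID].
// With a hypothetical partition ID 100 holding an Int64 field 101:
//
//	shard := ShardData{
//		100: BlockData{
//			101: &storage.Int64FieldData{Data: []int64{1, 2, 3}},
//		},
//	}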

func isCanceled(ctx context.Context) bool {
	// canceled?
	select {
	case <-ctx.Done():
		return true
	default:
		break
	}
	return false
}

func initBlockData(collectionSchema *schemapb.CollectionSchema) BlockData {
	blockData := make(BlockData)
	// rowID field is a hidden field with fieldID=0, it is always auto-generated by IDAllocator
	// if primary key is int64 and autoID=true, primary key field is equal to rowID field
	blockData[common.RowIDField] = &storage.Int64FieldData{
		Data: make([]int64, 0),
	}

	for i := 0; i < len(collectionSchema.Fields); i++ {
		schema := collectionSchema.Fields[i]
		switch schema.DataType {
		case schemapb.DataType_Bool:
			blockData[schema.GetFieldID()] = &storage.BoolFieldData{
				Data: make([]bool, 0),
			}
		case schemapb.DataType_Float:
			blockData[schema.GetFieldID()] = &storage.FloatFieldData{
				Data: make([]float32, 0),
			}
		case schemapb.DataType_Double:
			blockData[schema.GetFieldID()] = &storage.DoubleFieldData{
				Data: make([]float64, 0),
			}
		case schemapb.DataType_Int8:
			blockData[schema.GetFieldID()] = &storage.Int8FieldData{
				Data: make([]int8, 0),
			}
		case schemapb.DataType_Int16:
			blockData[schema.GetFieldID()] = &storage.Int16FieldData{
				Data: make([]int16, 0),
			}
		case schemapb.DataType_Int32:
			blockData[schema.GetFieldID()] = &storage.Int32FieldData{
				Data: make([]int32, 0),
			}
		case schemapb.DataType_Int64:
			blockData[schema.GetFieldID()] = &storage.Int64FieldData{
				Data: make([]int64, 0),
			}
		case schemapb.DataType_BinaryVector:
			dim, _ := getFieldDimension(schema)
			blockData[schema.GetFieldID()] = &storage.BinaryVectorFieldData{
				Data: make([]byte, 0),
				Dim:  dim,
			}
		case schemapb.DataType_FloatVector:
			dim, _ := getFieldDimension(schema)
			blockData[schema.GetFieldID()] = &storage.FloatVectorFieldData{
				Data: make([]float32, 0),
				Dim:  dim,
			}
		case schemapb.DataType_String, schemapb.DataType_VarChar:
			blockData[schema.GetFieldID()] = &storage.StringFieldData{
				Data: make([]string, 0),
			}
		case schemapb.DataType_JSON:
			blockData[schema.GetFieldID()] = &storage.JSONFieldData{
				Data: make([][]byte, 0),
			}
		case schemapb.DataType_Array:
			blockData[schema.GetFieldID()] = &storage.ArrayFieldData{
				Data:        make([]*schemapb.ScalarField, 0),
				ElementType: schema.GetElementType(),
			}
		default:
			log.Warn("Import util: unsupported data type", zap.String("DataType", getTypeName(schema.DataType)))
			return nil
		}
	}

	return blockData
}

func initShardData(collectionSchema *schemapb.CollectionSchema, partitionIDs []int64) ShardData {
	shardData := make(ShardData)
	for i := 0; i < len(partitionIDs); i++ {
		blockData := initBlockData(collectionSchema)
		if blockData == nil {
			return nil
		}
		shardData[partitionIDs[i]] = blockData
	}

	return shardData
}

func parseFloat(s string, bitsize int, fieldName string) (float64, error) {
	value, err := strconv.ParseFloat(s, bitsize)
	if err != nil {
		return 0, merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%s' for field '%s', error: %v", s, fieldName, err))
	}

	err = typeutil.VerifyFloat(value)
	if err != nil {
		return 0, merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%s' for field '%s', error: %v", s, fieldName, err))
	}

	return value, nil
}
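
// Illustrative usage (the field name "score" is hypothetical):
//
//	v, err := parseFloat("1.5", 32, "score") // v == 1.5, err == nil
//	_, err = parseFloat("abc", 32, "score")  // err != nil: value cannot be parsed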

// Validator is field value validator
type Validator struct {
	convertFunc func(obj interface{}, field storage.FieldData) error // convert data function
	primaryKey  bool                                                 // true for primary key
	autoID      bool                                                 // only for primary key field
	isString    bool                                                 // for string field
	dimension   int                                                  // only for vector field
	fieldName   string                                               // field name
	fieldID     int64                                                // field ID
}

// initValidators constructs validator methods and data conversion methods
func initValidators(collectionSchema *schemapb.CollectionSchema, validators map[storage.FieldID]*Validator) error {
	if collectionSchema == nil {
		return merr.WrapErrImportFailed("collection schema is nil")
	}

	for i := 0; i < len(collectionSchema.Fields); i++ {
		schema := collectionSchema.Fields[i]

		validators[schema.GetFieldID()] = &Validator{}
		validators[schema.GetFieldID()].primaryKey = schema.GetIsPrimaryKey()
		validators[schema.GetFieldID()].autoID = schema.GetAutoID()
		validators[schema.GetFieldID()].fieldName = schema.GetName()
		validators[schema.GetFieldID()].fieldID = schema.GetFieldID()
		validators[schema.GetFieldID()].isString = false

		switch schema.DataType {
		case schemapb.DataType_Bool:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if value, ok := obj.(bool); ok {
					field.(*storage.BoolFieldData).Data = append(field.(*storage.BoolFieldData).Data, value)
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for bool type field '%s'", obj, schema.GetName()))
				}

				return nil
			}
		case schemapb.DataType_Float:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if num, ok := obj.(json.Number); ok {
					value, err := parseFloat(string(num), 32, schema.GetName())
					if err != nil {
						return err
					}
					field.(*storage.FloatFieldData).Data = append(field.(*storage.FloatFieldData).Data, float32(value))
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for float type field '%s'", obj, schema.GetName()))
				}

				return nil
			}
		case schemapb.DataType_Double:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if num, ok := obj.(json.Number); ok {
					value, err := parseFloat(string(num), 64, schema.GetName())
					if err != nil {
						return err
					}
					field.(*storage.DoubleFieldData).Data = append(field.(*storage.DoubleFieldData).Data, value)
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for double type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_Int8:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if num, ok := obj.(json.Number); ok {
					value, err := strconv.ParseInt(string(num), 0, 8)
					if err != nil {
						return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%v' for int8 field '%s', error: %v", num, schema.GetName(), err))
					}
					field.(*storage.Int8FieldData).Data = append(field.(*storage.Int8FieldData).Data, int8(value))
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int8 type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_Int16:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if num, ok := obj.(json.Number); ok {
					value, err := strconv.ParseInt(string(num), 0, 16)
					if err != nil {
						return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%v' for int16 field '%s', error: %v", num, schema.GetName(), err))
					}
					field.(*storage.Int16FieldData).Data = append(field.(*storage.Int16FieldData).Data, int16(value))
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int16 type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_Int32:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if num, ok := obj.(json.Number); ok {
					value, err := strconv.ParseInt(string(num), 0, 32)
					if err != nil {
						return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%v' for int32 field '%s', error: %v", num, schema.GetName(), err))
					}
					field.(*storage.Int32FieldData).Data = append(field.(*storage.Int32FieldData).Data, int32(value))
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int32 type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_Int64:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if num, ok := obj.(json.Number); ok {
					value, err := strconv.ParseInt(string(num), 0, 64)
					if err != nil {
						return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%v' for int64 field '%s', error: %v", num, schema.GetName(), err))
					}
					field.(*storage.Int64FieldData).Data = append(field.(*storage.Int64FieldData).Data, value)
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int64 type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_BinaryVector:
			dim, err := getFieldDimension(schema)
			if err != nil {
				return err
			}
			validators[schema.GetFieldID()].dimension = dim

			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				arr, ok := obj.([]interface{})
				if !ok {
					return merr.WrapErrImportFailed(fmt.Sprintf("'%v' is not an array for binary vector field '%s'", obj, schema.GetName()))
				}
				// we use uint8 to represent binary vector in json file, each uint8 value represents 8 dimensions.
				if len(arr)*8 != dim {
					return merr.WrapErrImportFailed(fmt.Sprintf("bit size %d doesn't equal to vector dimension %d of field '%s'", len(arr)*8, dim, schema.GetName()))
				}

				for i := 0; i < len(arr); i++ {
					if num, ok := arr[i].(json.Number); ok {
						value, err := strconv.ParseUint(string(num), 0, 8)
						if err != nil {
							return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%v' for binary vector field '%s', error: %v", num, schema.GetName(), err))
						}
						field.(*storage.BinaryVectorFieldData).Data = append(field.(*storage.BinaryVectorFieldData).Data, byte(value))
					} else {
						return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for binary vector field '%s'", obj, schema.GetName()))
					}
				}

				return nil
			}
		case schemapb.DataType_FloatVector:
			dim, err := getFieldDimension(schema)
			if err != nil {
				return err
			}
			validators[schema.GetFieldID()].dimension = dim

			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				arr, ok := obj.([]interface{})
				if !ok {
					return merr.WrapErrImportFailed(fmt.Sprintf("'%v' is not an array for float vector field '%s'", obj, schema.GetName()))
				}
				if len(arr) != dim {
					return merr.WrapErrImportFailed(fmt.Sprintf("array size %d doesn't equal to vector dimension %d of field '%s'", len(arr), dim, schema.GetName()))
				}

				for i := 0; i < len(arr); i++ {
					if num, ok := arr[i].(json.Number); ok {
						value, err := parseFloat(string(num), 32, schema.GetName())
						if err != nil {
							return err
						}
						field.(*storage.FloatVectorFieldData).Data = append(field.(*storage.FloatVectorFieldData).Data, float32(value))
					} else {
						return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for float vector field '%s'", obj, schema.GetName()))
					}
				}

				return nil
			}
		case schemapb.DataType_String, schemapb.DataType_VarChar:
			validators[schema.GetFieldID()].isString = true

			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				if value, ok := obj.(string); ok {
					field.(*storage.StringFieldData).Data = append(field.(*storage.StringFieldData).Data, value)
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for varchar type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_JSON:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				// for JSON data, we accept two kinds of input: string and map[string]interface
				// user can write JSON content as {"FieldJSON": "{\"x\": 8}"} or {"FieldJSON": {"x": 8}}
				if value, ok := obj.(string); ok {
					var dummy interface{}
					err := json.Unmarshal([]byte(value), &dummy)
					if err != nil {
						return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value '%v' for JSON field '%s', error: %v", value, schema.GetName(), err))
					}
					field.(*storage.JSONFieldData).Data = append(field.(*storage.JSONFieldData).Data, []byte(value))
				} else if mp, ok := obj.(map[string]interface{}); ok {
					bs, err := json.Marshal(mp)
					if err != nil {
						return merr.WrapErrImportFailed(fmt.Sprintf("failed to parse value for JSON field '%s', error: %v", schema.GetName(), err))
					}
					field.(*storage.JSONFieldData).Data = append(field.(*storage.JSONFieldData).Data, bs)
				} else {
					return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for JSON type field '%s'", obj, schema.GetName()))
				}
				return nil
			}
		case schemapb.DataType_Array:
			validators[schema.GetFieldID()].convertFunc = func(obj interface{}, field storage.FieldData) error {
				arr, ok := obj.([]interface{})
				if !ok {
					return merr.WrapErrImportFailed(fmt.Sprintf("'%v' is not an array for array field '%s'", obj, schema.GetName()))
				}

				return getArrayElementData(schema, arr, field)
			}
		default:
			return merr.WrapErrImportFailed(fmt.Sprintf("unsupported data type: %s", getTypeName(collectionSchema.Fields[i].DataType)))
		}
	}

	return nil
}
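
// Illustrative usage of the validators map built above (the schema variable,
// JSON value and field ID 101 are hypothetical):
//
//	validators := make(map[storage.FieldID]*Validator)
//	if err := initValidators(collectionSchema, validators); err != nil {
//		return err
//	}
//	// convert one parsed JSON value into the in-memory column of field 101
//	err := validators[101].convertFunc(jsonValue, blockData[101])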

func getArrayElementData(schema *schemapb.FieldSchema, arr []interface{}, field storage.FieldData) error {
	switch schema.GetElementType() {
	case schemapb.DataType_Bool:
		boolData := make([]bool, 0)
		for i := 0; i < len(arr); i++ {
			if value, ok := arr[i].(bool); ok {
				boolData = append(boolData, value)
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for bool array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_BoolData{
				BoolData: &schemapb.BoolArray{
					Data: boolData,
				},
			},
		})
	case schemapb.DataType_Int8:
		int8Data := make([]int32, 0)
		for i := 0; i < len(arr); i++ {
			if num, ok := arr[i].(json.Number); ok {
				value, err := strconv.ParseInt(string(num), 0, 8)
				if err != nil {
					return err
				}
				int8Data = append(int8Data, int32(value))
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_IntData{
				IntData: &schemapb.IntArray{
					Data: int8Data,
				},
			},
		})

	case schemapb.DataType_Int16:
		int16Data := make([]int32, 0)
		for i := 0; i < len(arr); i++ {
			if num, ok := arr[i].(json.Number); ok {
				value, err := strconv.ParseInt(string(num), 0, 16)
				if err != nil {
					return err
				}
				int16Data = append(int16Data, int32(value))
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_IntData{
				IntData: &schemapb.IntArray{
					Data: int16Data,
				},
			},
		})
	case schemapb.DataType_Int32:
		intData := make([]int32, 0)
		for i := 0; i < len(arr); i++ {
			if num, ok := arr[i].(json.Number); ok {
				value, err := strconv.ParseInt(string(num), 0, 32)
				if err != nil {
					return err
				}
				intData = append(intData, int32(value))
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for int array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_IntData{
				IntData: &schemapb.IntArray{
					Data: intData,
				},
			},
		})
	case schemapb.DataType_Int64:
		longData := make([]int64, 0)
		for i := 0; i < len(arr); i++ {
			if num, ok := arr[i].(json.Number); ok {
				value, err := strconv.ParseInt(string(num), 0, 64)
				if err != nil {
					return err
				}
				longData = append(longData, value)
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for long array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_LongData{
				LongData: &schemapb.LongArray{
					Data: longData,
				},
			},
		})
	case schemapb.DataType_Float:
		floatData := make([]float32, 0)
		for i := 0; i < len(arr); i++ {
			if num, ok := arr[i].(json.Number); ok {
				value, err := parseFloat(string(num), 32, schema.GetName())
				if err != nil {
					return err
				}
				floatData = append(floatData, float32(value))
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for float array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_FloatData{
				FloatData: &schemapb.FloatArray{
					Data: floatData,
				},
			},
		})
	case schemapb.DataType_Double:
		doubleData := make([]float64, 0)
		for i := 0; i < len(arr); i++ {
			if num, ok := arr[i].(json.Number); ok {
				// parse with 64-bit precision since the element type is double
				value, err := parseFloat(string(num), 64, schema.GetName())
				if err != nil {
					return err
				}
				doubleData = append(doubleData, value)
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for double array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, &schemapb.ScalarField{
			Data: &schemapb.ScalarField_DoubleData{
				DoubleData: &schemapb.DoubleArray{
					Data: doubleData,
				},
			},
		})
	case schemapb.DataType_String, schemapb.DataType_VarChar:
		stringFieldData := &schemapb.ScalarField{
			Data: &schemapb.ScalarField_StringData{
				StringData: &schemapb.StringArray{
					Data: make([]string, 0),
				},
			},
		}
		for i := 0; i < len(arr); i++ {
			if str, ok := arr[i].(string); ok {
				stringFieldData.GetStringData().Data = append(stringFieldData.GetStringData().Data, str)
			} else {
				return merr.WrapErrImportFailed(fmt.Sprintf("illegal value '%v' for string array field '%s'", arr, schema.GetName()))
			}
		}
		field.(*storage.ArrayFieldData).Data = append(field.(*storage.ArrayFieldData).Data, stringFieldData)
	default:
		return merr.WrapErrImportFailed(fmt.Sprintf("unsupported element type: %v", getTypeName(schema.GetElementType())))
	}
	return nil
}

func printFieldsDataInfo(fieldsData BlockData, msg string, files []string) {
	stats := make([]zapcore.Field, 0)
	for k, v := range fieldsData {
		stats = append(stats, zap.Int(strconv.FormatInt(k, 10), v.RowNum()))
	}

	if len(files) > 0 {
		stats = append(stats, zap.Any(Files, files))
	}
	log.Info(msg, stats...)
}

// GetFileNameAndExt extracts file name and extension
// for example: "/a/b/c.ttt" returns "c" and ".ttt"
func GetFileNameAndExt(filePath string) (string, string) {
	fileName := path.Base(filePath)
	fileType := path.Ext(fileName)
	fileNameWithoutExt := strings.TrimSuffix(fileName, fileType)
	return fileNameWithoutExt, fileType
}

// getFieldDimension gets the dimension of a vector field
func getFieldDimension(schema *schemapb.FieldSchema) (int, error) {
	for _, kvPair := range schema.GetTypeParams() {
		key, value := kvPair.GetKey(), kvPair.GetValue()
		if key == common.DimKey {
			dim, err := strconv.Atoi(value)
			if err != nil {
				return 0, merr.WrapErrImportFailed(fmt.Sprintf("illegal vector dimension '%s' for field '%s', error: %v", value, schema.GetName(), err))
			}
			return dim, nil
		}
	}

	return 0, merr.WrapErrImportFailed(fmt.Sprintf("vector dimension is not defined for field '%s'", schema.GetName()))
}
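
// Illustrative: the dimension is read from the field's type params keyed by
// common.DimKey, e.g. for a hypothetical 128-dim float vector field:
//
//	field := &schemapb.FieldSchema{
//		Name:       "embedding",
//		DataType:   schemapb.DataType_FloatVector,
//		TypeParams: []*commonpb.KeyValuePair{{Key: common.DimKey, Value: "128"}},
//	}
//	dim, err := getFieldDimension(field) // dim == 128, err == nil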

// triggerGC triggers the Go garbage collector to return all free memory back to the underlying system at once.
// Note: this operation is expensive, and can lead to latency spikes as it holds the heap lock through the whole process
func triggerGC() {
	debug.FreeOSMemory()
}

// if the user didn't provide dynamic data, fill the dynamic field with "{}"
func fillDynamicData(blockData BlockData, collectionSchema *schemapb.CollectionSchema) error {
	if !collectionSchema.GetEnableDynamicField() {
		return nil
	}

	dynamicFieldID := int64(-1)
	for i := 0; i < len(collectionSchema.Fields); i++ {
		schema := collectionSchema.Fields[i]
		if schema.GetIsDynamic() {
			dynamicFieldID = schema.GetFieldID()
			break
		}
	}

	if dynamicFieldID < 0 {
		return merr.WrapErrImportFailed("the collection schema is dynamic but dynamic field is not found")
	}

	rowCount := 0
	if len(blockData) > 0 {
		for id, v := range blockData {
			if id == dynamicFieldID {
				continue
			}
			rowCount = v.RowNum()
		}
	}

	dynamicData, ok := blockData[dynamicFieldID]
	if !ok || dynamicData == nil {
		// dynamic field data is not provided, create new one
		dynamicData = &storage.JSONFieldData{
			Data: make([][]byte, 0),
		}
	}

	if dynamicData.RowNum() < rowCount {
		// fill the dynamic data with empty JSON objects, make sure the row count is equal to other fields
		data := dynamicData.(*storage.JSONFieldData)
		bs := []byte("{}")
		dynamicRowCount := dynamicData.RowNum()
		for i := 0; i < rowCount-dynamicRowCount; i++ {
			data.Data = append(data.Data, bs)
		}
	}

	blockData[dynamicFieldID] = dynamicData

	return nil
}
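
// Illustrative effect, assuming a hypothetical dynamic field ID 102:
//
//	// other columns hold 3 rows, the dynamic column holds 1 row
//	_ = fillDynamicData(blockData, collectionSchema)
//	// blockData[102] now holds 3 rows: the provided one plus two "{}" rows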

// tryFlushBlocks does two things:
// 1. if the accumulated data of a block exceeds blockSize, call callFlushFunc to generate a new binlog file
// 2. if the total accumulated data exceeds maxTotalSize, call callFlushFunc to flush the biggest block
func tryFlushBlocks(ctx context.Context,
	shardsData []ShardData,
	collectionSchema *schemapb.CollectionSchema,
	callFlushFunc ImportFlushFunc,
	blockSize int64,
	maxTotalSize int64,
	force bool,
) error {
	totalSize := 0
	biggestSize := 0
	biggestItem := -1
	biggestPartition := int64(-1)

	// 1. if the accumulated data of a block exceeds blockSize, call callFlushFunc to generate a new binlog file
	for i := 0; i < len(shardsData); i++ {
		// outside context might be canceled(service stop, or future enhancement for canceling import task)
		if isCanceled(ctx) {
			log.Warn("Import util: import task was canceled")
			return merr.WrapErrImportFailed("import task was canceled")
		}

		shardData := shardsData[i]
		for partitionID, blockData := range shardData {
			err := fillDynamicData(blockData, collectionSchema)
			if err != nil {
				log.Warn("Import util: failed to fill dynamic field", zap.Error(err))
				return merr.WrapErrImportFailed(fmt.Sprintf("failed to fill dynamic field, error: %v", err))
			}

			// Note: even if rowCount is 0, the size is still non-zero
			size := 0
			rowCount := 0
			for _, fieldData := range blockData {
				size += fieldData.GetMemorySize()
				rowCount = fieldData.RowNum()
			}

			// force to flush, called at the end of Read()
			if force && rowCount > 0 {
				printFieldsDataInfo(blockData, "import util: prepare to force flush a block", nil)
				err := callFlushFunc(blockData, i, partitionID)
				if err != nil {
					log.Warn("Import util: failed to force flush block data", zap.Int("shardID", i),
						zap.Int64("partitionID", partitionID), zap.Error(err))
					return merr.WrapErrImportFailed(fmt.Sprintf("failed to force flush block data for shard id %d to partition %d, error: %v", i, partitionID, err))
				}
				log.Info("Import util: force flush", zap.Int("rowCount", rowCount), zap.Int("size", size),
					zap.Int("shardID", i), zap.Int64("partitionID", partitionID))

				shardData[partitionID] = initBlockData(collectionSchema)
				if shardData[partitionID] == nil {
					log.Warn("Import util: failed to initialize FieldData list", zap.Int("shardID", i), zap.Int64("partitionID", partitionID))
					return merr.WrapErrImportFailed(fmt.Sprintf("failed to initialize FieldData list for shard id %d to partition %d", i, partitionID))
				}
				continue
			}

			// if segment size is larger than predefined blockSize, flush to create a new binlog file
			// initialize a new FieldData list for next round batch read
			if size > int(blockSize) && rowCount > 0 {
				printFieldsDataInfo(blockData, "import util: prepare to flush block larger than blockSize", nil)
				err := callFlushFunc(blockData, i, partitionID)
				if err != nil {
					log.Warn("Import util: failed to flush block data", zap.Int("shardID", i),
						zap.Int64("partitionID", partitionID), zap.Error(err))
					return merr.WrapErrImportFailed(fmt.Sprintf("failed to flush block data for shard id %d to partition %d, error: %v", i, partitionID, err))
				}
				log.Info("Import util: block size exceeds limit and flush", zap.Int("rowCount", rowCount), zap.Int("size", size),
					zap.Int("shardID", i), zap.Int64("partitionID", partitionID), zap.Int64("blockSize", blockSize))

				shardData[partitionID] = initBlockData(collectionSchema)
				if shardData[partitionID] == nil {
					log.Warn("Import util: failed to initialize FieldData list", zap.Int("shardID", i), zap.Int64("partitionID", partitionID))
					return merr.WrapErrImportFailed(fmt.Sprintf("failed to initialize FieldData list for shard id %d to partition %d", i, partitionID))
				}
				continue
			}

			// calculate the total size (ignore the flushed blocks)
			// find out the biggest block for step 2
			totalSize += size
			if size > biggestSize {
				biggestSize = size
				biggestItem = i
				biggestPartition = partitionID
			}
		}
	}

	// 2. if the total accumulated data exceeds maxTotalSize, call callFlushFunc to flush the biggest block
	if totalSize > int(maxTotalSize) && biggestItem >= 0 && biggestPartition >= 0 {
		// outside context might be canceled(service stop, or future enhancement for canceling import task)
		if isCanceled(ctx) {
			log.Warn("Import util: import task was canceled")
			return merr.WrapErrImportFailed("import task was canceled")
		}

		blockData := shardsData[biggestItem][biggestPartition]
		err := fillDynamicData(blockData, collectionSchema)
		if err != nil {
			log.Warn("Import util: failed to fill dynamic field", zap.Error(err))
			return merr.WrapErrImportFailed(fmt.Sprintf("failed to fill dynamic field, error: %v", err))
		}

		// Note: even if rowCount is 0, the size is still non-zero
		size := 0
		rowCount := 0
		for _, fieldData := range blockData {
			size += fieldData.GetMemorySize()
			rowCount = fieldData.RowNum()
		}

		if rowCount > 0 {
			printFieldsDataInfo(blockData, "import util: prepare to flush biggest block", nil)
			err = callFlushFunc(blockData, biggestItem, biggestPartition)
			if err != nil {
				log.Warn("Import util: failed to flush biggest block data", zap.Int("shardID", biggestItem),
					zap.Int64("partitionID", biggestPartition))
				return merr.WrapErrImportFailed(fmt.Sprintf("failed to flush biggest block data for shard id %d to partition %d, error: %v",
					biggestItem, biggestPartition, err))
			}
			log.Info("Import util: total size exceeds limit and flush", zap.Int("rowCount", rowCount),
				zap.Int("size", size), zap.Int("totalSize", totalSize), zap.Int("shardID", biggestItem))

			shardsData[biggestItem][biggestPartition] = initBlockData(collectionSchema)
			if shardsData[biggestItem][biggestPartition] == nil {
				log.Warn("Import util: failed to initialize FieldData list", zap.Int("shardID", biggestItem),
					zap.Int64("partitionID", biggestPartition))
				return merr.WrapErrImportFailed(fmt.Sprintf("failed to initialize FieldData list for shard id %d to partition %d", biggestItem, biggestPartition))
			}
		}
	}

	return nil
}
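
// Illustrative call site (a minimal sketch; flushFunc, shards, schema and the size
// limits below are hypothetical). The flush callback receives one block plus its
// shard index and partition ID, matching how callFlushFunc is invoked above:
//
//	flushFunc := func(fields BlockData, shardID int, partitionID int64) error {
//		// persist the block as a binlog file here
//		return nil
//	}
//	err := tryFlushBlocks(ctx, shards, schema, flushFunc, 16*1024*1024, 256*1024*1024, false)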

func getTypeName(dt schemapb.DataType) string {
	switch dt {
	case schemapb.DataType_Bool:
		return "Bool"
	case schemapb.DataType_Int8:
		return "Int8"
	case schemapb.DataType_Int16:
		return "Int16"
	case schemapb.DataType_Int32:
		return "Int32"
	case schemapb.DataType_Int64:
		return "Int64"
	case schemapb.DataType_Float:
		return "Float"
	case schemapb.DataType_Double:
		return "Double"
	case schemapb.DataType_VarChar:
		return "Varchar"
	case schemapb.DataType_String:
		return "String"
	case schemapb.DataType_BinaryVector:
		return "BinaryVector"
	case schemapb.DataType_FloatVector:
		return "FloatVector"
	case schemapb.DataType_JSON:
		return "JSON"
	default:
		return "InvalidType"
	}
}

func pkToShard(pk interface{}, shardNum uint32) (uint32, error) {
	var shard uint32
	strPK, ok := pk.(string)
	if ok {
		hash := typeutil.HashString2Uint32(strPK)
		shard = hash % shardNum
	} else {
		intPK, ok := pk.(int64)
		if !ok {
			log.Warn("parser: primary key field must be int64 or varchar")
			return 0, merr.WrapErrImportFailed("primary key field must be int64 or varchar")
		}
		hash, _ := typeutil.Hash32Int64(intPK)
		shard = hash % shardNum
	}

	return shard, nil
}
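
// Illustrative: a primary key is hashed and then reduced modulo the shard count,
// so equal keys always land on the same shard (the key values below are hypothetical):
//
//	shard, err := pkToShard("user-42", 16)   // varchar PK -> one of shards 0..15
//	shard, err = pkToShard(int64(12345), 16) // int64 PK   -> one of shards 0..15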

func UpdateKVInfo(infos *[]*commonpb.KeyValuePair, k string, v string) error {
	if infos == nil {
		return merr.WrapErrImportFailed("Import util: kv array pointer is nil")
	}

	found := false
	for _, kv := range *infos {
		if kv.GetKey() == k {
			kv.Value = v
			found = true
		}
	}
	if !found {
		*infos = append(*infos, &commonpb.KeyValuePair{Key: k, Value: v})
	}

	return nil
}
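
// Illustrative: UpdateKVInfo overwrites an existing key or appends a new pair
// (the keys and values below are hypothetical):
//
//	kvs := []*commonpb.KeyValuePair{{Key: "backup", Value: "false"}}
//	_ = UpdateKVInfo(&kvs, "backup", "true") // kvs[0].Value is now "true"
//	_ = UpdateKVInfo(&kvs, "mode", "row")    // appends {Key: "mode", Value: "row"}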

// appendFunc returns a function that appends one row of source data to the target storage.FieldData
func appendFunc(schema *schemapb.FieldSchema) func(src storage.FieldData, n int, target storage.FieldData) error {
	switch schema.DataType {
	case schemapb.DataType_Bool:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.BoolFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(bool))
			return nil
		}
	case schemapb.DataType_Float:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.FloatFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(float32))
			return nil
		}
	case schemapb.DataType_Double:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.DoubleFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(float64))
			return nil
		}
	case schemapb.DataType_Int8:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.Int8FieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(int8))
			return nil
		}
	case schemapb.DataType_Int16:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.Int16FieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(int16))
			return nil
		}
	case schemapb.DataType_Int32:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.Int32FieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(int32))
			return nil
		}
	case schemapb.DataType_Int64:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.Int64FieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(int64))
			return nil
		}
	case schemapb.DataType_BinaryVector:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.BinaryVectorFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).([]byte)...)
			return nil
		}
	case schemapb.DataType_FloatVector:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.FloatVectorFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).([]float32)...)
			return nil
		}
	case schemapb.DataType_String, schemapb.DataType_VarChar:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.StringFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(string))
			return nil
		}
	case schemapb.DataType_JSON:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.JSONFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).([]byte))
			return nil
		}
	case schemapb.DataType_Array:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.ArrayFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(*schemapb.ScalarField))
			return nil
		}

	default:
		return nil
	}
}
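
// Illustrative: copy row 3 of a source Int64 column into a target column
// (src and target are hypothetical in-memory columns of the same field):
//
//	copyRow := appendFunc(schema) // schema.DataType == schemapb.DataType_Int64
//	if copyRow != nil {
//		_ = copyRow(src, 3, target) // appends src.GetRow(3) to target
//	}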

func prepareAppendFunctions(collectionInfo *CollectionInfo) (map[string]func(src storage.FieldData, n int, target storage.FieldData) error, error) {
	appendFunctions := make(map[string]func(src storage.FieldData, n int, target storage.FieldData) error)
	for i := 0; i < len(collectionInfo.Schema.Fields); i++ {
		schema := collectionInfo.Schema.Fields[i]
		appendFuncErr := appendFunc(schema)
		if appendFuncErr == nil {
			log.Warn("parser: unsupported field data type")
			return nil, fmt.Errorf("unsupported field data type: %d", schema.GetDataType())
		}
		appendFunctions[schema.GetName()] = appendFuncErr
	}
	return appendFunctions, nil
}

// checkRowCount checks the row count of each field; all fields must have the same row count
func checkRowCount(collectionInfo *CollectionInfo, fieldsData BlockData) (int, error) {
	rowCount := 0
	rowCounter := make(map[string]int)
	for i := 0; i < len(collectionInfo.Schema.Fields); i++ {
		schema := collectionInfo.Schema.Fields[i]
		if !schema.GetAutoID() {
			v, ok := fieldsData[schema.GetFieldID()]
			if !ok {
				if schema.GetIsDynamic() {
					// user might not provide numpy file for dynamic field, skip it, will auto-generate later
					continue
				}
				log.Warn("field not provided", zap.String("fieldName", schema.GetName()))
				return 0, fmt.Errorf("field '%s' not provided", schema.GetName())
			}
			rowCounter[schema.GetName()] = v.RowNum()
			if v.RowNum() > rowCount {
				rowCount = v.RowNum()
			}
		}
	}

	for name, count := range rowCounter {
		if count != rowCount {
			log.Warn("field row count is not equal to other fields row count", zap.String("fieldName", name),
				zap.Int("rowCount", count), zap.Int("otherRowCount", rowCount))
			return 0, fmt.Errorf("field '%s' row count %d is not equal to other fields row count: %d", name, count, rowCount)
		}
	}

	return rowCount, nil
}

// hashToPartition hashes the partition key to get a partition ID, and returns the first partition ID if no partition key exists
// CollectionInfo ensures there is only one partition ID in PartitionIDs if no partition key exists
func hashToPartition(collectionInfo *CollectionInfo, fieldsData BlockData, rowNumber int) (int64, error) {
	if collectionInfo.PartitionKey == nil {
		// no partition key, directly return the target partition id
		if len(collectionInfo.PartitionIDs) != 1 {
			return 0, fmt.Errorf("collection '%s' partition list is empty", collectionInfo.Schema.Name)
		}
		return collectionInfo.PartitionIDs[0], nil
	}

	partitionKeyID := collectionInfo.PartitionKey.GetFieldID()
	fieldData := fieldsData[partitionKeyID]
	value := fieldData.GetRow(rowNumber)
	index, err := pkToShard(value, uint32(len(collectionInfo.PartitionIDs)))
	if err != nil {
		return 0, err
	}

	return collectionInfo.PartitionIDs[index], nil
}

// splitFieldsData splits the in-memory data (parsed from column-based files) into shards
func splitFieldsData(collectionInfo *CollectionInfo, fieldsData BlockData, shards []ShardData, rowIDAllocator *allocator.IDAllocator) ([]int64, error) {
	if len(fieldsData) == 0 {
		log.Warn("fields data to split is empty")
		return nil, fmt.Errorf("fields data to split is empty")
	}

	if len(shards) != int(collectionInfo.ShardNum) {
		log.Warn("block count is not equal to collection shard number", zap.Int("shardsLen", len(shards)),
			zap.Int32("shardNum", collectionInfo.ShardNum))
		return nil, fmt.Errorf("block count %d is not equal to collection shard number %d", len(shards), collectionInfo.ShardNum)
	}

	rowCount, err := checkRowCount(collectionInfo, fieldsData)
	if err != nil {
		return nil, err
	}

	// generate auto id for primary key and rowid field
	rowIDBegin, rowIDEnd, err := rowIDAllocator.Alloc(uint32(rowCount))
	if err != nil {
		log.Warn("failed to alloc row ID", zap.Int("rowCount", rowCount), zap.Error(err))
		return nil, fmt.Errorf("failed to alloc %d rows ID, error: %w", rowCount, err)
	}

	rowIDField, ok := fieldsData[common.RowIDField]
	if !ok {
		rowIDField = &storage.Int64FieldData{
			Data: make([]int64, 0, rowCount),
		}
		fieldsData[common.RowIDField] = rowIDField
	}
	rowIDFieldArr := rowIDField.(*storage.Int64FieldData)
	for i := rowIDBegin; i < rowIDEnd; i++ {
		rowIDFieldArr.Data = append(rowIDFieldArr.Data, i)
	}

	// reset the primary keys; as we know, only an int64 pk can be auto-generated
	primaryKey := collectionInfo.PrimaryKey
	autoIDRange := make([]int64, 0)
	if primaryKey.GetAutoID() {
		log.Info("generating auto-id", zap.Int("rowCount", rowCount), zap.Int64("rowIDBegin", rowIDBegin))
		if primaryKey.GetDataType() != schemapb.DataType_Int64 {
			log.Warn("primary key field is auto-generated but the field type is not int64")
			return nil, fmt.Errorf("primary key field is auto-generated but the field type is not int64")
		}

		primaryDataArr := &storage.Int64FieldData{
			Data: make([]int64, 0, rowCount),
		}
		for i := rowIDBegin; i < rowIDEnd; i++ {
			primaryDataArr.Data = append(primaryDataArr.Data, i)
		}

		fieldsData[primaryKey.GetFieldID()] = primaryDataArr
		autoIDRange = append(autoIDRange, rowIDBegin, rowIDEnd)
	}

	// if the primary key is not auto-generated and the user doesn't provide it, return an error
	primaryData, ok := fieldsData[primaryKey.GetFieldID()]
	if !ok || primaryData.RowNum() <= 0 {
		log.Warn("primary key field is not provided", zap.String("keyName", primaryKey.GetName()))
		return nil, fmt.Errorf("primary key '%s' field data is not provided", primaryKey.GetName())
	}

	// prepare append functions
	appendFunctions, err := prepareAppendFunctions(collectionInfo)
	if err != nil {
		return nil, err
	}

	// split data into shards
	for i := 0; i < rowCount; i++ {
		// hash to a shard number and partition
		pk := primaryData.GetRow(i)
		shard, err := pkToShard(pk, uint32(collectionInfo.ShardNum))
		if err != nil {
			return nil, err
		}

		partitionID, err := hashToPartition(collectionInfo, fieldsData, i)
		if err != nil {
			return nil, err
		}

		// set rowID field
		rowIDField := shards[shard][partitionID][common.RowIDField].(*storage.Int64FieldData)
		rowIDField.Data = append(rowIDField.Data, rowIDFieldArr.GetRow(i).(int64))

		// append row to shard
		for k := 0; k < len(collectionInfo.Schema.Fields); k++ {
			schema := collectionInfo.Schema.Fields[k]
			srcData := fieldsData[schema.GetFieldID()]
			targetData := shards[shard][partitionID][schema.GetFieldID()]
			if srcData == nil && schema.GetIsDynamic() {
				// user might not provide numpy file for dynamic field, skip it, will auto-generate later
				continue
			}
			if srcData == nil || targetData == nil {
				log.Warn("cannot append data since source or target field data is nil",
					zap.String("FieldName", schema.GetName()),
					zap.Bool("sourceNil", srcData == nil), zap.Bool("targetNil", targetData == nil))
				return nil, fmt.Errorf("cannot append data for field '%s', possibly no fields correspond to this numpy file, or a required numpy file is not provided",
					schema.GetName())
			}
			appendFunc := appendFunctions[schema.GetName()]
			err := appendFunc(srcData, i, targetData)
			if err != nil {
				return nil, err
			}
		}
	}

	return autoIDRange, nil
}
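
// Illustrative: when the primary key is auto-generated, the returned slice holds the
// allocated ID range as a [begin, end) pair; otherwise it is empty (the variable
// names below are hypothetical):
//
//	autoIDRange, err := splitFieldsData(collectionInfo, fieldsData, shards, idAllocator)
//	// autoIDRange == []int64{rowIDBegin, rowIDEnd} when autoID is enabled, else []int64{}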