enhance: optimize vector offsets handling for parquet (#32822)

Issue: #22837

Signed-off-by: Cai Yudong <yudong.cai@zilliz.com>
This commit is contained in:
Cai Yudong 2024-05-09 14:43:30 +08:00 committed by GitHub
parent a06f601c6e
commit 8bb58d0460
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 26 additions and 26 deletions

View File

@ -431,7 +431,6 @@ func ReadBinaryDataForSparseFloatVector(pcr *FieldReader, count int64) (any, err
data := make([][]byte, 0, count)
maxDim := uint32(0)
for _, chunk := range chunked.Chunks() {
rows := chunk.Data().Len()
listReader := chunk.(*array.List)
offsets := listReader.Offsets()
if !isVectorAligned(offsets, pcr.dim, schemapb.DataType_SparseFloatVector) {
@ -442,9 +441,9 @@ func ReadBinaryDataForSparseFloatVector(pcr *FieldReader, count int64) (any, err
return nil, WrapTypeErr("binary", listReader.ListValues().DataType().Name(), pcr.field)
}
vecData := uint8Reader.Uint8Values()
for i := 0; i < rows; i++ {
elemCount := int((offsets[i+1] - offsets[i]) / 8)
rowVec := vecData[offsets[i]:offsets[i+1]]
for i := 1; i < len(offsets); i++ {
elemCount := int((offsets[i] - offsets[i-1]) / 8)
rowVec := vecData[offsets[i-1]:offsets[i]]
data = append(data, rowVec)
maxIdx := typeutil.SparseFloatRowIndexAt(rowVec, elemCount-1)
if maxIdx+1 > maxDim {

View File

@ -169,14 +169,14 @@ func buildArrayData(schema *schemapb.CollectionSchema, rows int) ([]arrow.Array,
builder := array.NewListBuilder(mem, &arrow.Float32Type{})
offsets := make([]int32, 0, rows)
valid := make([]bool, 0, rows)
for i := 0; i < dim*rows; i++ {
floatVecData = append(floatVecData, float32(i))
}
builder.ValueBuilder().(*array.Float32Builder).AppendValues(floatVecData, nil)
for i := 0; i < rows; i++ {
for j := 0; j < dim; j++ {
floatVecData = append(floatVecData, float32(i*dim+j))
}
offsets = append(offsets, int32(i*dim))
valid = append(valid, true)
}
builder.ValueBuilder().(*array.Float32Builder).AppendValues(floatVecData, nil)
insertData.Data[field.GetFieldID()] = &storage.FloatVectorFieldData{Data: floatVecData, Dim: dim}
builder.AppendValues(offsets, valid)
columns = append(columns, builder.NewListArray())
@ -186,14 +186,14 @@ func buildArrayData(schema *schemapb.CollectionSchema, rows int) ([]arrow.Array,
offsets := make([]int32, 0, rows)
valid := make([]bool, 0, rows)
rowBytes := dim * 2
for i := 0; i < rowBytes*rows; i++ {
float16VecData = append(float16VecData, uint8(i%256))
}
builder.ValueBuilder().(*array.Uint8Builder).AppendValues(float16VecData, nil)
for i := 0; i < rows; i++ {
offsets = append(offsets, int32(rowBytes*i))
for j := 0; j < rowBytes; j++ {
float16VecData = append(float16VecData, uint8((i+j)%256))
}
offsets = append(offsets, int32(i*rowBytes))
valid = append(valid, true)
}
builder.ValueBuilder().(*array.Uint8Builder).AppendValues(float16VecData, nil)
insertData.Data[field.GetFieldID()] = &storage.Float16VectorFieldData{Data: float16VecData, Dim: dim}
builder.AppendValues(offsets, valid)
columns = append(columns, builder.NewListArray())
@ -203,28 +203,29 @@ func buildArrayData(schema *schemapb.CollectionSchema, rows int) ([]arrow.Array,
offsets := make([]int32, 0, rows)
valid := make([]bool, 0, rows)
rowBytes := dim * 2
for i := 0; i < rowBytes*rows; i++ {
bfloat16VecData = append(bfloat16VecData, uint8(i%256))
}
builder.ValueBuilder().(*array.Uint8Builder).AppendValues(bfloat16VecData, nil)
for i := 0; i < rows; i++ {
offsets = append(offsets, int32(rowBytes*i))
for j := 0; j < rowBytes; j++ {
bfloat16VecData = append(bfloat16VecData, uint8((i+j)%256))
}
offsets = append(offsets, int32(i*rowBytes))
valid = append(valid, true)
}
builder.ValueBuilder().(*array.Uint8Builder).AppendValues(bfloat16VecData, nil)
insertData.Data[field.GetFieldID()] = &storage.BFloat16VectorFieldData{Data: bfloat16VecData, Dim: dim}
builder.AppendValues(offsets, valid)
columns = append(columns, builder.NewListArray())
case schemapb.DataType_SparseFloatVector:
sparsefloatVecData := make([]byte, 0)
builder := array.NewListBuilder(mem, &arrow.Uint8Type{})
offsets := make([]int32, 0, rows+1)
offsets := make([]int32, 0, rows)
valid := make([]bool, 0, rows)
vecData := testutils.GenerateSparseFloatVectors(rows)
offsets = append(offsets, 0)
currOffset := int32(0)
for i := 0; i < rows; i++ {
rowVecData := vecData.GetContents()[i]
sparsefloatVecData = append(sparsefloatVecData, rowVecData...)
offsets = append(offsets, offsets[i]+int32(len(rowVecData)))
offsets = append(offsets, currOffset)
currOffset = currOffset + int32(len(rowVecData))
valid = append(valid, true)
}
builder.ValueBuilder().(*array.Uint8Builder).AppendValues(sparsefloatVecData, nil)
@ -253,14 +254,14 @@ func buildArrayData(schema *schemapb.CollectionSchema, rows int) ([]arrow.Array,
offsets := make([]int32, 0, rows)
valid := make([]bool, 0)
rowBytes := dim / 8
for i := 0; i < rowBytes*rows; i++ {
binVecData = append(binVecData, uint8(i))
}
builder.ValueBuilder().(*array.Uint8Builder).AppendValues(binVecData, nil)
for i := 0; i < rows; i++ {
offsets = append(offsets, int32(rowBytes*i))
for j := 0; j < rowBytes; j++ {
binVecData = append(binVecData, uint8((i+j)%256))
}
offsets = append(offsets, int32(i*rowBytes))
valid = append(valid, true)
}
builder.ValueBuilder().(*array.Uint8Builder).AppendValues(binVecData, nil)
builder.AppendValues(offsets, valid)
columns = append(columns, builder.NewListArray())
insertData.Data[field.GetFieldID()] = &storage.BinaryVectorFieldData{Data: binVecData, Dim: dim}