milvus/internal/storage/pk_statistics.go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
	"fmt"

	"github.com/cockroachdb/errors"
	"github.com/samber/lo"

	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/internal/util/bloomfilter"
	"github.com/milvus-io/milvus/pkg/common"
)

// PkStatistics contains pk field statistic information: a bloom filter of
// the primary keys plus the minimal and maximal primary key seen so far.
type PkStatistics struct {
	PkFilter bloomfilter.BloomFilterInterface // bloom filter of pk inside a segment
	MinPK    PrimaryKey                       // minimal pk value, shortcut for checking whether a pk is inside this segment
	MaxPK    PrimaryKey                       // maximal pk value, same purpose as MinPK
}

// UpdateMinMax widens the recorded [MinPK, MaxPK] range so that it covers pk.
// A bound that has not been set yet is initialized with pk.
// It returns an error only when the receiver itself is nil.
func (st *PkStatistics) UpdateMinMax(pk PrimaryKey) error {
	if st == nil {
		return errors.New("nil pk statistics")
	}

	// Lower bound: take pk when unset or when the current minimum is greater.
	if st.MinPK == nil || st.MinPK.GT(pk) {
		st.MinPK = pk
	}
	// Upper bound: take pk when unset or when the current maximum is smaller.
	if st.MaxPK == nil || st.MaxPK.LT(pk) {
		st.MaxPK = pk
	}
	return nil
}

// UpdatePKRange feeds every primary key contained in ids into the statistics:
// each key updates the min/max range and is then added to the bloom filter.
// Only *Int64FieldData and *StringFieldData are accepted; any other field
// data type yields an error. The first error from UpdateMinMax aborts the
// update and is returned as is.
func (st *PkStatistics) UpdatePKRange(ids FieldData) error {
	switch data := ids.(type) {
	case *Int64FieldData:
		// A single scratch buffer is reused for the encoding of every int64 key.
		encoded := make([]byte, 8)
		for _, value := range data.Data {
			if err := st.UpdateMinMax(NewInt64PrimaryKey(value)); err != nil {
				return err
			}
			common.Endian.PutUint64(encoded, uint64(value))
			st.PkFilter.Add(encoded)
		}
		return nil
	case *StringFieldData:
		for _, value := range data.Data {
			if err := st.UpdateMinMax(NewVarCharPrimaryKey(value)); err != nil {
				return err
			}
			st.PkFilter.AddString(value)
		}
		return nil
	default:
		return fmt.Errorf("invalid data type for primary key: %T", ids)
	}
}

// PkExist reports whether pk may be covered by these statistics.
// It returns false when the statistics are empty or pk lies outside
// [MinPK, MaxPK]; otherwise the answer comes from the bloom filter.
// For pk types other than Int64 and VarChar it returns true, i.e. the
// caller must treat the result as a possible false positive.
func (st *PkStatistics) PkExist(pk PrimaryKey) bool {
	// Statistics that were never populated cannot contain anything.
	if st.MinPK == nil || st.MaxPK == nil || st.PkFilter == nil {
		return false
	}

	// Range check first; keep it ahead of the bloom filter lookup for now.
	if outOfRange := st.MinPK.GT(pk) || st.MaxPK.LT(pk); outOfRange {
		return false
	}

	// In range: consult the bloom filter using the type-specific encoding.
	pkType := pk.Type()
	if pkType == schemapb.DataType_Int64 {
		encoded := make([]byte, 8)
		common.Endian.PutUint64(encoded, uint64(pk.(*Int64PrimaryKey).Value))
		return st.PkFilter.Test(encoded)
	}
	if pkType == schemapb.DataType_VarChar {
		return st.PkFilter.TestString(pk.(*VarCharPrimaryKey).Value)
	}

	// TODO: other pk types are not handled; answer with a false positive.
	return true
}

// Locations returns the list of hash locations representing pk for a bloom
// filter of type bfType with k hash functions. Int64 keys are encoded into
// 8 bytes with common.Endian and VarChar keys use their raw bytes. Any other
// pk type yields nil.
func Locations(pk PrimaryKey, k uint, bfType bloomfilter.BFType) []uint64 {
	var encoded []byte
	switch pk.Type() {
	case schemapb.DataType_Int64:
		encoded = make([]byte, 8)
		common.Endian.PutUint64(encoded, uint64(pk.(*Int64PrimaryKey).Value))
	case schemapb.DataType_VarChar:
		encoded = []byte(pk.(*VarCharPrimaryKey).Value)
	default:
		// TODO: unsupported pk type, nothing to hash.
		return nil
	}
	return bloomfilter.Locations(encoded, k, bfType)
}

// TestLocationCache reports whether the pk held by lc may be covered by these
// statistics, using the hash locations cached in lc instead of hashing again.
// It returns false for empty statistics.
func (st *PkStatistics) TestLocationCache(lc *LocationsCache) bool {
	// Statistics that were never populated cannot contain anything.
	if st.MinPK == nil || st.MaxPK == nil || st.PkFilter == nil {
		return false
	}

	// The bloom filter goes first: testing locations is only bitset work and
	// is cheaper than comparing primary keys.
	filter := st.PkFilter
	cached := lc.Locations(filter.K(), filter.Type())
	if filter.TestLocations(cached) {
		// Filter hit: confirm that the pk lies inside the recorded range.
		target := lc.pk
		return st.MinPK.LE(target) && st.MaxPK.GE(target)
	}
	return false
}

// BatchPkExist is the batch form of the existence check. For every pk in lc
// whose entry in hits is still false, the entry becomes true when the bloom
// filter reports the pk and the pk lies inside [MinPK, MaxPK]. Entries that
// are already true are left untouched. hits is modified in place and
// returned; empty statistics return it unchanged.
func (st *PkStatistics) BatchPkExist(lc *BatchLocationsCache, hits []bool) []bool {
	// Statistics that were never populated cannot contribute any hit.
	if st.MinPK == nil || st.MaxPK == nil || st.PkFilter == nil {
		return hits
	}

	// The bloom filter goes first: testing locations is only bitset work and
	// is cheaper than comparing primary keys.
	filter := st.PkFilter
	cached := lc.Locations(filter.K(), filter.Type())
	inFilter := filter.BatchTestLocations(cached, hits)

	// TODO: a bit ugly, hits[i] depends on multiple bloom filters of a single
	// segment; the hits array will be removed once those filters are merged.
	keys := lc.PKs()
	for idx := range inFilter {
		if hits[idx] {
			continue
		}
		hits[idx] = inFilter[idx] && st.MinPK.LE(keys[idx]) && st.MaxPK.GE(keys[idx])
	}

	return hits
}

// LocationsCache is a helper struct caching pk bloom filter locations.
// Note that this helper is not concurrent safe and shall be used in same goroutine.
type LocationsCache struct {
	pk               PrimaryKey // primary key the cached locations belong to
	basicBFLocations []uint64   // locations cached for bloomfilter.BasicBF, recomputed when a larger k is requested
	blockBFLocations []uint64   // locations cached for bloomfilter.BlockedBF, computed once
}

// GetPk returns the primary key this cache was created for.
func (lc *LocationsCache) GetPk() PrimaryKey {
	return lc.pk
}

// Locations returns the hash locations of the cached pk for a bloom filter of
// type bfType with k hash functions, computing and caching them on demand.
// Unknown bloom filter types yield nil.
func (lc *LocationsCache) Locations(k uint, bfType bloomfilter.BFType) []uint64 {
	if bfType == bloomfilter.BasicBF {
		// Recompute only when more locations are requested than are cached;
		// a smaller k is served as a prefix of the cached slice.
		if len(lc.basicBFLocations) < int(k) {
			lc.basicBFLocations = Locations(lc.pk, k, bfType)
		}
		return lc.basicBFLocations[:k]
	}

	if bfType == bloomfilter.BlockedBF {
		// For block bf only the hash result needs caching; it is a single
		// uint computed once, whatever the value of k.
		if len(lc.blockBFLocations) != 1 {
			lc.blockBFLocations = Locations(lc.pk, 1, bfType)
		}
		return lc.blockBFLocations
	}

	return nil
}

// NewLocationsCache creates a LocationsCache for pk with no locations
// computed yet.
func NewLocationsCache(pk PrimaryKey) *LocationsCache {
	return &LocationsCache{pk: pk}
}

// BatchLocationsCache caches bloom filter locations for a batch of primary
// keys, one location slice per key.
type BatchLocationsCache struct {
	pks []PrimaryKey // primary keys the cached locations belong to
	k   uint         // largest k requested so far for the basic bf locations

	// for block bf
	blockLocations [][]uint64

	// for basic bf
	basicLocations [][]uint64
}

// PKs returns the primary keys held by the cache. The slice is returned
// without copying.
func (lc *BatchLocationsCache) PKs() []PrimaryKey {
	return lc.pks
}

// Size returns the number of primary keys held by the cache.
func (lc *BatchLocationsCache) Size() int {
	return len(lc.pks)
}

// Locations returns, for every cached pk, its hash locations for a bloom
// filter of type bfType with k hash functions. Results are computed on demand
// and cached. Unknown bloom filter types yield nil.
func (lc *BatchLocationsCache) Locations(k uint, bfType bloomfilter.BFType) [][]uint64 {
	if bfType == bloomfilter.BasicBF {
		// Recompute only when a larger k than any seen before is requested.
		if lc.k < k {
			lc.k = k
			lc.basicLocations = lo.Map(lc.pks, func(pk PrimaryKey, _ int) []uint64 {
				return Locations(pk, k, bfType)
			})
		}

		// Hand out only the first k locations of every cached entry.
		firstK := func(full []uint64, _ int) []uint64 { return full[:k] }
		return lo.Map(lc.basicLocations, firstK)
	}

	if bfType == bloomfilter.BlockedBF {
		// For block bf only the hash result needs caching; it is a single
		// uint computed once, whatever the value of k.
		if len(lc.blockLocations) != len(lc.pks) {
			hashPk := func(pk PrimaryKey, _ int) []uint64 {
				return Locations(pk, lc.k, bfType)
			}
			lc.blockLocations = lo.Map(lc.pks, hashPk)
		}
		return lc.blockLocations
	}

	return nil
}

// NewBatchLocationsCache creates a BatchLocationsCache for pks with no
// locations computed yet. The pks slice is stored without copying.
func NewBatchLocationsCache(pks []PrimaryKey) *BatchLocationsCache {
	return &BatchLocationsCache{pks: pks}
}