milvus/internal/util/distance/calc_distance.go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package distance

import (
	"errors"
	"math/bits"
	"runtime"
	"strings"
	"sync"
)

// Metric type names, all spelled in upper case.
const (
	// L2 is the metric name for the Euclidean distance.
	L2 = "L2"
	// IP is the metric name for the inner product distance.
	IP = "IP"
	// HAMMING is the metric name for the Hamming distance.
	HAMMING = "HAMMING"
	// TANIMOTO is the metric name for the Tanimoto distance.
	TANIMOTO = "TANIMOTO"
	// JACCARD is the metric name for the Jaccard distance.
	// NOTE(review): no function visible in this file accepts or computes it.
	JACCARD = "JACCARD"
	// SUPERSTRUCTURE is the metric name for the superstructure metric.
	// NOTE(review): no function visible in this file accepts or computes it.
	SUPERSTRUCTURE = "SUPERSTRUCTURE"
	// SUBSTRUCTURE is the metric name for the substructure metric.
	// NOTE(review): no function visible in this file accepts or computes it.
	SUBSTRUCTURE = "SUBSTRUCTURE"
)

// ValidateMetricType upper-cases metric and checks that it names one of the
// accepted metric types: L2, IP, HAMMING or TANIMOTO.
//
// On success it returns the upper-cased name and a nil error. An empty input
// yields an empty string and an error; any other unaccepted input is returned
// unchanged together with an error.
func ValidateMetricType(metric string) (string, error) {
	if len(metric) == 0 {
		return "", errors.New("metric type is empty")
	}

	switch upper := strings.ToUpper(metric); upper {
	case L2, IP, HAMMING, TANIMOTO:
		return upper, nil
	default:
		return metric, errors.New("invalid metric type")
	}
}

// ValidateFloatArrayLength checks that a flat float vector array holding
// length elements can be split into whole vectors of dim elements each.
//
// It returns an error when dim is not positive, when length is zero, or when
// length is not a multiple of dim; otherwise it returns nil.
func ValidateFloatArrayLength(dim int64, length int) error {
	// A zero dim would make the remainder below panic with a division by
	// zero, and a negative dim would let the remainder check pass, so both
	// are rejected up front.
	if dim <= 0 {
		return errors.New("invalid dimension")
	}

	if length == 0 || int64(length)%dim != 0 {
		return errors.New("invalid float vector length")
	}

	return nil
}

// CalcL2 returns the squared Euclidean distance between two vectors, i.e. the
// sum of the squared component differences; no square root is applied.
//
// left and right are flat arrays of dim-sized vectors, and lIndex and rIndex
// select which vector to read from each of them. The selected vectors must
// lie inside their arrays, otherwise the indexing panics.
func CalcL2(dim int64, left []float32, lIndex int64, right []float32, rIndex int64) float32 {
	var acc float32
	l, r := lIndex*dim, rIndex*dim
	for end := l + dim; l < end; l, r = l+1, r+1 {
		diff := left[l] - right[r]
		acc += diff * diff
	}

	return acc
}

// CalcIP returns the inner product of two vectors, i.e. the sum of the
// products of their corresponding components.
//
// left and right are flat arrays of dim-sized vectors, and lIndex and rIndex
// select which vector to read from each of them. The selected vectors must
// lie inside their arrays, otherwise the indexing panics.
func CalcIP(dim int64, left []float32, lIndex int64, right []float32, rIndex int64) float32 {
	var acc float32
	l, r := lIndex*dim, rIndex*dim
	for end := l + dim; l < end; l, r = l+1, r+1 {
		acc += left[l] * right[r]
	}

	return acc
}

// CalcFFBatch computes the distance between one left vector and every vector
// in right, storing the values in *result.
//
// left and right are flat arrays of dim-sized float vectors and lIndex selects
// the left vector. With rightNum = len(right)/dim, the distance to right
// vector i is written to (*result)[lIndex*rightNum+i], so *result must already
// be long enough to hold that position. metric is compared exactly against L2
// and IP; for any other value -1 is stored instead of a distance.
func CalcFFBatch(dim int64, left []float32, lIndex int64, right []float32, metric string, result *[]float32) {
	count := int64(len(right)) / dim
	rowStart := lIndex * count
	for rIndex := int64(0); rIndex < count; rIndex++ {
		var d float32
		switch metric {
		case L2:
			d = CalcL2(dim, left, lIndex, right, rIndex)
		case IP:
			d = CalcIP(dim, left, lIndex, right, rIndex)
		default:
			// Unrecognized metric: record the sentinel value.
			d = -1.0
		}
		(*result)[rowStart+rIndex] = d
	}
}

// CalcFloatDistance computes the distance between every vector in left and
// every vector in right using the given metric.
//
// left and right are flat arrays of dim-sized float vectors. metric is matched
// case-insensitively and must be L2 or IP. The result is laid out row by row:
// with rightNum = len(right)/dim, the distance between left vector i and right
// vector j is stored at index i*rightNum+j.
//
// An error is returned when dim is not positive, when metric is neither L2 nor
// IP, or when either array is empty or not a whole number of vectors.
//
// The rows are computed concurrently by at most GOMAXPROCS goroutines.
func CalcFloatDistance(dim int64, left, right []float32, metric string) ([]float32, error) {
	if dim <= 0 {
		return nil, errors.New("invalid dimension")
	}

	metricUpper := strings.ToUpper(metric)
	if metricUpper != L2 && metricUpper != IP {
		return nil, errors.New("invalid metric type")
	}

	if err := ValidateFloatArrayLength(dim, len(left)); err != nil {
		return nil, err
	}
	if err := ValidateFloatArrayLength(dim, len(right)); err != nil {
		return nil, err
	}

	leftNum := int64(len(left)) / dim
	rightNum := int64(len(right)) / dim

	distArray := make([]float32, leftNum*rightNum)

	// Use a bounded set of workers instead of one goroutine per left vector,
	// so a large input cannot spawn an unbounded number of goroutines.
	workers := int64(runtime.GOMAXPROCS(0))
	if workers > leftNum {
		workers = leftNum
	}

	var waitGroup sync.WaitGroup
	waitGroup.Add(int(workers))
	for w := int64(0); w < workers; w++ {
		go func(first int64) {
			defer waitGroup.Done()
			// Worker "first" handles rows first, first+workers, ...; each
			// row writes its own disjoint segment of distArray.
			for row := first; row < leftNum; row += workers {
				CalcFFBatch(dim, left, row, right, metricUpper, &distArray)
			}
		}(w)
	}
	waitGroup.Wait()

	return distArray, nil
}

////////////////////////////////////////////////////////////////////////////////

// SingleBitLen rounds dim up to the next multiple of 8, giving the number of
// bits one dim-bit vector occupies once padded to whole bytes. A dim that is
// already a multiple of 8 is returned unchanged.
func SingleBitLen(dim int64) int64 {
	if rem := dim % 8; rem != 0 {
		return dim + (8 - rem)
	}

	return dim
}

// VectorCount returns how many binary vectors of dim bits fit into a byte
// array of the given length, where each vector occupies SingleBitLen(dim)
// bits. It returns 0 when dim is not positive.
func VectorCount(dim int64, length int) int64 {
	// SingleBitLen(0) is 0, which would make the division below panic, and a
	// negative dim has no meaningful vector size.
	if dim <= 0 {
		return 0
	}

	// Widen before multiplying so that length*8 cannot overflow a 32-bit int.
	return int64(length) * 8 / SingleBitLen(dim)
}

// ValidateBinaryArrayLength checks that a byte array of the given length can
// be split into whole binary vectors of dim bits, each of which occupies
// SingleBitLen(dim) bits.
//
// It returns an error when dim is not positive, when length is zero, or when
// length*8 is not a multiple of SingleBitLen(dim); otherwise it returns nil.
func ValidateBinaryArrayLength(dim int64, length int) error {
	// SingleBitLen(0) is 0, which would make the remainder below panic with a
	// division by zero; negative dimensions are meaningless as well.
	if dim <= 0 {
		return errors.New("invalid dimension")
	}

	if length == 0 {
		return errors.New("invalid binary vector length")
	}

	// Widen before multiplying so that length*8 cannot overflow a 32-bit int.
	totalBitLen := int64(length) * 8
	if totalBitLen%SingleBitLen(dim) != 0 {
		return errors.New("invalid binary vector length")
	}

	return nil
}

// CountOne returns the number of bits set to 1 in n.
// For 00000010, it returns 1.
// For 11111111, it returns 8.
func CountOne(n uint8) int32 {
	// The standard library population count replaces the hand-written loop.
	return int32(bits.OnesCount8(n))
}

// CalcHamming returns the Hamming distance between two binary vectors: the
// number of bit positions in which they differ.
//
// left and right are byte arrays holding vectors of SingleBitLen(dim)/8 bytes
// each, and lIndex and rIndex select which vector to read from each of them.
// When dim is not a multiple of 8, only the highest-order bits of a vector's
// last byte are compared; the remaining low-order padding bits are ignored.
func CalcHamming(dim int64, left []byte, lIndex int64, right []byte, rIndex int64) int32 {
	bytesPerVec := SingleBitLen(dim) / 8
	lBase, rBase := lIndex*bytesPerVec, rIndex*bytesPerVec

	// Number of padding bits in the final byte. For example, dim = 11 takes
	// 2 bytes, of which the second carries 3 data bits and 5 padding bits.
	padBits := bytesPerVec*8 - dim
	last := bytesPerVec - 1

	var total int32
	for k := int64(0); k < bytesPerVec; k++ {
		diff := left[lBase+k] ^ right[rBase+k]
		if padBits > 0 && k == last {
			// Clear the padding bits so that they never count as differences.
			diff &= 255 << padBits
		}
		total += CountOne(diff)
	}

	return total
}

// CalcHammingBatch computes the Hamming distance between one left vector and
// every vector in right, storing the values in *result.
//
// lIndex selects the left vector. With rightNum = VectorCount(dim, len(right)),
// the distance to right vector i is written to (*result)[lIndex*rightNum+i],
// so *result must already be long enough to hold that position.
func CalcHammingBatch(dim int64, left []byte, lIndex int64, right []byte, result *[]int32) {
	count := VectorCount(dim, len(right))
	rowStart := lIndex * count

	for rIndex := int64(0); rIndex < count; rIndex++ {
		d := CalcHamming(dim, left, lIndex, right, rIndex)
		(*result)[rowStart+rIndex] = d
	}
}

// CalcHammingDistance computes the Hamming distance between every binary
// vector in left and every binary vector in right.
//
// left and right are byte arrays of dim-bit vectors, each vector occupying
// SingleBitLen(dim)/8 bytes. The result is laid out row by row: with rightNum
// the number of vectors in right, the distance between left vector i and
// right vector j is stored at index i*rightNum+j.
//
// An error is returned when dim is not positive, or when either array is
// empty or not a whole number of vectors.
//
// The rows are computed concurrently by at most GOMAXPROCS goroutines.
func CalcHammingDistance(dim int64, left, right []byte) ([]int32, error) {
	if dim <= 0 {
		return nil, errors.New("invalid dimension")
	}

	if err := ValidateBinaryArrayLength(dim, len(left)); err != nil {
		return nil, err
	}
	if err := ValidateBinaryArrayLength(dim, len(right)); err != nil {
		return nil, err
	}

	leftNum := VectorCount(dim, len(left))
	rightNum := VectorCount(dim, len(right))
	distArray := make([]int32, leftNum*rightNum)

	// Use a bounded set of workers instead of one goroutine per left vector,
	// so a large input cannot spawn an unbounded number of goroutines.
	workers := int64(runtime.GOMAXPROCS(0))
	if workers > leftNum {
		workers = leftNum
	}

	var waitGroup sync.WaitGroup
	waitGroup.Add(int(workers))
	for w := int64(0); w < workers; w++ {
		go func(first int64) {
			defer waitGroup.Done()
			// Worker "first" handles rows first, first+workers, ...; each
			// row writes its own disjoint segment of distArray.
			for row := first; row < leftNum; row += workers {
				CalcHammingBatch(dim, left, row, right, &distArray)
			}
		}(w)
	}
	waitGroup.Wait()

	return distArray, nil
}

// CalcTanimotoCoefficient converts Hamming distances between dim-bit binary
// vectors into Tanimoto coefficients.
//
// For each distance h, with equal = dim - h the number of matching bits, the
// coefficient is equal / (2*dim - equal). The result has the same length and
// order as hamming.
//
// An error is returned when dim is not positive, when hamming is empty, or
// when any distance lies outside the range [0, dim].
func CalcTanimotoCoefficient(dim int64, hamming []int32) ([]float32, error) {
	if dim <= 0 || len(hamming) == 0 {
		return nil, errors.New("invalid input for tanimoto")
	}

	array := make([]float32, len(hamming))
	for i, h := range hamming {
		// Compare in int64 so that a dim beyond the int32 range is not
		// truncated; a negative distance is as invalid as one above dim.
		if h < 0 || int64(h) > dim {
			return nil, errors.New("invalid hamming for tanimoto")
		}
		equalBits := dim - int64(h)
		array[i] = float32(equalBits) / (float32(dim)*2 - float32(equalBits))
	}

	return array, nil
}