mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-12-01 11:29:48 +08:00
Caiyd 2039 avx sse (#2136)
* #2039 clean redundant compile options Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * #2039 add distances_avx.h and distances_avx.cpp Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * #2039 add ScalarQuantizerDC_avx.cpp, ScalarQuantizerDC_avx.h and ScalarQuantizerCodec_avx.h Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * #2039 remove -mavx2 from default compile option Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * update changelog Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * #2039 clean code Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * #2039 clean code Signed-off-by: yudong.cai <yudong.cai@zilliz.com>
This commit is contained in:
parent
509b87f103
commit
1384484b7d
@ -22,6 +22,7 @@ Please mark all change in change log and use the issue from GitHub
|
||||
|
||||
## Improvement
|
||||
- \#221 Refactor LOG macro
|
||||
- \#2039 Support Milvus run on SSE CPUs
|
||||
|
||||
## Task
|
||||
|
||||
|
@ -79,12 +79,12 @@ endif ()
|
||||
include(ThirdPartyPackagesCore)
|
||||
|
||||
if (CMAKE_BUILD_TYPE STREQUAL "Release")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC -DELPP_THREAD_SAFE -fopenmp -mavx -mf16c -msse4 -mpopcnt")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC -DELPP_THREAD_SAFE -fopenmp")
|
||||
if (KNOWHERE_GPU_VERSION)
|
||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3")
|
||||
endif ()
|
||||
else ()
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g -fPIC -DELPP_THREAD_SAFE -fopenmp -mavx -mf16c -msse4 -mpopcnt")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g -fPIC -DELPP_THREAD_SAFE -fopenmp")
|
||||
if (KNOWHERE_GPU_VERSION)
|
||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O0 -g")
|
||||
endif ()
|
||||
|
@ -436,7 +436,7 @@ macro(build_faiss)
|
||||
set(FAISS_CONFIGURE_ARGS
|
||||
"--prefix=${FAISS_PREFIX}"
|
||||
"CFLAGS=${EP_C_FLAGS}"
|
||||
"CXXFLAGS=${EP_CXX_FLAGS} -mavx2 -mf16c -O3"
|
||||
"CXXFLAGS=${EP_CXX_FLAGS} -mf16c -O3"
|
||||
--without-python)
|
||||
|
||||
if (FAISS_WITH_MKL)
|
||||
|
@ -187,12 +187,14 @@ inline T euclidean_distance(const T* x, const T* y, int f) {
|
||||
|
||||
//#ifdef USE_AVX
|
||||
// Horizontal single sum of 256bit vector.
|
||||
#if 0 /* use FAISS distance calculation algorithm instead */
|
||||
inline float hsum256_ps_avx(__m256 v) {
|
||||
const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(v, 1), _mm256_castps256_ps128(v));
|
||||
const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
|
||||
const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
|
||||
return _mm_cvtss_f32(x32);
|
||||
}
|
||||
#endif
|
||||
|
||||
template<>
|
||||
inline float dot<float>(const float* x, const float *y, int f) {
|
||||
|
@ -7,8 +7,10 @@
|
||||
#include <faiss/FaissHook.h>
|
||||
#include <faiss/impl/FaissAssert.h>
|
||||
#include <faiss/impl/ScalarQuantizerDC.h>
|
||||
#include <faiss/impl/ScalarQuantizerDC_avx.h>
|
||||
#include <faiss/impl/ScalarQuantizerDC_avx512.h>
|
||||
#include <faiss/utils/distances.h>
|
||||
#include <faiss/utils/distances_avx.h>
|
||||
#include <faiss/utils/distances_avx512.h>
|
||||
#include <faiss/utils/instruction_set.h>
|
||||
|
||||
|
5
core/src/index/thirdparty/faiss/Makefile
vendored
5
core/src/index/thirdparty/faiss/Makefile
vendored
@ -7,6 +7,7 @@
|
||||
|
||||
HEADERS = $(wildcard *.h impl/*.h utils/*.h)
|
||||
SRC = $(wildcard *.cpp impl/*.cpp utils/*.cpp)
|
||||
AVX_SRC = $(wildcard *avx.cpp impl/*avx.cpp utils/*avx.cpp)
|
||||
AVX512_SRC = $(wildcard *avx512.cpp impl/*avx512.cpp utils/*avx512.cpp)
|
||||
OBJ = $(SRC:.cpp=.o)
|
||||
INSTALLDIRS = $(DESTDIR)$(libdir) $(DESTDIR)$(includedir)/faiss
|
||||
@ -42,6 +43,10 @@ libfaiss.$(SHAREDEXT): $(OBJ)
|
||||
%.o: %.cpp
|
||||
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -c $< -o $@
|
||||
|
||||
# support avx
|
||||
%avx.o: %avx.cpp
|
||||
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mavx2 -c $< -o $@
|
||||
|
||||
# support avx512
|
||||
%avx512.o: %avx512.cpp
|
||||
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mavx512f -mavx512dq -mavx512bw -c $< -o $@
|
||||
|
@ -11,13 +11,8 @@
|
||||
|
||||
#include <cstdio>
|
||||
#include <algorithm>
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#ifdef __SSE__
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#include <faiss/FaissHook.h>
|
||||
#include <faiss/utils/utils.h>
|
||||
#include <faiss/impl/FaissAssert.h>
|
||||
@ -40,9 +35,6 @@ namespace faiss {
|
||||
* that hides the template mess.
|
||||
********************************************************************/
|
||||
|
||||
#ifdef __AVX__
|
||||
#define USE_AVX
|
||||
#endif
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
@ -444,14 +436,10 @@ InvertedListScanner* ScalarQuantizer::select_InvertedListScanner
|
||||
if (d % 16 == 0 && support_avx512()) {
|
||||
return sel0_InvertedListScanner<16>
|
||||
(mt, this, quantizer, store_pairs, by_residual);
|
||||
}
|
||||
#ifdef USE_AVX
|
||||
if (d % 8 == 0) {
|
||||
} if (d % 8 == 0) {
|
||||
return sel0_InvertedListScanner<8>
|
||||
(mt, this, quantizer, store_pairs, by_residual);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
} else {
|
||||
return sel0_InvertedListScanner<1>
|
||||
(mt, this, quantizer, store_pairs, by_residual);
|
||||
}
|
||||
|
@ -11,13 +11,8 @@
|
||||
|
||||
#include <cstdio>
|
||||
#include <algorithm>
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#ifdef __SSE__
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#include <faiss/utils/utils.h>
|
||||
#include <faiss/impl/FaissAssert.h>
|
||||
#include <faiss/impl/ScalarQuantizerOp.h>
|
||||
@ -25,11 +20,6 @@
|
||||
namespace faiss {
|
||||
|
||||
|
||||
#ifdef __AVX__
|
||||
#define USE_AVX
|
||||
#endif
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
* Codec: converts between values in [0, 1] and an index in a code
|
||||
* array. The "i" parameter is the vector component index (not byte
|
||||
@ -44,22 +34,6 @@ struct Codec8bit {
|
||||
static float decode_component (const uint8_t *code, int i) {
|
||||
return (code[i] + 0.5f) / 255.0f;
|
||||
}
|
||||
|
||||
#ifdef USE_AVX
|
||||
static __m256 decode_8_components (const uint8_t *code, int i) {
|
||||
uint64_t c8 = *(uint64_t*)(code + i);
|
||||
__m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8));
|
||||
__m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32));
|
||||
// __m256i i8 = _mm256_set_m128i(c4lo, c4hi);
|
||||
__m256i i8 = _mm256_castsi128_si256 (c4lo);
|
||||
i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
|
||||
__m256 f8 = _mm256_cvtepi32_ps (i8);
|
||||
__m256 half = _mm256_set1_ps (0.5f);
|
||||
f8 += half;
|
||||
__m256 one_255 = _mm256_set1_ps (1.f / 255.f);
|
||||
return f8 * one_255;
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
@ -71,28 +45,6 @@ struct Codec4bit {
|
||||
static float decode_component (const uint8_t *code, int i) {
|
||||
return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
|
||||
}
|
||||
|
||||
#ifdef USE_AVX
|
||||
static __m256 decode_8_components (const uint8_t *code, int i) {
|
||||
uint32_t c4 = *(uint32_t*)(code + (i >> 1));
|
||||
uint32_t mask = 0x0f0f0f0f;
|
||||
uint32_t c4ev = c4 & mask;
|
||||
uint32_t c4od = (c4 >> 4) & mask;
|
||||
|
||||
// the 8 lower bytes of c8 contain the values
|
||||
__m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev),
|
||||
_mm_set1_epi32(c4od));
|
||||
__m128i c4lo = _mm_cvtepu8_epi32 (c8);
|
||||
__m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4));
|
||||
__m256i i8 = _mm256_castsi128_si256 (c4lo);
|
||||
i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
|
||||
__m256 f8 = _mm256_cvtepi32_ps (i8);
|
||||
__m256 half = _mm256_set1_ps (0.5f);
|
||||
f8 += half;
|
||||
__m256 one_255 = _mm256_set1_ps (1.f / 15.f);
|
||||
return f8 * one_255;
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
struct Codec6bit {
|
||||
@ -138,20 +90,6 @@ struct Codec6bit {
|
||||
}
|
||||
return (bits + 0.5f) / 63.0f;
|
||||
}
|
||||
|
||||
#ifdef USE_AVX
|
||||
static __m256 decode_8_components (const uint8_t *code, int i) {
|
||||
return _mm256_set_ps
|
||||
(decode_component(code, i + 7),
|
||||
decode_component(code, i + 6),
|
||||
decode_component(code, i + 5),
|
||||
decode_component(code, i + 4),
|
||||
decode_component(code, i + 3),
|
||||
decode_component(code, i + 2),
|
||||
decode_component(code, i + 1),
|
||||
decode_component(code, i + 0));
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
@ -204,25 +142,6 @@ struct QuantizerTemplate<Codec, true, 1>: Quantizer {
|
||||
};
|
||||
|
||||
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
template<class Codec>
|
||||
struct QuantizerTemplate<Codec, true, 8>: QuantizerTemplate<Codec, true, 1> {
|
||||
QuantizerTemplate (size_t d, const std::vector<float> &trained):
|
||||
QuantizerTemplate<Codec, true, 1> (d, trained) {}
|
||||
|
||||
__m256 reconstruct_8_components (const uint8_t * code, int i) const
|
||||
{
|
||||
__m256 xi = Codec::decode_8_components (code, i);
|
||||
return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
template<class Codec>
|
||||
struct QuantizerTemplate<Codec, false, 1>: Quantizer {
|
||||
const size_t d;
|
||||
@ -257,22 +176,6 @@ struct QuantizerTemplate<Codec, false, 1>: Quantizer {
|
||||
};
|
||||
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
template<class Codec>
|
||||
struct QuantizerTemplate<Codec, false, 8>: QuantizerTemplate<Codec, false, 1> {
|
||||
QuantizerTemplate (size_t d, const std::vector<float> &trained):
|
||||
QuantizerTemplate<Codec, false, 1> (d, trained) {}
|
||||
|
||||
__m256 reconstruct_8_components (const uint8_t * code, int i) const
|
||||
{
|
||||
__m256 xi = Codec::decode_8_components (code, i);
|
||||
return _mm256_loadu_ps (this->vmin + i) + xi * _mm256_loadu_ps (this->vdiff + i);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
/*******************************************************************
|
||||
* FP16 quantizer
|
||||
*******************************************************************/
|
||||
@ -305,21 +208,6 @@ struct QuantizerFP16<1>: Quantizer {
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
template<>
|
||||
struct QuantizerFP16<8>: QuantizerFP16<1> {
|
||||
QuantizerFP16 (size_t d, const std::vector<float> &trained):
|
||||
QuantizerFP16<1> (d, trained) {}
|
||||
|
||||
__m256 reconstruct_8_components (const uint8_t * code, int i) const
|
||||
{
|
||||
__m128i codei = _mm_loadu_si128 ((const __m128i*)(code + 2 * i));
|
||||
return _mm256_cvtph_ps (codei);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
/*******************************************************************
|
||||
* 8bit_direct quantizer
|
||||
@ -354,23 +242,6 @@ struct Quantizer8bitDirect<1>: Quantizer {
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
template<>
|
||||
struct Quantizer8bitDirect<8>: Quantizer8bitDirect<1> {
|
||||
Quantizer8bitDirect (size_t d, const std::vector<float> &trained):
|
||||
Quantizer8bitDirect<1> (d, trained) {}
|
||||
|
||||
__m256 reconstruct_8_components (const uint8_t * code, int i) const
|
||||
{
|
||||
__m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8
|
||||
__m256i y8 = _mm256_cvtepu8_epi32 (x8); // 8 * int32
|
||||
return _mm256_cvtepi32_ps (y8); // 8 * float32
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
template<int SIMDWIDTH>
|
||||
Quantizer *select_quantizer_1 (
|
||||
@ -407,7 +278,6 @@ Quantizer *select_quantizer_1 (
|
||||
template<int SIMDWIDTH>
|
||||
struct SimilarityL2 {};
|
||||
|
||||
|
||||
template<>
|
||||
struct SimilarityL2<1> {
|
||||
static constexpr int simdwidth = 1;
|
||||
@ -441,89 +311,78 @@ struct SimilarityL2<1> {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#ifdef USE_AVX
|
||||
/* as same as SimilarityL2<1>, let build pass */
|
||||
template<>
|
||||
struct SimilarityL2<8> {
|
||||
static constexpr int simdwidth = 8;
|
||||
static constexpr int simdwidth = 1;
|
||||
static constexpr MetricType metric_type = METRIC_L2;
|
||||
|
||||
const float *y, *yi;
|
||||
|
||||
explicit SimilarityL2 (const float * y): y(y) {}
|
||||
__m256 accu8;
|
||||
|
||||
void begin_8 () {
|
||||
accu8 = _mm256_setzero_ps();
|
||||
/******* scalar accumulator *******/
|
||||
|
||||
float accu;
|
||||
|
||||
void begin () {
|
||||
accu = 0;
|
||||
yi = y;
|
||||
}
|
||||
|
||||
void add_8_components (__m256 x) {
|
||||
__m256 yiv = _mm256_loadu_ps (yi);
|
||||
yi += 8;
|
||||
__m256 tmp = yiv - x;
|
||||
accu8 += tmp * tmp;
|
||||
void add_component (float x) {
|
||||
float tmp = *yi++ - x;
|
||||
accu += tmp * tmp;
|
||||
}
|
||||
|
||||
void add_8_components_2 (__m256 x, __m256 y) {
|
||||
__m256 tmp = y - x;
|
||||
accu8 += tmp * tmp;
|
||||
void add_component_2 (float x1, float x2) {
|
||||
float tmp = x1 - x2;
|
||||
accu += tmp * tmp;
|
||||
}
|
||||
|
||||
float result_8 () {
|
||||
__m256 sum = _mm256_hadd_ps(accu8, accu8);
|
||||
__m256 sum2 = _mm256_hadd_ps(sum, sum);
|
||||
// now add the 0th and 4th component
|
||||
return
|
||||
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
|
||||
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
|
||||
float result () {
|
||||
return accu;
|
||||
}
|
||||
};
|
||||
|
||||
/* as same as SimilarityL2<8>, let build pass */
|
||||
/* as same as SimilarityL2<1>, let build pass */
|
||||
template<>
|
||||
struct SimilarityL2<16> {
|
||||
static constexpr int simdwidth = 8;
|
||||
static constexpr int simdwidth = 1;
|
||||
static constexpr MetricType metric_type = METRIC_L2;
|
||||
|
||||
const float *y, *yi;
|
||||
|
||||
explicit SimilarityL2 (const float * y): y(y) {}
|
||||
__m256 accu8;
|
||||
|
||||
void begin_8 () {
|
||||
accu8 = _mm256_setzero_ps();
|
||||
/******* scalar accumulator *******/
|
||||
|
||||
float accu;
|
||||
|
||||
void begin () {
|
||||
accu = 0;
|
||||
yi = y;
|
||||
}
|
||||
|
||||
void add_8_components (__m256 x) {
|
||||
__m256 yiv = _mm256_loadu_ps (yi);
|
||||
yi += 8;
|
||||
__m256 tmp = yiv - x;
|
||||
accu8 += tmp * tmp;
|
||||
void add_component (float x) {
|
||||
float tmp = *yi++ - x;
|
||||
accu += tmp * tmp;
|
||||
}
|
||||
|
||||
void add_8_components_2 (__m256 x, __m256 y) {
|
||||
__m256 tmp = y - x;
|
||||
accu8 += tmp * tmp;
|
||||
void add_component_2 (float x1, float x2) {
|
||||
float tmp = x1 - x2;
|
||||
accu += tmp * tmp;
|
||||
}
|
||||
|
||||
float result_8 () {
|
||||
__m256 sum = _mm256_hadd_ps(accu8, accu8);
|
||||
__m256 sum2 = _mm256_hadd_ps(sum, sum);
|
||||
// now add the 0th and 4th component
|
||||
return
|
||||
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
|
||||
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
|
||||
float result () {
|
||||
return accu;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
template<int SIMDWIDTH>
|
||||
struct SimilarityIP {};
|
||||
|
||||
|
||||
template<>
|
||||
struct SimilarityIP<1> {
|
||||
static constexpr int simdwidth = 1;
|
||||
@ -553,13 +412,11 @@ struct SimilarityIP<1> {
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
/* as same as SimilarityIP<1>, let build pass */
|
||||
template<>
|
||||
struct SimilarityIP<8> {
|
||||
static constexpr int simdwidth = 8;
|
||||
static constexpr int simdwidth = 1;
|
||||
static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;
|
||||
|
||||
const float *y, *yi;
|
||||
|
||||
float accu;
|
||||
@ -567,39 +424,29 @@ struct SimilarityIP<8> {
|
||||
explicit SimilarityIP (const float * y):
|
||||
y (y) {}
|
||||
|
||||
__m256 accu8;
|
||||
|
||||
void begin_8 () {
|
||||
accu8 = _mm256_setzero_ps();
|
||||
void begin () {
|
||||
accu = 0;
|
||||
yi = y;
|
||||
}
|
||||
|
||||
void add_8_components (__m256 x) {
|
||||
__m256 yiv = _mm256_loadu_ps (yi);
|
||||
yi += 8;
|
||||
accu8 += yiv * x;
|
||||
void add_component (float x) {
|
||||
accu += *yi++ * x;
|
||||
}
|
||||
|
||||
void add_8_components_2 (__m256 x1, __m256 x2) {
|
||||
accu8 += x1 * x2;
|
||||
void add_component_2 (float x1, float x2) {
|
||||
accu += x1 * x2;
|
||||
}
|
||||
|
||||
float result_8 () {
|
||||
__m256 sum = _mm256_hadd_ps(accu8, accu8);
|
||||
__m256 sum2 = _mm256_hadd_ps(sum, sum);
|
||||
// now add the 0th and 4th component
|
||||
return
|
||||
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
|
||||
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
|
||||
float result () {
|
||||
return accu;
|
||||
}
|
||||
};
|
||||
|
||||
/* as same as SimilarityIP<8>, let build pass */
|
||||
/* as same as SimilarityIP<1>, let build pass */
|
||||
template<>
|
||||
struct SimilarityIP<16> {
|
||||
static constexpr int simdwidth = 8;
|
||||
static constexpr int simdwidth = 1;
|
||||
static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;
|
||||
|
||||
const float *y, *yi;
|
||||
|
||||
float accu;
|
||||
@ -607,33 +454,23 @@ struct SimilarityIP<16> {
|
||||
explicit SimilarityIP (const float * y):
|
||||
y (y) {}
|
||||
|
||||
__m256 accu8;
|
||||
|
||||
void begin_8 () {
|
||||
accu8 = _mm256_setzero_ps();
|
||||
void begin () {
|
||||
accu = 0;
|
||||
yi = y;
|
||||
}
|
||||
|
||||
void add_8_components (__m256 x) {
|
||||
__m256 yiv = _mm256_loadu_ps (yi);
|
||||
yi += 8;
|
||||
accu8 += yiv * x;
|
||||
void add_component (float x) {
|
||||
accu += *yi++ * x;
|
||||
}
|
||||
|
||||
void add_8_components_2 (__m256 x1, __m256 x2) {
|
||||
accu8 += x1 * x2;
|
||||
void add_component_2 (float x1, float x2) {
|
||||
accu += x1 * x2;
|
||||
}
|
||||
|
||||
float result_8 () {
|
||||
__m256 sum = _mm256_hadd_ps(accu8, accu8);
|
||||
__m256 sum2 = _mm256_hadd_ps(sum, sum);
|
||||
// now add the 0th and 4th component
|
||||
return
|
||||
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
|
||||
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
|
||||
float result () {
|
||||
return accu;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
@ -696,63 +533,6 @@ struct DCTemplate<Quantizer, Similarity, 1> : SQDistanceComputer
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
template<class Quantizer, class Similarity>
|
||||
struct DCTemplate<Quantizer, Similarity, 8> : SQDistanceComputer
|
||||
{
|
||||
using Sim = Similarity;
|
||||
|
||||
Quantizer quant;
|
||||
|
||||
DCTemplate(size_t d, const std::vector<float> &trained):
|
||||
quant(d, trained)
|
||||
{}
|
||||
|
||||
float compute_distance(const float* x, const uint8_t* code) const {
|
||||
Similarity sim(x);
|
||||
sim.begin_8();
|
||||
for (size_t i = 0; i < quant.d; i += 8) {
|
||||
__m256 xi = quant.reconstruct_8_components(code, i);
|
||||
sim.add_8_components(xi);
|
||||
}
|
||||
return sim.result_8();
|
||||
}
|
||||
|
||||
float compute_code_distance(const uint8_t* code1, const uint8_t* code2)
|
||||
const {
|
||||
Similarity sim(nullptr);
|
||||
sim.begin_8();
|
||||
for (size_t i = 0; i < quant.d; i += 8) {
|
||||
__m256 x1 = quant.reconstruct_8_components(code1, i);
|
||||
__m256 x2 = quant.reconstruct_8_components(code2, i);
|
||||
sim.add_8_components_2(x1, x2);
|
||||
}
|
||||
return sim.result_8();
|
||||
}
|
||||
|
||||
void set_query (const float *x) final {
|
||||
q = x;
|
||||
}
|
||||
|
||||
/// compute distance of vector i to current query
|
||||
float operator () (idx_t i) final {
|
||||
return compute_distance (q, codes + i * code_size);
|
||||
}
|
||||
|
||||
float symmetric_dis (idx_t i, idx_t j) override {
|
||||
return compute_code_distance (codes + i * code_size,
|
||||
codes + j * code_size);
|
||||
}
|
||||
|
||||
float query_to_code (const uint8_t * code) const {
|
||||
return compute_distance (q, code);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
* DistanceComputerByte: computes distances in the integer domain
|
||||
@ -811,84 +591,12 @@ struct DistanceComputerByte<Similarity, 1> : SQDistanceComputer {
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
|
||||
template<class Similarity>
|
||||
struct DistanceComputerByte<Similarity, 8> : SQDistanceComputer {
|
||||
using Sim = Similarity;
|
||||
|
||||
int d;
|
||||
std::vector<uint8_t> tmp;
|
||||
|
||||
DistanceComputerByte(int d, const std::vector<float> &): d(d), tmp(d) {
|
||||
}
|
||||
|
||||
int compute_code_distance(const uint8_t* code1, const uint8_t* code2)
|
||||
const {
|
||||
// __m256i accu = _mm256_setzero_ps ();
|
||||
__m256i accu = _mm256_setzero_si256 ();
|
||||
for (int i = 0; i < d; i += 16) {
|
||||
// load 16 bytes, convert to 16 uint16_t
|
||||
__m256i c1 = _mm256_cvtepu8_epi16
|
||||
(_mm_loadu_si128((__m128i*)(code1 + i)));
|
||||
__m256i c2 = _mm256_cvtepu8_epi16
|
||||
(_mm_loadu_si128((__m128i*)(code2 + i)));
|
||||
__m256i prod32;
|
||||
if (Sim::metric_type == METRIC_INNER_PRODUCT) {
|
||||
prod32 = _mm256_madd_epi16(c1, c2);
|
||||
} else {
|
||||
__m256i diff = _mm256_sub_epi16(c1, c2);
|
||||
prod32 = _mm256_madd_epi16(diff, diff);
|
||||
}
|
||||
accu = _mm256_add_epi32 (accu, prod32);
|
||||
}
|
||||
__m128i sum = _mm256_extractf128_si256(accu, 0);
|
||||
sum = _mm_add_epi32 (sum, _mm256_extractf128_si256(accu, 1));
|
||||
sum = _mm_hadd_epi32 (sum, sum);
|
||||
sum = _mm_hadd_epi32 (sum, sum);
|
||||
return _mm_cvtsi128_si32 (sum);
|
||||
}
|
||||
|
||||
void set_query (const float *x) final {
|
||||
/*
|
||||
for (int i = 0; i < d; i += 8) {
|
||||
__m256 xi = _mm256_loadu_ps (x + i);
|
||||
__m256i ci = _mm256_cvtps_epi32(xi);
|
||||
*/
|
||||
for (int i = 0; i < d; i++) {
|
||||
tmp[i] = int(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int compute_distance(const float* x, const uint8_t* code) {
|
||||
set_query(x);
|
||||
return compute_code_distance(tmp.data(), code);
|
||||
}
|
||||
|
||||
/// compute distance of vector i to current query
|
||||
float operator () (idx_t i) final {
|
||||
return compute_distance (q, codes + i * code_size);
|
||||
}
|
||||
|
||||
float symmetric_dis (idx_t i, idx_t j) override {
|
||||
return compute_code_distance (codes + i * code_size,
|
||||
codes + j * code_size);
|
||||
}
|
||||
|
||||
float query_to_code (const uint8_t * code) const {
|
||||
return compute_code_distance (tmp.data(), code);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
/*******************************************************************
|
||||
* select_distance_computer: runtime selection of template
|
||||
* specialization
|
||||
*******************************************************************/
|
||||
|
||||
|
||||
template<class Sim>
|
||||
SQDistanceComputer *select_distance_computer (
|
||||
QuantizerType qtype,
|
||||
|
936
core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx.h
vendored
Normal file
936
core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx.h
vendored
Normal file
@ -0,0 +1,936 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdio>
|
||||
#include <algorithm>
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#ifdef __SSE__
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#include <faiss/utils/utils.h>
|
||||
#include <faiss/impl/FaissAssert.h>
|
||||
#include <faiss/impl/ScalarQuantizerOp.h>
|
||||
|
||||
namespace faiss {
|
||||
|
||||
|
||||
#ifdef __AVX__
|
||||
#define USE_AVX
|
||||
#endif
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
* Codec: converts between values in [0, 1] and an index in a code
|
||||
* array. The "i" parameter is the vector component index (not byte
|
||||
* index).
|
||||
*/
|
||||
|
||||
struct Codec8bit_avx {
|
||||
static void encode_component (float x, uint8_t *code, int i) {
|
||||
code[i] = (int)(255 * x);
|
||||
}
|
||||
|
||||
static float decode_component (const uint8_t *code, int i) {
|
||||
return (code[i] + 0.5f) / 255.0f;
|
||||
}
|
||||
|
||||
#ifdef USE_AVX
|
||||
static __m256 decode_8_components (const uint8_t *code, int i) {
|
||||
uint64_t c8 = *(uint64_t*)(code + i);
|
||||
__m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8));
|
||||
__m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32));
|
||||
// __m256i i8 = _mm256_set_m128i(c4lo, c4hi);
|
||||
__m256i i8 = _mm256_castsi128_si256 (c4lo);
|
||||
i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
|
||||
__m256 f8 = _mm256_cvtepi32_ps (i8);
|
||||
__m256 half = _mm256_set1_ps (0.5f);
|
||||
f8 += half;
|
||||
__m256 one_255 = _mm256_set1_ps (1.f / 255.f);
|
||||
return f8 * one_255;
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
struct Codec4bit_avx {
|
||||
static void encode_component (float x, uint8_t *code, int i) {
|
||||
code [i / 2] |= (int)(x * 15.0) << ((i & 1) << 2);
|
||||
}
|
||||
|
||||
static float decode_component (const uint8_t *code, int i) {
|
||||
return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
|
||||
}
|
||||
|
||||
#ifdef USE_AVX
|
||||
static __m256 decode_8_components (const uint8_t *code, int i) {
|
||||
uint32_t c4 = *(uint32_t*)(code + (i >> 1));
|
||||
uint32_t mask = 0x0f0f0f0f;
|
||||
uint32_t c4ev = c4 & mask;
|
||||
uint32_t c4od = (c4 >> 4) & mask;
|
||||
|
||||
// the 8 lower bytes of c8 contain the values
|
||||
__m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev),
|
||||
_mm_set1_epi32(c4od));
|
||||
__m128i c4lo = _mm_cvtepu8_epi32 (c8);
|
||||
__m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4));
|
||||
__m256i i8 = _mm256_castsi128_si256 (c4lo);
|
||||
i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
|
||||
__m256 f8 = _mm256_cvtepi32_ps (i8);
|
||||
__m256 half = _mm256_set1_ps (0.5f);
|
||||
f8 += half;
|
||||
__m256 one_255 = _mm256_set1_ps (1.f / 15.f);
|
||||
return f8 * one_255;
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
struct Codec6bit_avx {
|
||||
static void encode_component (float x, uint8_t *code, int i) {
|
||||
int bits = (int)(x * 63.0);
|
||||
code += (i >> 2) * 3;
|
||||
switch(i & 3) {
|
||||
case 0:
|
||||
code[0] |= bits;
|
||||
break;
|
||||
case 1:
|
||||
code[0] |= bits << 6;
|
||||
code[1] |= bits >> 2;
|
||||
break;
|
||||
case 2:
|
||||
code[1] |= bits << 4;
|
||||
code[2] |= bits >> 4;
|
||||
break;
|
||||
case 3:
|
||||
code[2] |= bits << 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static float decode_component (const uint8_t *code, int i) {
|
||||
uint8_t bits;
|
||||
code += (i >> 2) * 3;
|
||||
switch(i & 3) {
|
||||
case 0:
|
||||
bits = code[0] & 0x3f;
|
||||
break;
|
||||
case 1:
|
||||
bits = code[0] >> 6;
|
||||
bits |= (code[1] & 0xf) << 2;
|
||||
break;
|
||||
case 2:
|
||||
bits = code[1] >> 4;
|
||||
bits |= (code[2] & 3) << 4;
|
||||
break;
|
||||
case 3:
|
||||
bits = code[2] >> 2;
|
||||
break;
|
||||
}
|
||||
return (bits + 0.5f) / 63.0f;
|
||||
}
|
||||
|
||||
#ifdef USE_AVX
|
||||
static __m256 decode_8_components (const uint8_t *code, int i) {
|
||||
return _mm256_set_ps
|
||||
(decode_component(code, i + 7),
|
||||
decode_component(code, i + 6),
|
||||
decode_component(code, i + 5),
|
||||
decode_component(code, i + 4),
|
||||
decode_component(code, i + 3),
|
||||
decode_component(code, i + 2),
|
||||
decode_component(code, i + 1),
|
||||
decode_component(code, i + 0));
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
* Quantizer: normalizes scalar vector components, then passes them
|
||||
* through a codec
|
||||
*******************************************************************/
|
||||
|
||||
|
||||
template<class Codec, bool uniform, int SIMD>
|
||||
struct QuantizerTemplate_avx {};
|
||||
|
||||
|
||||
template<class Codec>
|
||||
struct QuantizerTemplate_avx<Codec, true, 1>: Quantizer {
|
||||
const size_t d;
|
||||
const float vmin, vdiff;
|
||||
|
||||
QuantizerTemplate_avx(size_t d, const std::vector<float> &trained):
|
||||
d(d), vmin(trained[0]), vdiff(trained[1])
|
||||
{
|
||||
}
|
||||
|
||||
void encode_vector(const float* x, uint8_t* code) const final {
|
||||
for (size_t i = 0; i < d; i++) {
|
||||
float xi = (x[i] - vmin) / vdiff;
|
||||
if (xi < 0) {
|
||||
xi = 0;
|
||||
}
|
||||
if (xi > 1.0) {
|
||||
xi = 1.0;
|
||||
}
|
||||
Codec::encode_component(xi, code, i);
|
||||
}
|
||||
}
|
||||
|
||||
void decode_vector(const uint8_t* code, float* x) const final {
|
||||
for (size_t i = 0; i < d; i++) {
|
||||
float xi = Codec::decode_component(code, i);
|
||||
x[i] = vmin + xi * vdiff;
|
||||
}
|
||||
}
|
||||
|
||||
float reconstruct_component (const uint8_t * code, int i) const
|
||||
{
|
||||
float xi = Codec::decode_component (code, i);
|
||||
return vmin + xi * vdiff;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
template<class Codec>
|
||||
struct QuantizerTemplate_avx<Codec, true, 8>: QuantizerTemplate_avx<Codec, true, 1> {
|
||||
QuantizerTemplate_avx (size_t d, const std::vector<float> &trained):
|
||||
QuantizerTemplate_avx<Codec, true, 1> (d, trained) {}
|
||||
|
||||
__m256 reconstruct_8_components (const uint8_t * code, int i) const
|
||||
{
|
||||
__m256 xi = Codec::decode_8_components (code, i);
|
||||
return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
template<class Codec>
|
||||
struct QuantizerTemplate_avx<Codec, false, 1>: Quantizer {
|
||||
const size_t d;
|
||||
const float *vmin, *vdiff;
|
||||
|
||||
QuantizerTemplate_avx (size_t d, const std::vector<float> &trained):
|
||||
d(d), vmin(trained.data()), vdiff(trained.data() + d) {}
|
||||
|
||||
void encode_vector(const float* x, uint8_t* code) const final {
|
||||
for (size_t i = 0; i < d; i++) {
|
||||
float xi = (x[i] - vmin[i]) / vdiff[i];
|
||||
if (xi < 0)
|
||||
xi = 0;
|
||||
if (xi > 1.0)
|
||||
xi = 1.0;
|
||||
Codec::encode_component(xi, code, i);
|
||||
}
|
||||
}
|
||||
|
||||
void decode_vector(const uint8_t* code, float* x) const final {
|
||||
for (size_t i = 0; i < d; i++) {
|
||||
float xi = Codec::decode_component(code, i);
|
||||
x[i] = vmin[i] + xi * vdiff[i];
|
||||
}
|
||||
}
|
||||
|
||||
float reconstruct_component (const uint8_t * code, int i) const
|
||||
{
|
||||
float xi = Codec::decode_component (code, i);
|
||||
return vmin[i] + xi * vdiff[i];
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
template<class Codec>
|
||||
struct QuantizerTemplate_avx<Codec, false, 8>: QuantizerTemplate_avx<Codec, false, 1> {
|
||||
QuantizerTemplate_avx (size_t d, const std::vector<float> &trained):
|
||||
QuantizerTemplate_avx<Codec, false, 1> (d, trained) {}
|
||||
|
||||
__m256 reconstruct_8_components (const uint8_t * code, int i) const
|
||||
{
|
||||
__m256 xi = Codec::decode_8_components (code, i);
|
||||
return _mm256_loadu_ps (this->vmin + i) + xi * _mm256_loadu_ps (this->vdiff + i);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
/*******************************************************************
|
||||
* FP16 quantizer
|
||||
*******************************************************************/
|
||||
|
||||
template<int SIMDWIDTH>
|
||||
struct QuantizerFP16_avx {};
|
||||
|
||||
template<>
|
||||
struct QuantizerFP16_avx<1>: Quantizer {
|
||||
const size_t d;
|
||||
|
||||
QuantizerFP16_avx(size_t d, const std::vector<float> & /* unused */):
|
||||
d(d) {}
|
||||
|
||||
void encode_vector(const float* x, uint8_t* code) const final {
|
||||
for (size_t i = 0; i < d; i++) {
|
||||
((uint16_t*)code)[i] = encode_fp16(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void decode_vector(const uint8_t* code, float* x) const final {
|
||||
for (size_t i = 0; i < d; i++) {
|
||||
x[i] = decode_fp16(((uint16_t*)code)[i]);
|
||||
}
|
||||
}
|
||||
|
||||
float reconstruct_component (const uint8_t * code, int i) const
|
||||
{
|
||||
return decode_fp16(((uint16_t*)code)[i]);
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
// AVX (SIMDWIDTH == 8) fp16 quantizer: decodes 8 halves at once.
template<>
struct QuantizerFP16_avx<8>: QuantizerFP16_avx<1> {
    QuantizerFP16_avx (size_t d, const std::vector<float> &trained):
        QuantizerFP16_avx<1> (d, trained) {}

    // Load 8 uint16 halves starting at dimension i and widen to 8 floats.
    // _mm256_cvtph_ps requires F16C support at runtime.
    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m128i codei = _mm_loadu_si128 ((const __m128i*)(code + 2 * i));
        return _mm256_cvtph_ps (codei);
    }
};
|
||||
|
||||
#endif
|
||||
|
||||
/*******************************************************************
|
||||
* 8bit_direct quantizer
|
||||
*******************************************************************/
|
||||
|
||||
template<int SIMDWIDTH>
|
||||
struct Quantizer8bitDirect_avx {};
|
||||
|
||||
template<>
|
||||
struct Quantizer8bitDirect_avx<1>: Quantizer {
|
||||
const size_t d;
|
||||
|
||||
Quantizer8bitDirect_avx(size_t d, const std::vector<float> & /* unused */):
|
||||
d(d) {}
|
||||
|
||||
|
||||
void encode_vector(const float* x, uint8_t* code) const final {
|
||||
for (size_t i = 0; i < d; i++) {
|
||||
code[i] = (uint8_t)x[i];
|
||||
}
|
||||
}
|
||||
|
||||
void decode_vector(const uint8_t* code, float* x) const final {
|
||||
for (size_t i = 0; i < d; i++) {
|
||||
x[i] = code[i];
|
||||
}
|
||||
}
|
||||
|
||||
float reconstruct_component (const uint8_t * code, int i) const
|
||||
{
|
||||
return code[i];
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
// AVX specialization: widen 8 bytes -> 8 int32 -> 8 float32 in one shot.
template<>
struct Quantizer8bitDirect_avx<8>: Quantizer8bitDirect_avx<1> {
    Quantizer8bitDirect_avx (size_t d, const std::vector<float> &trained):
        Quantizer8bitDirect_avx<1> (d, trained) {}

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8
        __m256i y8 = _mm256_cvtepu8_epi32 (x8);             // 8 * int32
        return _mm256_cvtepi32_ps (y8);                     // 8 * float32
    }
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/** Factory: instantiate the quantizer matching qtype at the given SIMD
 *  width (1 = scalar, 8 = AVX).
 *  Throws via FAISS_THROW_MSG if qtype is not one of the known values. */
template<int SIMDWIDTH>
Quantizer *select_quantizer_1_avx (
          QuantizerType qtype,
          size_t d, const std::vector<float> & trained)
{
    switch(qtype) {
    case QuantizerType::QT_8bit:
        return new QuantizerTemplate_avx<Codec8bit_avx, false, SIMDWIDTH>(d, trained);
    case QuantizerType::QT_6bit:
        return new QuantizerTemplate_avx<Codec6bit_avx, false, SIMDWIDTH>(d, trained);
    case QuantizerType::QT_4bit:
        return new QuantizerTemplate_avx<Codec4bit_avx, false, SIMDWIDTH>(d, trained);
    case QuantizerType::QT_8bit_uniform:
        return new QuantizerTemplate_avx<Codec8bit_avx, true, SIMDWIDTH>(d, trained);
    case QuantizerType::QT_4bit_uniform:
        return new QuantizerTemplate_avx<Codec4bit_avx, true, SIMDWIDTH>(d, trained);
    case QuantizerType::QT_fp16:
        return new QuantizerFP16_avx<SIMDWIDTH> (d, trained);
    case QuantizerType::QT_8bit_direct:
        return new Quantizer8bitDirect_avx<SIMDWIDTH> (d, trained);
    }
    FAISS_THROW_MSG ("unknown qtype");
}
|
||||
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
* Similarity: gets vector components and computes a similarity wrt. a
|
||||
* query vector stored in the object. The data fields just encapsulate
|
||||
* an accumulator.
|
||||
*/
|
||||
|
||||
template<int SIMDWIDTH>
|
||||
struct SimilarityL2_avx {};
|
||||
|
||||
|
||||
template<>
|
||||
struct SimilarityL2_avx<1> {
|
||||
static constexpr int simdwidth = 1;
|
||||
static constexpr MetricType metric_type = METRIC_L2;
|
||||
|
||||
const float *y, *yi;
|
||||
|
||||
explicit SimilarityL2_avx (const float * y): y(y) {}
|
||||
|
||||
/******* scalar accumulator *******/
|
||||
|
||||
float accu;
|
||||
|
||||
void begin () {
|
||||
accu = 0;
|
||||
yi = y;
|
||||
}
|
||||
|
||||
void add_component (float x) {
|
||||
float tmp = *yi++ - x;
|
||||
accu += tmp * tmp;
|
||||
}
|
||||
|
||||
void add_component_2 (float x1, float x2) {
|
||||
float tmp = x1 - x2;
|
||||
accu += tmp * tmp;
|
||||
}
|
||||
|
||||
float result () {
|
||||
return accu;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
#ifdef USE_AVX
|
||||
// AVX accumulator: 8 partial squared differences per step.
template<>
struct SimilarityL2_avx<8> {
    static constexpr int simdwidth = 8;
    static constexpr MetricType metric_type = METRIC_L2;

    const float *y, *yi;   // query vector and a running cursor into it

    explicit SimilarityL2_avx (const float * y): y(y) {}
    __m256 accu8;          // 8 partial sums

    void begin_8 () {
        accu8 = _mm256_setzero_ps();
        yi = y;
    }

    // accu8 += (y[i..i+7] - x)^2; advances the query cursor by 8.
    // NOTE: -, *, += on __m256 rely on GCC/Clang vector extensions.
    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        __m256 tmp = yiv - x;
        accu8 += tmp * tmp;
    }

    // accu8 += (y - x)^2 for code-to-code comparisons.
    void add_8_components_2 (__m256 x, __m256 y) {
        __m256 tmp = y - x;
        accu8 += tmp * tmp;
    }

    // Horizontal reduction of the 8 partial sums to one float.
    float result_8 () {
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        // now add the 0th and 4th component
        return
            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }
};
|
||||
|
||||
/* as same as SimilarityL2<8>, let build pass */
// Deliberate duplicate of the <8> specialization so that code
// instantiated with SIMDWIDTH == 16 still compiles on AVX-only builds
// (note simdwidth stays 8: it still processes 8 floats per step).
template<>
struct SimilarityL2_avx<16> {
    static constexpr int simdwidth = 8;
    static constexpr MetricType metric_type = METRIC_L2;

    const float *y, *yi;   // query vector and a running cursor into it

    explicit SimilarityL2_avx (const float * y): y(y) {}
    __m256 accu8;          // 8 partial sums

    void begin_8 () {
        accu8 = _mm256_setzero_ps();
        yi = y;
    }

    // accu8 += (y[i..i+7] - x)^2; advances the query cursor by 8.
    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        __m256 tmp = yiv - x;
        accu8 += tmp * tmp;
    }

    // accu8 += (y - x)^2 for code-to-code comparisons.
    void add_8_components_2 (__m256 x, __m256 y) {
        __m256 tmp = y - x;
        accu8 += tmp * tmp;
    }

    // Horizontal reduction of the 8 partial sums to one float.
    float result_8 () {
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        // now add the 0th and 4th component
        return
            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }
};
|
||||
#endif
|
||||
|
||||
|
||||
// Primary template: widths 1, 8 and 16 are specialized below.
template<int SIMDWIDTH>
struct SimilarityIP_avx {};

/** Scalar inner-product accumulator against the query vector y. */
template<>
struct SimilarityIP_avx<1> {
    static constexpr int simdwidth = 1;
    static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;
    const float *y, *yi;   // query vector and a running cursor into it

    float accu;            // scalar accumulator

    explicit SimilarityIP_avx (const float * y):
        y (y) {}

    // Reset the accumulator and rewind the query cursor.
    void begin () {
        accu = 0;
        yi = y;
    }

    // accu += y[k] * x for the next query component.
    void add_component (float x) {
        accu += *yi++ * x;
    }

    // accu += x1 * x2 (code-to-code comparison; no query used).
    void add_component_2 (float x1, float x2) {
        accu += x1 * x2;
    }

    // Final inner product.
    float result () {
        return accu;
    }
};
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
// AVX accumulator: 8 partial products per step.
template<>
struct SimilarityIP_avx<8> {
    static constexpr int simdwidth = 8;
    static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;

    const float *y, *yi;   // query vector and a running cursor into it

    float accu;  // NOTE(review): appears unused here (accu8 is the
                 // accumulator) — presumably kept for symmetry with <1>

    explicit SimilarityIP_avx (const float * y):
        y (y) {}

    __m256 accu8;          // 8 partial sums

    void begin_8 () {
        accu8 = _mm256_setzero_ps();
        yi = y;
    }

    // accu8 += y[i..i+7] * x; advances the query cursor by 8.
    // NOTE: *, += on __m256 rely on GCC/Clang vector extensions.
    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        accu8 += yiv * x;
    }

    // accu8 += x1 * x2 for code-to-code comparisons.
    void add_8_components_2 (__m256 x1, __m256 x2) {
        accu8 += x1 * x2;
    }

    // Horizontal reduction of the 8 partial sums to one float.
    float result_8 () {
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        // now add the 0th and 4th component
        return
            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }
};
|
||||
|
||||
/* as same as SimilarityIP<8>, let build pass */
// Deliberate duplicate of the <8> specialization so that code
// instantiated with SIMDWIDTH == 16 still compiles on AVX-only builds
// (note simdwidth stays 8: it still processes 8 floats per step).
template<>
struct SimilarityIP_avx<16> {
    static constexpr int simdwidth = 8;
    static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;

    const float *y, *yi;   // query vector and a running cursor into it

    float accu;  // NOTE(review): appears unused here (accu8 is the
                 // accumulator) — presumably kept for symmetry with <1>

    explicit SimilarityIP_avx (const float * y):
        y (y) {}

    __m256 accu8;          // 8 partial sums

    void begin_8 () {
        accu8 = _mm256_setzero_ps();
        yi = y;
    }

    // accu8 += y[i..i+7] * x; advances the query cursor by 8.
    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        accu8 += yiv * x;
    }

    // accu8 += x1 * x2 for code-to-code comparisons.
    void add_8_components_2 (__m256 x1, __m256 x2) {
        accu8 += x1 * x2;
    }

    // Horizontal reduction of the 8 partial sums to one float.
    float result_8 () {
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        // now add the 0th and 4th component
        return
            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }
};
|
||||
#endif
|
||||
|
||||
|
||||
/*******************************************************************
 * DistanceComputer: combines a similarity and a quantizer to do
 * code-to-vector or code-to-code comparisons
 *******************************************************************/

// Primary template: widths 1 and 8 are specialized.
template<class Quantizer, class Similarity, int SIMDWIDTH>
struct DCTemplate_avx : SQDistanceComputer {};

/** Scalar distance computer: decodes one component at a time and
 *  feeds it to the Similarity accumulator. */
template<class Quantizer, class Similarity>
struct DCTemplate_avx<Quantizer, Similarity, 1> : SQDistanceComputer
{
    using Sim = Similarity;

    Quantizer quant;   // decodes stored codes back to floats

    DCTemplate_avx(size_t d, const std::vector<float> &trained):
        quant(d, trained)
    {}

    // Distance/similarity between a raw float vector x and one code.
    float compute_distance(const float* x, const uint8_t* code) const {
        Similarity sim(x);
        sim.begin();
        for (size_t i = 0; i < quant.d; i++) {
            float xi = quant.reconstruct_component(code, i);
            sim.add_component(xi);
        }
        return sim.result();
    }

    // Distance/similarity between two codes (both decoded on the fly).
    float compute_code_distance(const uint8_t* code1, const uint8_t* code2)
        const {
        Similarity sim(nullptr);   // no query needed for code-to-code
        sim.begin();
        for (size_t i = 0; i < quant.d; i++) {
            float x1 = quant.reconstruct_component(code1, i);
            float x2 = quant.reconstruct_component(code2, i);
            sim.add_component_2(x1, x2);
        }
        return sim.result();
    }

    // q, codes and code_size are inherited from SQDistanceComputer.
    void set_query (const float *x) final {
        q = x;
    }

    /// compute distance of vector i to current query
    float operator () (idx_t i) final {
        return compute_distance (q, codes + i * code_size);
    }

    float symmetric_dis (idx_t i, idx_t j) override {
        return compute_code_distance (codes + i * code_size,
                                      codes + j * code_size);
    }

    float query_to_code (const uint8_t * code) const {
        return compute_distance (q, code);
    }
};
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
/** AVX distance computer: decodes and accumulates 8 components per
 *  iteration. The loops step by 8, so quant.d must be a multiple of 8
 *  (the selection functions only pick this width when dim % 8 == 0). */
template<class Quantizer, class Similarity>
struct DCTemplate_avx<Quantizer, Similarity, 8> : SQDistanceComputer
{
    using Sim = Similarity;

    Quantizer quant;   // decodes stored codes back to floats

    DCTemplate_avx(size_t d, const std::vector<float> &trained):
        quant(d, trained)
    {}

    // Distance/similarity between a raw float vector x and one code.
    float compute_distance(const float* x, const uint8_t* code) const {
        Similarity sim(x);
        sim.begin_8();
        for (size_t i = 0; i < quant.d; i += 8) {
            __m256 xi = quant.reconstruct_8_components(code, i);
            sim.add_8_components(xi);
        }
        return sim.result_8();
    }

    // Distance/similarity between two codes (both decoded on the fly).
    float compute_code_distance(const uint8_t* code1, const uint8_t* code2)
        const {
        Similarity sim(nullptr);   // no query needed for code-to-code
        sim.begin_8();
        for (size_t i = 0; i < quant.d; i += 8) {
            __m256 x1 = quant.reconstruct_8_components(code1, i);
            __m256 x2 = quant.reconstruct_8_components(code2, i);
            sim.add_8_components_2(x1, x2);
        }
        return sim.result_8();
    }

    // q, codes and code_size are inherited from SQDistanceComputer.
    void set_query (const float *x) final {
        q = x;
    }

    /// compute distance of vector i to current query
    float operator () (idx_t i) final {
        return compute_distance (q, codes + i * code_size);
    }

    float symmetric_dis (idx_t i, idx_t j) override {
        return compute_code_distance (codes + i * code_size,
                                      codes + j * code_size);
    }

    float query_to_code (const uint8_t * code) const {
        return compute_distance (q, code);
    }
};
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*******************************************************************
 * DistanceComputerByte: computes distances in the integer domain
 *******************************************************************/

// Primary template: widths 1 and 8 are specialized.
template<class Similarity, int SIMDWIDTH>
struct DistanceComputerByte_avx : SQDistanceComputer {};

/** Scalar byte-domain distance computer: the float query is truncated
 *  to bytes once in set_query, then every distance is an integer sum. */
template<class Similarity>
struct DistanceComputerByte_avx<Similarity, 1> : SQDistanceComputer {
    using Sim = Similarity;

    int d;                     // dimensionality
    std::vector<uint8_t> tmp;  // byte-quantized copy of the current query

    DistanceComputerByte_avx(int d, const std::vector<float> &): d(d), tmp(d) {
    }

    // Integer inner product or squared L2 between two byte codes; the
    // branch on Sim::metric_type is resolved at compile time.
    int compute_code_distance(const uint8_t* code1, const uint8_t* code2)
        const {
        int accu = 0;
        for (int i = 0; i < d; i++) {
            if (Sim::metric_type == METRIC_INNER_PRODUCT) {
                accu += int(code1[i]) * code2[i];
            } else {
                int diff = int(code1[i]) - code2[i];
                accu += diff * diff;
            }
        }
        return accu;
    }

    // Truncate the query to bytes; assumes components fit in
    // [0, 255] — TODO confirm against callers.
    void set_query (const float *x) final {
        for (int i = 0; i < d; i++) {
            tmp[i] = int(x[i]);
        }
    }

    int compute_distance(const float* x, const uint8_t* code) {
        set_query(x);
        return compute_code_distance(tmp.data(), code);
    }

    /// compute distance of vector i to current query
    float operator () (idx_t i) final {
        return compute_distance (q, codes + i * code_size);
    }

    float symmetric_dis (idx_t i, idx_t j) override {
        return compute_code_distance (codes + i * code_size,
                                      codes + j * code_size);
    }

    float query_to_code (const uint8_t * code) const {
        return compute_code_distance (tmp.data(), code);
    }
};
|
||||
|
||||
#ifdef USE_AVX
|
||||
|
||||
|
||||
/** AVX byte-domain distance computer. The inner loop consumes 16 bytes
 *  per iteration, so d must be a multiple of 16 (the selector only picks
 *  this class when d % 16 == 0). */
template<class Similarity>
struct DistanceComputerByte_avx<Similarity, 8> : SQDistanceComputer {
    using Sim = Similarity;

    int d;                     // dimensionality (multiple of 16)
    std::vector<uint8_t> tmp;  // byte-quantized copy of the current query

    DistanceComputerByte_avx(int d, const std::vector<float> &): d(d), tmp(d) {
    }

    // Integer IP or squared-L2 over 16 bytes per step: widen bytes to
    // 16-bit lanes, multiply-accumulate pairs into 32-bit lanes with
    // _mm256_madd_epi16, then reduce horizontally.
    int compute_code_distance(const uint8_t* code1, const uint8_t* code2)
        const {
        // __m256i accu = _mm256_setzero_ps ();
        __m256i accu = _mm256_setzero_si256 ();
        for (int i = 0; i < d; i += 16) {
            // load 16 bytes, convert to 16 uint16_t
            __m256i c1 = _mm256_cvtepu8_epi16
                (_mm_loadu_si128((__m128i*)(code1 + i)));
            __m256i c2 = _mm256_cvtepu8_epi16
                (_mm_loadu_si128((__m128i*)(code2 + i)));
            __m256i prod32;
            if (Sim::metric_type == METRIC_INNER_PRODUCT) {
                prod32 = _mm256_madd_epi16(c1, c2);
            } else {
                __m256i diff = _mm256_sub_epi16(c1, c2);
                prod32 = _mm256_madd_epi16(diff, diff);
            }
            accu = _mm256_add_epi32 (accu, prod32);
        }
        // horizontal sum of the eight 32-bit partial sums
        __m128i sum = _mm256_extractf128_si256(accu, 0);
        sum = _mm_add_epi32 (sum, _mm256_extractf128_si256(accu, 1));
        sum = _mm_hadd_epi32 (sum, sum);
        sum = _mm_hadd_epi32 (sum, sum);
        return _mm_cvtsi128_si32 (sum);
    }

    // Truncate the query to bytes (scalar; the vectorized version was
    // left unfinished below). Assumes components fit in [0, 255].
    void set_query (const float *x) final {
        /*
        for (int i = 0; i < d; i += 8) {
            __m256 xi = _mm256_loadu_ps (x + i);
            __m256i ci = _mm256_cvtps_epi32(xi);
        */
        for (int i = 0; i < d; i++) {
            tmp[i] = int(x[i]);
        }
    }

    int compute_distance(const float* x, const uint8_t* code) {
        set_query(x);
        return compute_code_distance(tmp.data(), code);
    }

    /// compute distance of vector i to current query
    float operator () (idx_t i) final {
        return compute_distance (q, codes + i * code_size);
    }

    float symmetric_dis (idx_t i, idx_t j) override {
        return compute_code_distance (codes + i * code_size,
                                      codes + j * code_size);
    }

    float query_to_code (const uint8_t * code) const {
        return compute_code_distance (tmp.data(), code);
    }
};
|
||||
|
||||
#endif
|
||||
|
||||
/*******************************************************************
 * select_distance_computer: runtime selection of template
 * specialization
 *******************************************************************/

/** Factory: pick the DCTemplate_avx / DistanceComputerByte_avx
 *  specialization matching qtype; the SIMD width is taken from
 *  Sim::simdwidth. Throws via FAISS_THROW_MSG on unknown qtype. */
template<class Sim>
SQDistanceComputer *select_distance_computer_avx (
          QuantizerType qtype,
          size_t d, const std::vector<float> & trained)
{
    constexpr int SIMDWIDTH = Sim::simdwidth;
    switch(qtype) {
    case QuantizerType::QT_8bit_uniform:
        return new DCTemplate_avx<QuantizerTemplate_avx<Codec8bit_avx, true, SIMDWIDTH>,
                                  Sim, SIMDWIDTH>(d, trained);

    case QuantizerType::QT_4bit_uniform:
        return new DCTemplate_avx<QuantizerTemplate_avx<Codec4bit_avx, true, SIMDWIDTH>,
                                  Sim, SIMDWIDTH>(d, trained);

    case QuantizerType::QT_8bit:
        return new DCTemplate_avx<QuantizerTemplate_avx<Codec8bit_avx, false, SIMDWIDTH>,
                                  Sim, SIMDWIDTH>(d, trained);

    case QuantizerType::QT_6bit:
        return new DCTemplate_avx<QuantizerTemplate_avx<Codec6bit_avx, false, SIMDWIDTH>,
                                  Sim, SIMDWIDTH>(d, trained);

    case QuantizerType::QT_4bit:
        return new DCTemplate_avx<QuantizerTemplate_avx<Codec4bit_avx, false, SIMDWIDTH>,
                                  Sim, SIMDWIDTH>(d, trained);

    case QuantizerType::QT_fp16:
        return new DCTemplate_avx
            <QuantizerFP16_avx<SIMDWIDTH>, Sim, SIMDWIDTH>(d, trained);

    case QuantizerType::QT_8bit_direct:
        // the byte-domain computer requires d to be a multiple of 16
        if (d % 16 == 0) {
            return new DistanceComputerByte_avx<Sim, SIMDWIDTH>(d, trained);
        } else {
            return new DCTemplate_avx
                <Quantizer8bitDirect_avx<SIMDWIDTH>, Sim, SIMDWIDTH>(d, trained);
        }
    }
    FAISS_THROW_MSG ("unknown qtype");
    return nullptr;
}
|
||||
|
||||
|
||||
} // namespace faiss
|
@ -12,52 +12,10 @@
|
||||
|
||||
namespace faiss {
|
||||
|
||||
#ifdef __AVX__
|
||||
#define USE_AVX
|
||||
#endif
|
||||
|
||||
|
||||
/*******************************************************************
|
||||
* ScalarQuantizer Distance Computer
|
||||
********************************************************************/
|
||||
|
||||
/* AVX */
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_L2_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
|
||||
#ifdef USE_AVX
|
||||
if (dim % 8 == 0) {
|
||||
return select_distance_computer<SimilarityL2<8>> (qtype, dim, trained);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
return select_distance_computer<SimilarityL2<1>> (qtype, dim, trained);
|
||||
}
|
||||
}
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_IP_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
|
||||
#ifdef USE_AVX
|
||||
if (dim % 8 == 0) {
|
||||
return select_distance_computer<SimilarityIP<8>> (qtype, dim, trained);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
return select_distance_computer<SimilarityIP<1>> (qtype, dim, trained);
|
||||
}
|
||||
}
|
||||
|
||||
Quantizer *
|
||||
sq_select_quantizer_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
|
||||
#ifdef USE_AVX
|
||||
if (dim % 8 == 0) {
|
||||
return select_quantizer_1<8> (qtype, dim, trained);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
return select_quantizer_1<1> (qtype, dim, trained);
|
||||
}
|
||||
}
|
||||
|
||||
/* SSE */
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_L2_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
|
||||
|
@ -13,24 +13,13 @@
|
||||
|
||||
namespace faiss {
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_L2_sse(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_L2_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_IP_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
sq_get_distance_computer_IP_sse(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
Quantizer *
|
||||
sq_select_quantizer_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_L2_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_IP_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
Quantizer *
|
||||
sq_select_quantizer_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
sq_select_quantizer_sse(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
} // namespace faiss
|
||||
|
46
core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.cpp
vendored
Normal file
46
core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.cpp
vendored
Normal file
@ -0,0 +1,46 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
#include <faiss/impl/ScalarQuantizerDC_avx.h>
|
||||
#include <faiss/impl/ScalarQuantizerCodec_avx.h>
|
||||
|
||||
namespace faiss {
|
||||
|
||||
/*******************************************************************
|
||||
* ScalarQuantizer Distance Computer
|
||||
********************************************************************/
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_L2_avx(QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
|
||||
if (dim % 8 == 0) {
|
||||
return select_distance_computer_avx<SimilarityL2_avx<8>>(qtype, dim, trained);
|
||||
} else {
|
||||
return select_distance_computer_avx<SimilarityL2_avx<1>>(qtype, dim, trained);
|
||||
}
|
||||
}
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_IP_avx(QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
|
||||
if (dim % 8 == 0) {
|
||||
return select_distance_computer_avx<SimilarityIP_avx<8>>(qtype, dim, trained);
|
||||
} else {
|
||||
return select_distance_computer_avx<SimilarityIP_avx<1>>(qtype, dim, trained);
|
||||
}
|
||||
}
|
||||
|
||||
Quantizer *
|
||||
sq_select_quantizer_avx(QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
|
||||
if (dim % 8 == 0) {
|
||||
return select_quantizer_1_avx<8>(qtype, dim, trained);
|
||||
} else {
|
||||
return select_quantizer_1_avx<1> (qtype, dim, trained);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace faiss
|
27
core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.h
vendored
Normal file
27
core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.h
vendored
Normal file
@ -0,0 +1,27 @@
|
||||
/**
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* This source code is licensed under the MIT license found in the
|
||||
* LICENSE file in the root directory of this source tree.
|
||||
*/
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <faiss/impl/ScalarQuantizerOp.h>
|
||||
|
||||
namespace faiss {
|
||||
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_L2_avx(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_IP_avx(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
Quantizer *
|
||||
sq_select_quantizer_avx(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
} // namespace faiss
|
@ -12,65 +12,39 @@
|
||||
|
||||
namespace faiss {
|
||||
|
||||
#ifdef __AVX__
|
||||
#define USE_AVX
|
||||
#endif
|
||||
|
||||
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__))
|
||||
#define USE_AVX_512
|
||||
#endif
|
||||
|
||||
/*******************************************************************
|
||||
* ScalarQuantizer Distance Computer
|
||||
********************************************************************/
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_L2_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
|
||||
#ifdef USE_AVX_512
|
||||
if (dim % 16 == 0) {
|
||||
return select_distance_computer_avx512<SimilarityL2_avx512<16>> (qtype, dim, trained);
|
||||
} else
|
||||
#endif
|
||||
#ifdef USE_AVX
|
||||
if (dim % 8 == 0) {
|
||||
} else if (dim % 8 == 0) {
|
||||
return select_distance_computer_avx512<SimilarityL2_avx512<8>> (qtype, dim, trained);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
} else {
|
||||
return select_distance_computer_avx512<SimilarityL2_avx512<1>> (qtype, dim, trained);
|
||||
}
|
||||
}
|
||||
|
||||
/** AVX-512 factory for the INNER-PRODUCT distance computer.
 *  Picks the widest similarity the dimensionality allows
 *  (16-wide, 8-wide, or scalar fallback). */
SQDistanceComputer *
sq_get_distance_computer_IP_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
    if (dim % 16 == 0) {
        // BUG FIX: this branch previously instantiated
        // SimilarityL2_avx512<16>, so the inner-product entry point
        // silently returned L2 distances for dim % 16 == 0.
        return select_distance_computer_avx512<SimilarityIP_avx512<16>> (qtype, dim, trained);
    } else if (dim % 8 == 0) {
        return select_distance_computer_avx512<SimilarityIP_avx512<8>> (qtype, dim, trained);
    } else {
        return select_distance_computer_avx512<SimilarityIP_avx512<1>> (qtype, dim, trained);
    }
}
|
||||
|
||||
Quantizer *
|
||||
sq_select_quantizer_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
|
||||
#ifdef USE_AVX_512
|
||||
if (dim % 16 == 0) {
|
||||
return select_quantizer_1_avx512<16> (qtype, dim, trained);
|
||||
} else
|
||||
#endif
|
||||
#ifdef USE_AVX
|
||||
if (dim % 8 == 0) {
|
||||
} else if (dim % 8 == 0) {
|
||||
return select_quantizer_1_avx512<8> (qtype, dim, trained);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
} else {
|
||||
return select_quantizer_1_avx512<1> (qtype, dim, trained);
|
||||
}
|
||||
}
|
||||
|
@ -14,15 +14,13 @@
|
||||
|
||||
namespace faiss {
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_L2_avx512(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_L2_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
SQDistanceComputer *
|
||||
sq_get_distance_computer_IP_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
sq_get_distance_computer_IP_avx512(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
Quantizer *
|
||||
sq_select_quantizer_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
sq_select_quantizer_avx512(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
|
||||
|
||||
} // namespace faiss
|
||||
|
@ -7,7 +7,7 @@ CXX = @CXX@
|
||||
CXXCPP = @CXXCPP@
|
||||
CPPFLAGS = -DFINTEGER=int @CPPFLAGS@ @OPENMP_CXXFLAGS@ @NVCC_CPPFLAGS@
|
||||
CXXFLAGS = -fPIC @ARCH_CXXFLAGS@ -Wno-sign-compare @CXXFLAGS@
|
||||
CPUFLAGS = -mavx2 -mf16c @ARCH_CPUFLAGS@
|
||||
CPUFLAGS = @ARCH_CPUFLAGS@
|
||||
LDFLAGS = @OPENMP_LDFLAGS@ @LDFLAGS@ @NVCC_LDFLAGS@
|
||||
LIBS = @BLAS_LIBS@ @LAPACK_LIBS@ @LIBS@ @NVCC_LIBS@
|
||||
PYTHONCFLAGS = @PYTHON_CFLAGS@ -I@NUMPY_INCLUDE@
|
||||
|
@ -24,31 +24,6 @@ namespace faiss {
|
||||
* Optimized distance/norm/inner prod computations
|
||||
*********************************************************/
|
||||
|
||||
#ifdef __AVX__
|
||||
/// Squared L2 distance between two vectors
|
||||
float fvec_L2sqr_avx (
|
||||
const float * x,
|
||||
const float * y,
|
||||
size_t d);
|
||||
|
||||
/// inner product
|
||||
float fvec_inner_product_avx (
|
||||
const float * x,
|
||||
const float * y,
|
||||
size_t d);
|
||||
|
||||
/// L1 distance
|
||||
float fvec_L1_avx (
|
||||
const float * x,
|
||||
const float * y,
|
||||
size_t d);
|
||||
|
||||
float fvec_Linf_avx (
|
||||
const float * x,
|
||||
const float * y,
|
||||
size_t d);
|
||||
#endif
|
||||
|
||||
#ifdef __SSE__
|
||||
float fvec_L2sqr_sse (
|
||||
const float * x,
|
||||
|
32
core/src/index/thirdparty/faiss/utils/distances_avx.h
vendored
Normal file
32
core/src/index/thirdparty/faiss/utils/distances_avx.h
vendored
Normal file
@ -0,0 +1,32 @@
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
/* All distance functions for L2 and IP distances.
|
||||
* The actual functions are implemented in distances_simd_avx512.cpp */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
namespace faiss {
|
||||
|
||||
/*********************************************************
|
||||
* Optimized distance/norm/inner prod computations
|
||||
*********************************************************/
|
||||
|
||||
/// Squared L2 distance between two vectors
|
||||
float
|
||||
fvec_L2sqr_avx(const float* x, const float* y, size_t d);
|
||||
|
||||
/// inner product
|
||||
float
|
||||
fvec_inner_product_avx(const float* x, const float* y, size_t d);
|
||||
|
||||
/// L1 distance
|
||||
float
|
||||
fvec_L1_avx(const float* x, const float* y, size_t d);
|
||||
|
||||
float
|
||||
fvec_Linf_avx(const float* x, const float* y, size_t d);
|
||||
|
||||
} // namespace faiss
|
@ -15,26 +15,18 @@ namespace faiss {
|
||||
*********************************************************/
|
||||
|
||||
/// Squared L2 distance between two vectors
|
||||
float fvec_L2sqr_avx512 (
|
||||
const float * x,
|
||||
const float * y,
|
||||
size_t d);
|
||||
float
|
||||
fvec_L2sqr_avx512(const float* x, const float* y, size_t d);
|
||||
|
||||
/// inner product
|
||||
float fvec_inner_product_avx512 (
|
||||
const float * x,
|
||||
const float * y,
|
||||
size_t d);
|
||||
float
|
||||
fvec_inner_product_avx512(const float * x, const float * y, size_t d);
|
||||
|
||||
/// L1 distance
|
||||
float fvec_L1_avx512 (
|
||||
const float * x,
|
||||
const float * y,
|
||||
size_t d);
|
||||
float
|
||||
fvec_L1_avx512(const float* x, const float* y, size_t d);
|
||||
|
||||
float fvec_Linf_avx512 (
|
||||
const float * x,
|
||||
const float * y,
|
||||
size_t d);
|
||||
float
|
||||
fvec_Linf_avx512(const float* x, const float* y, size_t d);
|
||||
|
||||
} // namespace faiss
|
||||
|
@ -27,10 +27,6 @@
|
||||
|
||||
namespace faiss {
|
||||
|
||||
#ifdef __AVX__
|
||||
#define USE_AVX
|
||||
#endif
|
||||
|
||||
/*********************************************************
|
||||
* Optimized distance computations
|
||||
*********************************************************/
|
||||
@ -313,171 +309,6 @@ void fvec_L2sqr_ny (float * dis, const float * x,
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(USE_AVX)
|
||||
|
||||
// reads 0 <= d < 8 floats as __m256
|
||||
static inline __m256 masked_read_8 (int d, const float *x)
|
||||
{
|
||||
assert (0 <= d && d < 8);
|
||||
if (d < 4) {
|
||||
__m256 res = _mm256_setzero_ps ();
|
||||
res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
|
||||
return res;
|
||||
} else {
|
||||
__m256 res = _mm256_setzero_ps ();
|
||||
res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
|
||||
res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
float fvec_inner_product_avx (const float * x,
|
||||
const float * y,
|
||||
size_t d)
|
||||
{
|
||||
__m256 msum1 = _mm256_setzero_ps();
|
||||
|
||||
while (d >= 8) {
|
||||
__m256 mx = _mm256_loadu_ps (x); x += 8;
|
||||
__m256 my = _mm256_loadu_ps (y); y += 8;
|
||||
msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my));
|
||||
d -= 8;
|
||||
}
|
||||
|
||||
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
|
||||
msum2 += _mm256_extractf128_ps(msum1, 0);
|
||||
|
||||
if (d >= 4) {
|
||||
__m128 mx = _mm_loadu_ps (x); x += 4;
|
||||
__m128 my = _mm_loadu_ps (y); y += 4;
|
||||
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
|
||||
d -= 4;
|
||||
}
|
||||
|
||||
if (d > 0) {
|
||||
__m128 mx = masked_read (d, x);
|
||||
__m128 my = masked_read (d, y);
|
||||
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
|
||||
}
|
||||
|
||||
msum2 = _mm_hadd_ps (msum2, msum2);
|
||||
msum2 = _mm_hadd_ps (msum2, msum2);
|
||||
return _mm_cvtss_f32 (msum2);
|
||||
}
|
||||
|
||||
float fvec_L2sqr_avx (const float * x,
|
||||
const float * y,
|
||||
size_t d)
|
||||
{
|
||||
__m256 msum1 = _mm256_setzero_ps();
|
||||
|
||||
while (d >= 8) {
|
||||
__m256 mx = _mm256_loadu_ps (x); x += 8;
|
||||
__m256 my = _mm256_loadu_ps (y); y += 8;
|
||||
const __m256 a_m_b1 = mx - my;
|
||||
msum1 += a_m_b1 * a_m_b1;
|
||||
d -= 8;
|
||||
}
|
||||
|
||||
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
|
||||
msum2 += _mm256_extractf128_ps(msum1, 0);
|
||||
|
||||
if (d >= 4) {
|
||||
__m128 mx = _mm_loadu_ps (x); x += 4;
|
||||
__m128 my = _mm_loadu_ps (y); y += 4;
|
||||
const __m128 a_m_b1 = mx - my;
|
||||
msum2 += a_m_b1 * a_m_b1;
|
||||
d -= 4;
|
||||
}
|
||||
|
||||
if (d > 0) {
|
||||
__m128 mx = masked_read (d, x);
|
||||
__m128 my = masked_read (d, y);
|
||||
__m128 a_m_b1 = mx - my;
|
||||
msum2 += a_m_b1 * a_m_b1;
|
||||
}
|
||||
|
||||
msum2 = _mm_hadd_ps (msum2, msum2);
|
||||
msum2 = _mm_hadd_ps (msum2, msum2);
|
||||
return _mm_cvtss_f32 (msum2);
|
||||
}
|
||||
|
||||
float fvec_L1_avx (const float * x, const float * y, size_t d)
|
||||
{
|
||||
__m256 msum1 = _mm256_setzero_ps();
|
||||
__m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL));
|
||||
|
||||
while (d >= 8) {
|
||||
__m256 mx = _mm256_loadu_ps (x); x += 8;
|
||||
__m256 my = _mm256_loadu_ps (y); y += 8;
|
||||
const __m256 a_m_b = mx - my;
|
||||
msum1 += _mm256_and_ps(signmask, a_m_b);
|
||||
d -= 8;
|
||||
}
|
||||
|
||||
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
|
||||
msum2 += _mm256_extractf128_ps(msum1, 0);
|
||||
__m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL));
|
||||
|
||||
if (d >= 4) {
|
||||
__m128 mx = _mm_loadu_ps (x); x += 4;
|
||||
__m128 my = _mm_loadu_ps (y); y += 4;
|
||||
const __m128 a_m_b = mx - my;
|
||||
msum2 += _mm_and_ps(signmask2, a_m_b);
|
||||
d -= 4;
|
||||
}
|
||||
|
||||
if (d > 0) {
|
||||
__m128 mx = masked_read (d, x);
|
||||
__m128 my = masked_read (d, y);
|
||||
__m128 a_m_b = mx - my;
|
||||
msum2 += _mm_and_ps(signmask2, a_m_b);
|
||||
}
|
||||
|
||||
msum2 = _mm_hadd_ps (msum2, msum2);
|
||||
msum2 = _mm_hadd_ps (msum2, msum2);
|
||||
return _mm_cvtss_f32 (msum2);
|
||||
}
|
||||
|
||||
float fvec_Linf_avx (const float * x, const float * y, size_t d)
|
||||
{
|
||||
__m256 msum1 = _mm256_setzero_ps();
|
||||
__m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL));
|
||||
|
||||
while (d >= 8) {
|
||||
__m256 mx = _mm256_loadu_ps (x); x += 8;
|
||||
__m256 my = _mm256_loadu_ps (y); y += 8;
|
||||
const __m256 a_m_b = mx - my;
|
||||
msum1 = _mm256_max_ps(msum1, _mm256_and_ps(signmask, a_m_b));
|
||||
d -= 8;
|
||||
}
|
||||
|
||||
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
|
||||
msum2 = _mm_max_ps (msum2, _mm256_extractf128_ps(msum1, 0));
|
||||
__m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL));
|
||||
|
||||
if (d >= 4) {
|
||||
__m128 mx = _mm_loadu_ps (x); x += 4;
|
||||
__m128 my = _mm_loadu_ps (y); y += 4;
|
||||
const __m128 a_m_b = mx - my;
|
||||
msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b));
|
||||
d -= 4;
|
||||
}
|
||||
|
||||
if (d > 0) {
|
||||
__m128 mx = masked_read (d, x);
|
||||
__m128 my = masked_read (d, y);
|
||||
__m128 a_m_b = mx - my;
|
||||
msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b));
|
||||
}
|
||||
|
||||
msum2 = _mm_max_ps(_mm_movehl_ps(msum2, msum2), msum2);
|
||||
msum2 = _mm_max_ps(msum2, _mm_shuffle_ps (msum2, msum2, 1));
|
||||
return _mm_cvtss_f32 (msum2);
|
||||
}
|
||||
|
||||
#endif /* defined(USE_AVX) */
|
||||
|
||||
#if defined(__SSE__) // But not AVX
|
||||
|
||||
float fvec_L1_sse (const float * x, const float * y, size_t d)
|
||||
|
213
core/src/index/thirdparty/faiss/utils/distances_simd_avx.cpp
vendored
Normal file
213
core/src/index/thirdparty/faiss/utils/distances_simd_avx.cpp
vendored
Normal file
@ -0,0 +1,213 @@
|
||||
|
||||
// -*- c++ -*-
|
||||
|
||||
#include <faiss/utils/distances_avx.h>
|
||||
#include <faiss/impl/FaissAssert.h>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
namespace faiss {
|
||||
|
||||
#ifdef __SSE__
|
||||
// reads 0 <= d < 4 floats as __m128
|
||||
static inline __m128 masked_read (int d, const float *x) {
|
||||
assert (0 <= d && d < 4);
|
||||
__attribute__((__aligned__(16))) float buf[4] = {0, 0, 0, 0};
|
||||
switch (d) {
|
||||
case 3:
|
||||
buf[2] = x[2];
|
||||
case 2:
|
||||
buf[1] = x[1];
|
||||
case 1:
|
||||
buf[0] = x[0];
|
||||
}
|
||||
return _mm_load_ps(buf);
|
||||
// cannot use AVX2 _mm_mask_set1_epi32
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __AVX__
|
||||
|
||||
// reads 0 <= d < 8 floats as __m256
|
||||
static inline __m256 masked_read_8 (int d, const float* x) {
|
||||
assert (0 <= d && d < 8);
|
||||
if (d < 4) {
|
||||
__m256 res = _mm256_setzero_ps ();
|
||||
res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
|
||||
return res;
|
||||
} else {
|
||||
__m256 res = _mm256_setzero_ps ();
|
||||
res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
|
||||
res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
float fvec_inner_product_avx (const float* x, const float* y, size_t d) {
|
||||
__m256 msum1 = _mm256_setzero_ps();
|
||||
|
||||
while (d >= 8) {
|
||||
__m256 mx = _mm256_loadu_ps (x); x += 8;
|
||||
__m256 my = _mm256_loadu_ps (y); y += 8;
|
||||
msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my));
|
||||
d -= 8;
|
||||
}
|
||||
|
||||
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
|
||||
msum2 += _mm256_extractf128_ps(msum1, 0);
|
||||
|
||||
if (d >= 4) {
|
||||
__m128 mx = _mm_loadu_ps (x); x += 4;
|
||||
__m128 my = _mm_loadu_ps (y); y += 4;
|
||||
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
|
||||
d -= 4;
|
||||
}
|
||||
|
||||
if (d > 0) {
|
||||
__m128 mx = masked_read (d, x);
|
||||
__m128 my = masked_read (d, y);
|
||||
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
|
||||
}
|
||||
|
||||
msum2 = _mm_hadd_ps (msum2, msum2);
|
||||
msum2 = _mm_hadd_ps (msum2, msum2);
|
||||
return _mm_cvtss_f32 (msum2);
|
||||
}
|
||||
|
||||
float fvec_L2sqr_avx (const float* x, const float* y, size_t d) {
|
||||
__m256 msum1 = _mm256_setzero_ps();
|
||||
|
||||
while (d >= 8) {
|
||||
__m256 mx = _mm256_loadu_ps (x); x += 8;
|
||||
__m256 my = _mm256_loadu_ps (y); y += 8;
|
||||
const __m256 a_m_b1 = mx - my;
|
||||
msum1 += a_m_b1 * a_m_b1;
|
||||
d -= 8;
|
||||
}
|
||||
|
||||
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
|
||||
msum2 += _mm256_extractf128_ps(msum1, 0);
|
||||
|
||||
if (d >= 4) {
|
||||
__m128 mx = _mm_loadu_ps (x); x += 4;
|
||||
__m128 my = _mm_loadu_ps (y); y += 4;
|
||||
const __m128 a_m_b1 = mx - my;
|
||||
msum2 += a_m_b1 * a_m_b1;
|
||||
d -= 4;
|
||||
}
|
||||
|
||||
if (d > 0) {
|
||||
__m128 mx = masked_read (d, x);
|
||||
__m128 my = masked_read (d, y);
|
||||
__m128 a_m_b1 = mx - my;
|
||||
msum2 += a_m_b1 * a_m_b1;
|
||||
}
|
||||
|
||||
msum2 = _mm_hadd_ps (msum2, msum2);
|
||||
msum2 = _mm_hadd_ps (msum2, msum2);
|
||||
return _mm_cvtss_f32 (msum2);
|
||||
}
|
||||
|
||||
float fvec_L1_avx (const float * x, const float * y, size_t d)
|
||||
{
|
||||
__m256 msum1 = _mm256_setzero_ps();
|
||||
__m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL));
|
||||
|
||||
while (d >= 8) {
|
||||
__m256 mx = _mm256_loadu_ps (x); x += 8;
|
||||
__m256 my = _mm256_loadu_ps (y); y += 8;
|
||||
const __m256 a_m_b = mx - my;
|
||||
msum1 += _mm256_and_ps(signmask, a_m_b);
|
||||
d -= 8;
|
||||
}
|
||||
|
||||
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
|
||||
msum2 += _mm256_extractf128_ps(msum1, 0);
|
||||
__m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL));
|
||||
|
||||
if (d >= 4) {
|
||||
__m128 mx = _mm_loadu_ps (x); x += 4;
|
||||
__m128 my = _mm_loadu_ps (y); y += 4;
|
||||
const __m128 a_m_b = mx - my;
|
||||
msum2 += _mm_and_ps(signmask2, a_m_b);
|
||||
d -= 4;
|
||||
}
|
||||
|
||||
if (d > 0) {
|
||||
__m128 mx = masked_read (d, x);
|
||||
__m128 my = masked_read (d, y);
|
||||
__m128 a_m_b = mx - my;
|
||||
msum2 += _mm_and_ps(signmask2, a_m_b);
|
||||
}
|
||||
|
||||
msum2 = _mm_hadd_ps (msum2, msum2);
|
||||
msum2 = _mm_hadd_ps (msum2, msum2);
|
||||
return _mm_cvtss_f32 (msum2);
|
||||
}
|
||||
|
||||
float fvec_Linf_avx (const float* x, const float* y, size_t d) {
|
||||
__m256 msum1 = _mm256_setzero_ps();
|
||||
__m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL));
|
||||
|
||||
while (d >= 8) {
|
||||
__m256 mx = _mm256_loadu_ps (x); x += 8;
|
||||
__m256 my = _mm256_loadu_ps (y); y += 8;
|
||||
const __m256 a_m_b = mx - my;
|
||||
msum1 = _mm256_max_ps(msum1, _mm256_and_ps(signmask, a_m_b));
|
||||
d -= 8;
|
||||
}
|
||||
|
||||
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
|
||||
msum2 = _mm_max_ps (msum2, _mm256_extractf128_ps(msum1, 0));
|
||||
__m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL));
|
||||
|
||||
if (d >= 4) {
|
||||
__m128 mx = _mm_loadu_ps (x); x += 4;
|
||||
__m128 my = _mm_loadu_ps (y); y += 4;
|
||||
const __m128 a_m_b = mx - my;
|
||||
msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b));
|
||||
d -= 4;
|
||||
}
|
||||
|
||||
if (d > 0) {
|
||||
__m128 mx = masked_read (d, x);
|
||||
__m128 my = masked_read (d, y);
|
||||
__m128 a_m_b = mx - my;
|
||||
msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b));
|
||||
}
|
||||
|
||||
msum2 = _mm_max_ps(_mm_movehl_ps(msum2, msum2), msum2);
|
||||
msum2 = _mm_max_ps(msum2, _mm_shuffle_ps (msum2, msum2, 1));
|
||||
return _mm_cvtss_f32 (msum2);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
float fvec_inner_product_avx(const float* x, const float* y, size_t d) {
|
||||
FAISS_ASSERT(false);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
float fvec_L2sqr_avx(const float* x, const float* y, size_t d) {
|
||||
FAISS_ASSERT(false);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
float fvec_L1_avx(const float* x, const float* y, size_t d) {
|
||||
FAISS_ASSERT(false);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
float fvec_Linf_avx (const float* x, const float* y, size_t d) {
|
||||
FAISS_ASSERT(false);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace faiss
|
@ -2,7 +2,6 @@
|
||||
// -*- c++ -*-
|
||||
|
||||
#include <faiss/utils/distances_avx512.h>
|
||||
#include <faiss/utils/distances.h>
|
||||
#include <faiss/impl/FaissAssert.h>
|
||||
|
||||
#include <cstdio>
|
||||
@ -34,10 +33,8 @@ static inline __m128 masked_read (int d, const float *x) {
|
||||
|
||||
#if (defined(__AVX512F__) && defined(__AVX512DQ__))
|
||||
|
||||
float fvec_inner_product_avx512 (const float * x,
|
||||
const float * y,
|
||||
size_t d)
|
||||
{
|
||||
float
|
||||
fvec_inner_product_avx512(const float* x, const float* y, size_t d) {
|
||||
__m512 msum0 = _mm512_setzero_ps();
|
||||
|
||||
while (d >= 16) {
|
||||
@ -78,10 +75,8 @@ float fvec_inner_product_avx512 (const float * x,
|
||||
return _mm_cvtss_f32 (msum2);
|
||||
}
|
||||
|
||||
float fvec_L2sqr_avx512 (const float * x,
|
||||
const float * y,
|
||||
size_t d)
|
||||
{
|
||||
float
|
||||
fvec_L2sqr_avx512(const float* x, const float* y, size_t d) {
|
||||
__m512 msum0 = _mm512_setzero_ps();
|
||||
|
||||
while (d >= 16) {
|
||||
@ -126,8 +121,8 @@ float fvec_L2sqr_avx512 (const float * x,
|
||||
return _mm_cvtss_f32 (msum2);
|
||||
}
|
||||
|
||||
float fvec_L1_avx512 (const float * x, const float * y, size_t d)
|
||||
{
|
||||
float
|
||||
fvec_L1_avx512(const float* x, const float* y, size_t d) {
|
||||
__m512 msum0 = _mm512_setzero_ps();
|
||||
__m512 signmask0 = __m512(_mm512_set1_epi32 (0x7fffffffUL));
|
||||
|
||||
@ -175,8 +170,8 @@ float fvec_L1_avx512 (const float * x, const float * y, size_t d)
|
||||
return _mm_cvtss_f32 (msum2);
|
||||
}
|
||||
|
||||
float fvec_Linf_avx512 (const float * x, const float * y, size_t d)
|
||||
{
|
||||
float
|
||||
fvec_Linf_avx512(const float* x, const float* y, size_t d) {
|
||||
__m512 msum0 = _mm512_setzero_ps();
|
||||
__m512 signmask0 = __m512(_mm512_set1_epi32 (0x7fffffffUL));
|
||||
|
||||
@ -226,30 +221,26 @@ float fvec_Linf_avx512 (const float * x, const float * y, size_t d)
|
||||
|
||||
#else
|
||||
|
||||
float fvec_inner_product_avx512 (const float * x,
|
||||
const float * y,
|
||||
size_t d)
|
||||
{
|
||||
float
|
||||
fvec_inner_product_avx512(const float* x, const float* y, size_t d) {
|
||||
FAISS_ASSERT(false);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
float fvec_L2sqr_avx512 (const float * x,
|
||||
const float * y,
|
||||
size_t d)
|
||||
{
|
||||
float
|
||||
fvec_L2sqr_avx512(const float* x, const float* y, size_t d) {
|
||||
FAISS_ASSERT(false);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
float fvec_L1_avx512 (const float * x, const float * y, size_t d)
|
||||
{
|
||||
float
|
||||
fvec_L1_avx512(const float* x, const float* y, size_t d) {
|
||||
FAISS_ASSERT(false);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
float fvec_Linf_avx512 (const float * x, const float * y, size_t d)
|
||||
{
|
||||
float
|
||||
fvec_Linf_avx512(const float* x, const float* y, size_t d) {
|
||||
FAISS_ASSERT(false);
|
||||
return 0.0;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user