Caiyd 2039 avx sse (#2136)

* #2039 clean redundant compile options

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* #2039 add distances_avx.h and distances_avx.cpp

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* #2039 add ScalarQuantizerDC_avx.cpp, ScalarQuantizerDC_avx.h and ScalarQuantizerCodec_avx.h

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* #2039 remove -mavx2 from default compile option

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* update changelog

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* #2039 clean code

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* #2039 clean code

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>
Cai Yudong 2020-04-27 15:01:47 +08:00 committed by GitHub
parent 509b87f103
commit 1384484b7d
22 changed files with 1360 additions and 692 deletions

View File

@ -22,6 +22,7 @@ Please mark all change in change log and use the issue from GitHub
## Improvement
- \#221 Refactor LOG macro
- \#2039 Support running Milvus on SSE-only CPUs
## Task

View File

@ -79,12 +79,12 @@ endif ()
include(ThirdPartyPackagesCore)
if (CMAKE_BUILD_TYPE STREQUAL "Release")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC -DELPP_THREAD_SAFE -fopenmp -mavx -mf16c -msse4 -mpopcnt")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC -DELPP_THREAD_SAFE -fopenmp")
if (KNOWHERE_GPU_VERSION)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3")
endif ()
else ()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g -fPIC -DELPP_THREAD_SAFE -fopenmp -mavx -mf16c -msse4 -mpopcnt")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g -fPIC -DELPP_THREAD_SAFE -fopenmp")
if (KNOWHERE_GPU_VERSION)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O0 -g")
endif ()

View File

@ -436,7 +436,7 @@ macro(build_faiss)
set(FAISS_CONFIGURE_ARGS
"--prefix=${FAISS_PREFIX}"
"CFLAGS=${EP_C_FLAGS}"
"CXXFLAGS=${EP_CXX_FLAGS} -mavx2 -mf16c -O3"
"CXXFLAGS=${EP_CXX_FLAGS} -mf16c -O3"
--without-python)
if (FAISS_WITH_MKL)

View File

@ -187,12 +187,14 @@ inline T euclidean_distance(const T* x, const T* y, int f) {
//#ifdef USE_AVX
// Horizontal single sum of 256bit vector.
#if 0 /* use FAISS distance calculation algorithm instead */
inline float hsum256_ps_avx(__m256 v) {
const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(v, 1), _mm256_castps256_ps128(v));
const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
return _mm_cvtss_f32(x32);
}
#endif
template<>
inline float dot<float>(const float* x, const float *y, int f) {

View File

@ -7,8 +7,10 @@
#include <faiss/FaissHook.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/ScalarQuantizerDC.h>
#include <faiss/impl/ScalarQuantizerDC_avx.h>
#include <faiss/impl/ScalarQuantizerDC_avx512.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/distances_avx.h>
#include <faiss/utils/distances_avx512.h>
#include <faiss/utils/instruction_set.h>
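These headers feed the runtime dispatch that replaces the global -mavx compile flags removed above: FaissHook consults instruction_set.h and routes calls to the _avx / _avx512 kernels only on CPUs that support them. A minimal sketch of that pattern, assuming a GCC/Clang toolchain; the hook-pointer and init-function names below are illustrative, not the actual FaissHook API:

#include <cstddef>

namespace faiss {

// Kernels declared in distances.h / distances_avx.h (part of this patch).
float fvec_L2sqr_sse(const float* x, const float* y, size_t d);
float fvec_L2sqr_avx(const float* x, const float* y, size_t d);

// Hypothetical hook pointer: generic translation units call through it and
// therefore never need AVX compile flags themselves.
float (*fvec_L2sqr_hook)(const float*, const float*, size_t) = fvec_L2sqr_sse;

void hook_init_example() {
    if (__builtin_cpu_supports("avx")) {    // runtime CPU feature check
        fvec_L2sqr_hook = fvec_L2sqr_avx;   // built with -mavx2 only in *avx.cpp
    }
}

} // namespace faiss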

View File

@ -7,6 +7,7 @@
HEADERS = $(wildcard *.h impl/*.h utils/*.h)
SRC = $(wildcard *.cpp impl/*.cpp utils/*.cpp)
AVX_SRC = $(wildcard *avx.cpp impl/*avx.cpp utils/*avx.cpp)
AVX512_SRC = $(wildcard *avx512.cpp impl/*avx512.cpp utils/*avx512.cpp)
OBJ = $(SRC:.cpp=.o)
INSTALLDIRS = $(DESTDIR)$(libdir) $(DESTDIR)$(includedir)/faiss
@ -42,6 +43,10 @@ libfaiss.$(SHAREDEXT): $(OBJ)
%.o: %.cpp
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -c $< -o $@
# support avx
%avx.o: %avx.cpp
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mavx2 -c $< -o $@
# support avx512
%avx512.o: %avx512.cpp
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mavx512f -mavx512dq -mavx512bw -c $< -o $@

View File

@ -11,13 +11,8 @@
#include <cstdio>
#include <algorithm>
#include <omp.h>
#ifdef __SSE__
#include <immintrin.h>
#endif
#include <faiss/FaissHook.h>
#include <faiss/utils/utils.h>
#include <faiss/impl/FaissAssert.h>
@ -40,9 +35,6 @@ namespace faiss {
* that hides the template mess.
********************************************************************/
#ifdef __AVX__
#define USE_AVX
#endif
/*******************************************************************
@ -444,14 +436,10 @@ InvertedListScanner* ScalarQuantizer::select_InvertedListScanner
if (d % 16 == 0 && support_avx512()) {
return sel0_InvertedListScanner<16>
(mt, this, quantizer, store_pairs, by_residual);
}
#ifdef USE_AVX
if (d % 8 == 0) {
} else if (d % 8 == 0) {
return sel0_InvertedListScanner<8>
(mt, this, quantizer, store_pairs, by_residual);
} else
#endif
{
} else {
return sel0_InvertedListScanner<1>
(mt, this, quantizer, store_pairs, by_residual);
}
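The branch above boils down to a dimension-based width selection; as a standalone illustration (the helper name is hypothetical, support_avx512() is the runtime check already used in this function):

static int select_simd_width_example(size_t d) {
    if (d % 16 == 0 && support_avx512()) return 16;  // e.g. d = 128 on an AVX-512 CPU
    if (d % 8 == 0)                      return 8;   // e.g. d = 104
    return 1;                                        // e.g. d = 100, scalar fallback
}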

View File

@ -11,13 +11,8 @@
#include <cstdio>
#include <algorithm>
#include <omp.h>
#ifdef __SSE__
#include <immintrin.h>
#endif
#include <faiss/utils/utils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/ScalarQuantizerOp.h>
@ -25,11 +20,6 @@
namespace faiss {
#ifdef __AVX__
#define USE_AVX
#endif
/*******************************************************************
* Codec: converts between values in [0, 1] and an index in a code
* array. The "i" parameter is the vector component index (not byte
@ -44,22 +34,6 @@ struct Codec8bit {
static float decode_component (const uint8_t *code, int i) {
return (code[i] + 0.5f) / 255.0f;
}
#ifdef USE_AVX
static __m256 decode_8_components (const uint8_t *code, int i) {
uint64_t c8 = *(uint64_t*)(code + i);
__m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8));
__m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32));
// __m256i i8 = _mm256_set_m128i(c4lo, c4hi);
__m256i i8 = _mm256_castsi128_si256 (c4lo);
i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
__m256 f8 = _mm256_cvtepi32_ps (i8);
__m256 half = _mm256_set1_ps (0.5f);
f8 += half;
__m256 one_255 = _mm256_set1_ps (1.f / 255.f);
return f8 * one_255;
}
#endif
};
@ -71,28 +45,6 @@ struct Codec4bit {
static float decode_component (const uint8_t *code, int i) {
return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
}
#ifdef USE_AVX
static __m256 decode_8_components (const uint8_t *code, int i) {
uint32_t c4 = *(uint32_t*)(code + (i >> 1));
uint32_t mask = 0x0f0f0f0f;
uint32_t c4ev = c4 & mask;
uint32_t c4od = (c4 >> 4) & mask;
// the 8 lower bytes of c8 contain the values
__m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev),
_mm_set1_epi32(c4od));
__m128i c4lo = _mm_cvtepu8_epi32 (c8);
__m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4));
__m256i i8 = _mm256_castsi128_si256 (c4lo);
i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
__m256 f8 = _mm256_cvtepi32_ps (i8);
__m256 half = _mm256_set1_ps (0.5f);
f8 += half;
__m256 one_255 = _mm256_set1_ps (1.f / 15.f);
return f8 * one_255;
}
#endif
};
struct Codec6bit {
@ -138,20 +90,6 @@ struct Codec6bit {
}
return (bits + 0.5f) / 63.0f;
}
#ifdef USE_AVX
static __m256 decode_8_components (const uint8_t *code, int i) {
return _mm256_set_ps
(decode_component(code, i + 7),
decode_component(code, i + 6),
decode_component(code, i + 5),
decode_component(code, i + 4),
decode_component(code, i + 3),
decode_component(code, i + 2),
decode_component(code, i + 1),
decode_component(code, i + 0));
}
#endif
};
@ -204,25 +142,6 @@ struct QuantizerTemplate<Codec, true, 1>: Quantizer {
};
#ifdef USE_AVX
template<class Codec>
struct QuantizerTemplate<Codec, true, 8>: QuantizerTemplate<Codec, true, 1> {
QuantizerTemplate (size_t d, const std::vector<float> &trained):
QuantizerTemplate<Codec, true, 1> (d, trained) {}
__m256 reconstruct_8_components (const uint8_t * code, int i) const
{
__m256 xi = Codec::decode_8_components (code, i);
return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff);
}
};
#endif
template<class Codec>
struct QuantizerTemplate<Codec, false, 1>: Quantizer {
const size_t d;
@ -257,22 +176,6 @@ struct QuantizerTemplate<Codec, false, 1>: Quantizer {
};
#ifdef USE_AVX
template<class Codec>
struct QuantizerTemplate<Codec, false, 8>: QuantizerTemplate<Codec, false, 1> {
QuantizerTemplate (size_t d, const std::vector<float> &trained):
QuantizerTemplate<Codec, false, 1> (d, trained) {}
__m256 reconstruct_8_components (const uint8_t * code, int i) const
{
__m256 xi = Codec::decode_8_components (code, i);
return _mm256_loadu_ps (this->vmin + i) + xi * _mm256_loadu_ps (this->vdiff + i);
}
};
#endif
/*******************************************************************
* FP16 quantizer
*******************************************************************/
@ -305,21 +208,6 @@ struct QuantizerFP16<1>: Quantizer {
}
};
#ifdef USE_AVX
template<>
struct QuantizerFP16<8>: QuantizerFP16<1> {
QuantizerFP16 (size_t d, const std::vector<float> &trained):
QuantizerFP16<1> (d, trained) {}
__m256 reconstruct_8_components (const uint8_t * code, int i) const
{
__m128i codei = _mm_loadu_si128 ((const __m128i*)(code + 2 * i));
return _mm256_cvtph_ps (codei);
}
};
#endif
/*******************************************************************
* 8bit_direct quantizer
@ -354,23 +242,6 @@ struct Quantizer8bitDirect<1>: Quantizer {
}
};
#ifdef USE_AVX
template<>
struct Quantizer8bitDirect<8>: Quantizer8bitDirect<1> {
Quantizer8bitDirect (size_t d, const std::vector<float> &trained):
Quantizer8bitDirect<1> (d, trained) {}
__m256 reconstruct_8_components (const uint8_t * code, int i) const
{
__m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8
__m256i y8 = _mm256_cvtepu8_epi32 (x8); // 8 * int32
return _mm256_cvtepi32_ps (y8); // 8 * float32
}
};
#endif
template<int SIMDWIDTH>
Quantizer *select_quantizer_1 (
@ -407,7 +278,6 @@ Quantizer *select_quantizer_1 (
template<int SIMDWIDTH>
struct SimilarityL2 {};
template<>
struct SimilarityL2<1> {
static constexpr int simdwidth = 1;
@ -441,89 +311,78 @@ struct SimilarityL2<1> {
}
};
#ifdef USE_AVX
/* same as SimilarityL2<1>, kept so the build passes */
template<>
struct SimilarityL2<8> {
static constexpr int simdwidth = 8;
static constexpr int simdwidth = 1;
static constexpr MetricType metric_type = METRIC_L2;
const float *y, *yi;
explicit SimilarityL2 (const float * y): y(y) {}
__m256 accu8;
void begin_8 () {
accu8 = _mm256_setzero_ps();
/******* scalar accumulator *******/
float accu;
void begin () {
accu = 0;
yi = y;
}
void add_8_components (__m256 x) {
__m256 yiv = _mm256_loadu_ps (yi);
yi += 8;
__m256 tmp = yiv - x;
accu8 += tmp * tmp;
void add_component (float x) {
float tmp = *yi++ - x;
accu += tmp * tmp;
}
void add_8_components_2 (__m256 x, __m256 y) {
__m256 tmp = y - x;
accu8 += tmp * tmp;
void add_component_2 (float x1, float x2) {
float tmp = x1 - x2;
accu += tmp * tmp;
}
float result_8 () {
__m256 sum = _mm256_hadd_ps(accu8, accu8);
__m256 sum2 = _mm256_hadd_ps(sum, sum);
// now add the 0th and 4th component
return
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
float result () {
return accu;
}
};
/* same as SimilarityL2<8>, kept so the build passes */
/* same as SimilarityL2<1>, kept so the build passes */
template<>
struct SimilarityL2<16> {
static constexpr int simdwidth = 8;
static constexpr int simdwidth = 1;
static constexpr MetricType metric_type = METRIC_L2;
const float *y, *yi;
explicit SimilarityL2 (const float * y): y(y) {}
__m256 accu8;
void begin_8 () {
accu8 = _mm256_setzero_ps();
/******* scalar accumulator *******/
float accu;
void begin () {
accu = 0;
yi = y;
}
void add_8_components (__m256 x) {
__m256 yiv = _mm256_loadu_ps (yi);
yi += 8;
__m256 tmp = yiv - x;
accu8 += tmp * tmp;
void add_component (float x) {
float tmp = *yi++ - x;
accu += tmp * tmp;
}
void add_8_components_2 (__m256 x, __m256 y) {
__m256 tmp = y - x;
accu8 += tmp * tmp;
void add_component_2 (float x1, float x2) {
float tmp = x1 - x2;
accu += tmp * tmp;
}
float result_8 () {
__m256 sum = _mm256_hadd_ps(accu8, accu8);
__m256 sum2 = _mm256_hadd_ps(sum, sum);
// now add the 0th and 4th component
return
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
float result () {
return accu;
}
};
#endif
template<int SIMDWIDTH>
struct SimilarityIP {};
template<>
struct SimilarityIP<1> {
static constexpr int simdwidth = 1;
@ -553,13 +412,11 @@ struct SimilarityIP<1> {
}
};
#ifdef USE_AVX
/* same as SimilarityIP<1>, kept so the build passes */
template<>
struct SimilarityIP<8> {
static constexpr int simdwidth = 8;
static constexpr int simdwidth = 1;
static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;
const float *y, *yi;
float accu;
@ -567,39 +424,29 @@ struct SimilarityIP<8> {
explicit SimilarityIP (const float * y):
y (y) {}
__m256 accu8;
void begin_8 () {
accu8 = _mm256_setzero_ps();
void begin () {
accu = 0;
yi = y;
}
void add_8_components (__m256 x) {
__m256 yiv = _mm256_loadu_ps (yi);
yi += 8;
accu8 += yiv * x;
void add_component (float x) {
accu += *yi++ * x;
}
void add_8_components_2 (__m256 x1, __m256 x2) {
accu8 += x1 * x2;
void add_component_2 (float x1, float x2) {
accu += x1 * x2;
}
float result_8 () {
__m256 sum = _mm256_hadd_ps(accu8, accu8);
__m256 sum2 = _mm256_hadd_ps(sum, sum);
// now add the 0th and 4th component
return
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
float result () {
return accu;
}
};
/* same as SimilarityIP<8>, kept so the build passes */
/* same as SimilarityIP<1>, kept so the build passes */
template<>
struct SimilarityIP<16> {
static constexpr int simdwidth = 8;
static constexpr int simdwidth = 1;
static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;
const float *y, *yi;
float accu;
@ -607,33 +454,23 @@ struct SimilarityIP<16> {
explicit SimilarityIP (const float * y):
y (y) {}
__m256 accu8;
void begin_8 () {
accu8 = _mm256_setzero_ps();
void begin () {
accu = 0;
yi = y;
}
void add_8_components (__m256 x) {
__m256 yiv = _mm256_loadu_ps (yi);
yi += 8;
accu8 += yiv * x;
void add_component (float x) {
accu += *yi++ * x;
}
void add_8_components_2 (__m256 x1, __m256 x2) {
accu8 += x1 * x2;
void add_component_2 (float x1, float x2) {
accu += x1 * x2;
}
float result_8 () {
__m256 sum = _mm256_hadd_ps(accu8, accu8);
__m256 sum2 = _mm256_hadd_ps(sum, sum);
// now add the 0th and 4th component
return
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
float result () {
return accu;
}
};
#endif
/*******************************************************************
@ -696,63 +533,6 @@ struct DCTemplate<Quantizer, Similarity, 1> : SQDistanceComputer
}
};
#ifdef USE_AVX
template<class Quantizer, class Similarity>
struct DCTemplate<Quantizer, Similarity, 8> : SQDistanceComputer
{
using Sim = Similarity;
Quantizer quant;
DCTemplate(size_t d, const std::vector<float> &trained):
quant(d, trained)
{}
float compute_distance(const float* x, const uint8_t* code) const {
Similarity sim(x);
sim.begin_8();
for (size_t i = 0; i < quant.d; i += 8) {
__m256 xi = quant.reconstruct_8_components(code, i);
sim.add_8_components(xi);
}
return sim.result_8();
}
float compute_code_distance(const uint8_t* code1, const uint8_t* code2)
const {
Similarity sim(nullptr);
sim.begin_8();
for (size_t i = 0; i < quant.d; i += 8) {
__m256 x1 = quant.reconstruct_8_components(code1, i);
__m256 x2 = quant.reconstruct_8_components(code2, i);
sim.add_8_components_2(x1, x2);
}
return sim.result_8();
}
void set_query (const float *x) final {
q = x;
}
/// compute distance of vector i to current query
float operator () (idx_t i) final {
return compute_distance (q, codes + i * code_size);
}
float symmetric_dis (idx_t i, idx_t j) override {
return compute_code_distance (codes + i * code_size,
codes + j * code_size);
}
float query_to_code (const uint8_t * code) const {
return compute_distance (q, code);
}
};
#endif
/*******************************************************************
* DistanceComputerByte: computes distances in the integer domain
@ -811,84 +591,12 @@ struct DistanceComputerByte<Similarity, 1> : SQDistanceComputer {
}
};
#ifdef USE_AVX
template<class Similarity>
struct DistanceComputerByte<Similarity, 8> : SQDistanceComputer {
using Sim = Similarity;
int d;
std::vector<uint8_t> tmp;
DistanceComputerByte(int d, const std::vector<float> &): d(d), tmp(d) {
}
int compute_code_distance(const uint8_t* code1, const uint8_t* code2)
const {
// __m256i accu = _mm256_setzero_ps ();
__m256i accu = _mm256_setzero_si256 ();
for (int i = 0; i < d; i += 16) {
// load 16 bytes, convert to 16 uint16_t
__m256i c1 = _mm256_cvtepu8_epi16
(_mm_loadu_si128((__m128i*)(code1 + i)));
__m256i c2 = _mm256_cvtepu8_epi16
(_mm_loadu_si128((__m128i*)(code2 + i)));
__m256i prod32;
if (Sim::metric_type == METRIC_INNER_PRODUCT) {
prod32 = _mm256_madd_epi16(c1, c2);
} else {
__m256i diff = _mm256_sub_epi16(c1, c2);
prod32 = _mm256_madd_epi16(diff, diff);
}
accu = _mm256_add_epi32 (accu, prod32);
}
__m128i sum = _mm256_extractf128_si256(accu, 0);
sum = _mm_add_epi32 (sum, _mm256_extractf128_si256(accu, 1));
sum = _mm_hadd_epi32 (sum, sum);
sum = _mm_hadd_epi32 (sum, sum);
return _mm_cvtsi128_si32 (sum);
}
void set_query (const float *x) final {
/*
for (int i = 0; i < d; i += 8) {
__m256 xi = _mm256_loadu_ps (x + i);
__m256i ci = _mm256_cvtps_epi32(xi);
*/
for (int i = 0; i < d; i++) {
tmp[i] = int(x[i]);
}
}
int compute_distance(const float* x, const uint8_t* code) {
set_query(x);
return compute_code_distance(tmp.data(), code);
}
/// compute distance of vector i to current query
float operator () (idx_t i) final {
return compute_distance (q, codes + i * code_size);
}
float symmetric_dis (idx_t i, idx_t j) override {
return compute_code_distance (codes + i * code_size,
codes + j * code_size);
}
float query_to_code (const uint8_t * code) const {
return compute_code_distance (tmp.data(), code);
}
};
#endif
/*******************************************************************
* select_distance_computer: runtime selection of template
* specialization
*******************************************************************/
template<class Sim>
SQDistanceComputer *select_distance_computer (
QuantizerType qtype,

View File

@ -0,0 +1,936 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <cstdio>
#include <algorithm>
#include <omp.h>
#ifdef __SSE__
#include <immintrin.h>
#endif
#include <faiss/utils/utils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/ScalarQuantizerOp.h>
namespace faiss {
#ifdef __AVX__
#define USE_AVX
#endif
/*******************************************************************
* Codec: converts between values in [0, 1] and an index in a code
* array. The "i" parameter is the vector component index (not byte
* index).
*/
struct Codec8bit_avx {
static void encode_component (float x, uint8_t *code, int i) {
code[i] = (int)(255 * x);
}
static float decode_component (const uint8_t *code, int i) {
return (code[i] + 0.5f) / 255.0f;
}
#ifdef USE_AVX
static __m256 decode_8_components (const uint8_t *code, int i) {
uint64_t c8 = *(uint64_t*)(code + i);
__m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8));
__m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32));
// __m256i i8 = _mm256_set_m128i(c4lo, c4hi);
__m256i i8 = _mm256_castsi128_si256 (c4lo);
i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
__m256 f8 = _mm256_cvtepi32_ps (i8);
__m256 half = _mm256_set1_ps (0.5f);
f8 += half;
__m256 one_255 = _mm256_set1_ps (1.f / 255.f);
return f8 * one_255;
}
#endif
};
struct Codec4bit_avx {
static void encode_component (float x, uint8_t *code, int i) {
code [i / 2] |= (int)(x * 15.0) << ((i & 1) << 2);
}
static float decode_component (const uint8_t *code, int i) {
return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
}
#ifdef USE_AVX
static __m256 decode_8_components (const uint8_t *code, int i) {
uint32_t c4 = *(uint32_t*)(code + (i >> 1));
uint32_t mask = 0x0f0f0f0f;
uint32_t c4ev = c4 & mask;
uint32_t c4od = (c4 >> 4) & mask;
// the 8 lower bytes of c8 contain the values
__m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev),
_mm_set1_epi32(c4od));
__m128i c4lo = _mm_cvtepu8_epi32 (c8);
__m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4));
__m256i i8 = _mm256_castsi128_si256 (c4lo);
i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
__m256 f8 = _mm256_cvtepi32_ps (i8);
__m256 half = _mm256_set1_ps (0.5f);
f8 += half;
__m256 one_255 = _mm256_set1_ps (1.f / 15.f);
return f8 * one_255;
}
#endif
};
struct Codec6bit_avx {
static void encode_component (float x, uint8_t *code, int i) {
int bits = (int)(x * 63.0);
code += (i >> 2) * 3;
switch(i & 3) {
case 0:
code[0] |= bits;
break;
case 1:
code[0] |= bits << 6;
code[1] |= bits >> 2;
break;
case 2:
code[1] |= bits << 4;
code[2] |= bits >> 4;
break;
case 3:
code[2] |= bits << 2;
break;
}
}
static float decode_component (const uint8_t *code, int i) {
uint8_t bits;
code += (i >> 2) * 3;
switch(i & 3) {
case 0:
bits = code[0] & 0x3f;
break;
case 1:
bits = code[0] >> 6;
bits |= (code[1] & 0xf) << 2;
break;
case 2:
bits = code[1] >> 4;
bits |= (code[2] & 3) << 4;
break;
case 3:
bits = code[2] >> 2;
break;
}
return (bits + 0.5f) / 63.0f;
}
#ifdef USE_AVX
static __m256 decode_8_components (const uint8_t *code, int i) {
return _mm256_set_ps
(decode_component(code, i + 7),
decode_component(code, i + 6),
decode_component(code, i + 5),
decode_component(code, i + 4),
decode_component(code, i + 3),
decode_component(code, i + 2),
decode_component(code, i + 1),
decode_component(code, i + 0));
}
#endif
};
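// --- Illustrative sketch, not part of the original header -------------------
// Round-trip of the codec contract documented above: encode_component() maps a
// component in [0, 1] to an integer code, decode_component() returns the centre
// of that quantization cell, so the reconstruction error is at most half a
// step (0.5f / 255 for the 8-bit codec).
inline float codec8bit_avx_roundtrip_example() {
    uint8_t code[4] = {0, 0, 0, 0};
    const float x[4] = {0.0f, 0.25f, 0.5f, 1.0f};
    float max_err = 0.0f;
    for (int i = 0; i < 4; i++) {
        Codec8bit_avx::encode_component(x[i], code, i);               // e.g. 0.5f -> 127
        float err = Codec8bit_avx::decode_component(code, i) - x[i];  // 127 -> 0.5f
        if (err < 0) err = -err;
        if (err > max_err) max_err = err;
    }
    return max_err;   // <= 0.5f / 255 (half a quantization step), up to float rounding
}
// -----------------------------------------------------------------------------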
/*******************************************************************
* Quantizer: normalizes scalar vector components, then passes them
* through a codec
*******************************************************************/
template<class Codec, bool uniform, int SIMD>
struct QuantizerTemplate_avx {};
template<class Codec>
struct QuantizerTemplate_avx<Codec, true, 1>: Quantizer {
const size_t d;
const float vmin, vdiff;
QuantizerTemplate_avx(size_t d, const std::vector<float> &trained):
d(d), vmin(trained[0]), vdiff(trained[1])
{
}
void encode_vector(const float* x, uint8_t* code) const final {
for (size_t i = 0; i < d; i++) {
float xi = (x[i] - vmin) / vdiff;
if (xi < 0) {
xi = 0;
}
if (xi > 1.0) {
xi = 1.0;
}
Codec::encode_component(xi, code, i);
}
}
void decode_vector(const uint8_t* code, float* x) const final {
for (size_t i = 0; i < d; i++) {
float xi = Codec::decode_component(code, i);
x[i] = vmin + xi * vdiff;
}
}
float reconstruct_component (const uint8_t * code, int i) const
{
float xi = Codec::decode_component (code, i);
return vmin + xi * vdiff;
}
};
#ifdef USE_AVX
template<class Codec>
struct QuantizerTemplate_avx<Codec, true, 8>: QuantizerTemplate_avx<Codec, true, 1> {
QuantizerTemplate_avx (size_t d, const std::vector<float> &trained):
QuantizerTemplate_avx<Codec, true, 1> (d, trained) {}
__m256 reconstruct_8_components (const uint8_t * code, int i) const
{
__m256 xi = Codec::decode_8_components (code, i);
return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff);
}
};
#endif
template<class Codec>
struct QuantizerTemplate_avx<Codec, false, 1>: Quantizer {
const size_t d;
const float *vmin, *vdiff;
QuantizerTemplate_avx (size_t d, const std::vector<float> &trained):
d(d), vmin(trained.data()), vdiff(trained.data() + d) {}
void encode_vector(const float* x, uint8_t* code) const final {
for (size_t i = 0; i < d; i++) {
float xi = (x[i] - vmin[i]) / vdiff[i];
if (xi < 0)
xi = 0;
if (xi > 1.0)
xi = 1.0;
Codec::encode_component(xi, code, i);
}
}
void decode_vector(const uint8_t* code, float* x) const final {
for (size_t i = 0; i < d; i++) {
float xi = Codec::decode_component(code, i);
x[i] = vmin[i] + xi * vdiff[i];
}
}
float reconstruct_component (const uint8_t * code, int i) const
{
float xi = Codec::decode_component (code, i);
return vmin[i] + xi * vdiff[i];
}
};
#ifdef USE_AVX
template<class Codec>
struct QuantizerTemplate_avx<Codec, false, 8>: QuantizerTemplate_avx<Codec, false, 1> {
QuantizerTemplate_avx (size_t d, const std::vector<float> &trained):
QuantizerTemplate_avx<Codec, false, 1> (d, trained) {}
__m256 reconstruct_8_components (const uint8_t * code, int i) const
{
__m256 xi = Codec::decode_8_components (code, i);
return _mm256_loadu_ps (this->vmin + i) + xi * _mm256_loadu_ps (this->vdiff + i);
}
};
#endif
/*******************************************************************
* FP16 quantizer
*******************************************************************/
template<int SIMDWIDTH>
struct QuantizerFP16_avx {};
template<>
struct QuantizerFP16_avx<1>: Quantizer {
const size_t d;
QuantizerFP16_avx(size_t d, const std::vector<float> & /* unused */):
d(d) {}
void encode_vector(const float* x, uint8_t* code) const final {
for (size_t i = 0; i < d; i++) {
((uint16_t*)code)[i] = encode_fp16(x[i]);
}
}
void decode_vector(const uint8_t* code, float* x) const final {
for (size_t i = 0; i < d; i++) {
x[i] = decode_fp16(((uint16_t*)code)[i]);
}
}
float reconstruct_component (const uint8_t * code, int i) const
{
return decode_fp16(((uint16_t*)code)[i]);
}
};
#ifdef USE_AVX
template<>
struct QuantizerFP16_avx<8>: QuantizerFP16_avx<1> {
QuantizerFP16_avx (size_t d, const std::vector<float> &trained):
QuantizerFP16_avx<1> (d, trained) {}
__m256 reconstruct_8_components (const uint8_t * code, int i) const
{
__m128i codei = _mm_loadu_si128 ((const __m128i*)(code + 2 * i));
return _mm256_cvtph_ps (codei);
}
};
#endif
/*******************************************************************
* 8bit_direct quantizer
*******************************************************************/
template<int SIMDWIDTH>
struct Quantizer8bitDirect_avx {};
template<>
struct Quantizer8bitDirect_avx<1>: Quantizer {
const size_t d;
Quantizer8bitDirect_avx(size_t d, const std::vector<float> & /* unused */):
d(d) {}
void encode_vector(const float* x, uint8_t* code) const final {
for (size_t i = 0; i < d; i++) {
code[i] = (uint8_t)x[i];
}
}
void decode_vector(const uint8_t* code, float* x) const final {
for (size_t i = 0; i < d; i++) {
x[i] = code[i];
}
}
float reconstruct_component (const uint8_t * code, int i) const
{
return code[i];
}
};
#ifdef USE_AVX
template<>
struct Quantizer8bitDirect_avx<8>: Quantizer8bitDirect_avx<1> {
Quantizer8bitDirect_avx (size_t d, const std::vector<float> &trained):
Quantizer8bitDirect_avx<1> (d, trained) {}
__m256 reconstruct_8_components (const uint8_t * code, int i) const
{
__m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8
__m256i y8 = _mm256_cvtepu8_epi32 (x8); // 8 * int32
return _mm256_cvtepi32_ps (y8); // 8 * float32
}
};
#endif
template<int SIMDWIDTH>
Quantizer *select_quantizer_1_avx (
QuantizerType qtype,
size_t d, const std::vector<float> & trained)
{
switch(qtype) {
case QuantizerType::QT_8bit:
return new QuantizerTemplate_avx<Codec8bit_avx, false, SIMDWIDTH>(d, trained);
case QuantizerType::QT_6bit:
return new QuantizerTemplate_avx<Codec6bit_avx, false, SIMDWIDTH>(d, trained);
case QuantizerType::QT_4bit:
return new QuantizerTemplate_avx<Codec4bit_avx, false, SIMDWIDTH>(d, trained);
case QuantizerType::QT_8bit_uniform:
return new QuantizerTemplate_avx<Codec8bit_avx, true, SIMDWIDTH>(d, trained);
case QuantizerType::QT_4bit_uniform:
return new QuantizerTemplate_avx<Codec4bit_avx, true, SIMDWIDTH>(d, trained);
case QuantizerType::QT_fp16:
return new QuantizerFP16_avx<SIMDWIDTH> (d, trained);
case QuantizerType::QT_8bit_direct:
return new Quantizer8bitDirect_avx<SIMDWIDTH> (d, trained);
}
FAISS_THROW_MSG ("unknown qtype");
}
/*******************************************************************
* Similarity: gets vector components and computes a similarity wrt. a
* query vector stored in the object. The data fields just encapsulate
* an accumulator.
*/
template<int SIMDWIDTH>
struct SimilarityL2_avx {};
template<>
struct SimilarityL2_avx<1> {
static constexpr int simdwidth = 1;
static constexpr MetricType metric_type = METRIC_L2;
const float *y, *yi;
explicit SimilarityL2_avx (const float * y): y(y) {}
/******* scalar accumulator *******/
float accu;
void begin () {
accu = 0;
yi = y;
}
void add_component (float x) {
float tmp = *yi++ - x;
accu += tmp * tmp;
}
void add_component_2 (float x1, float x2) {
float tmp = x1 - x2;
accu += tmp * tmp;
}
float result () {
return accu;
}
};
#ifdef USE_AVX
template<>
struct SimilarityL2_avx<8> {
static constexpr int simdwidth = 8;
static constexpr MetricType metric_type = METRIC_L2;
const float *y, *yi;
explicit SimilarityL2_avx (const float * y): y(y) {}
__m256 accu8;
void begin_8 () {
accu8 = _mm256_setzero_ps();
yi = y;
}
void add_8_components (__m256 x) {
__m256 yiv = _mm256_loadu_ps (yi);
yi += 8;
__m256 tmp = yiv - x;
accu8 += tmp * tmp;
}
void add_8_components_2 (__m256 x, __m256 y) {
__m256 tmp = y - x;
accu8 += tmp * tmp;
}
float result_8 () {
__m256 sum = _mm256_hadd_ps(accu8, accu8);
__m256 sum2 = _mm256_hadd_ps(sum, sum);
// now add the 0th and 4th component
return
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
}
};
/* same as SimilarityL2<8>, kept so the build passes */
template<>
struct SimilarityL2_avx<16> {
static constexpr int simdwidth = 8;
static constexpr MetricType metric_type = METRIC_L2;
const float *y, *yi;
explicit SimilarityL2_avx (const float * y): y(y) {}
__m256 accu8;
void begin_8 () {
accu8 = _mm256_setzero_ps();
yi = y;
}
void add_8_components (__m256 x) {
__m256 yiv = _mm256_loadu_ps (yi);
yi += 8;
__m256 tmp = yiv - x;
accu8 += tmp * tmp;
}
void add_8_components_2 (__m256 x, __m256 y) {
__m256 tmp = y - x;
accu8 += tmp * tmp;
}
float result_8 () {
__m256 sum = _mm256_hadd_ps(accu8, accu8);
__m256 sum2 = _mm256_hadd_ps(sum, sum);
// now add the 0th and 4th component
return
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
}
};
#endif
template<int SIMDWIDTH>
struct SimilarityIP_avx {};
template<>
struct SimilarityIP_avx<1> {
static constexpr int simdwidth = 1;
static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;
const float *y, *yi;
float accu;
explicit SimilarityIP_avx (const float * y):
y (y) {}
void begin () {
accu = 0;
yi = y;
}
void add_component (float x) {
accu += *yi++ * x;
}
void add_component_2 (float x1, float x2) {
accu += x1 * x2;
}
float result () {
return accu;
}
};
#ifdef USE_AVX
template<>
struct SimilarityIP_avx<8> {
static constexpr int simdwidth = 8;
static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;
const float *y, *yi;
float accu;
explicit SimilarityIP_avx (const float * y):
y (y) {}
__m256 accu8;
void begin_8 () {
accu8 = _mm256_setzero_ps();
yi = y;
}
void add_8_components (__m256 x) {
__m256 yiv = _mm256_loadu_ps (yi);
yi += 8;
accu8 += yiv * x;
}
void add_8_components_2 (__m256 x1, __m256 x2) {
accu8 += x1 * x2;
}
float result_8 () {
__m256 sum = _mm256_hadd_ps(accu8, accu8);
__m256 sum2 = _mm256_hadd_ps(sum, sum);
// now add the 0th and 4th component
return
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
}
};
/* same as SimilarityIP<8>, kept so the build passes */
template<>
struct SimilarityIP_avx<16> {
static constexpr int simdwidth = 8;
static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;
const float *y, *yi;
float accu;
explicit SimilarityIP_avx (const float * y):
y (y) {}
__m256 accu8;
void begin_8 () {
accu8 = _mm256_setzero_ps();
yi = y;
}
void add_8_components (__m256 x) {
__m256 yiv = _mm256_loadu_ps (yi);
yi += 8;
accu8 += yiv * x;
}
void add_8_components_2 (__m256 x1, __m256 x2) {
accu8 += x1 * x2;
}
float result_8 () {
__m256 sum = _mm256_hadd_ps(accu8, accu8);
__m256 sum2 = _mm256_hadd_ps(sum, sum);
// now add the 0th and 4th component
return
_mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
}
};
#endif
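// --- Illustrative sketch, not part of the original header -------------------
// How the scalar accumulators above are driven (the 8-wide variants follow the
// same begin/add/result protocol, one __m256 group of components at a time):
inline float similarity_l2_scalar_example() {
    const float y[2] = {3.0f, 5.0f};    // query vector held by the similarity
    const float x[2] = {1.0f, 2.0f};    // components reconstructed from a code
    SimilarityL2_avx<1> sim(y);
    sim.begin();
    sim.add_component(x[0]);            // (3 - 1)^2 = 4
    sim.add_component(x[1]);            // (5 - 2)^2 = 9
    return sim.result();                // 13 = squared L2 distance
}
// -----------------------------------------------------------------------------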
/*******************************************************************
* DistanceComputer: combines a similarity and a quantizer to do
* code-to-vector or code-to-code comparisons
*******************************************************************/
template<class Quantizer, class Similarity, int SIMDWIDTH>
struct DCTemplate_avx : SQDistanceComputer {};
template<class Quantizer, class Similarity>
struct DCTemplate_avx<Quantizer, Similarity, 1> : SQDistanceComputer
{
using Sim = Similarity;
Quantizer quant;
DCTemplate_avx(size_t d, const std::vector<float> &trained):
quant(d, trained)
{}
float compute_distance(const float* x, const uint8_t* code) const {
Similarity sim(x);
sim.begin();
for (size_t i = 0; i < quant.d; i++) {
float xi = quant.reconstruct_component(code, i);
sim.add_component(xi);
}
return sim.result();
}
float compute_code_distance(const uint8_t* code1, const uint8_t* code2)
const {
Similarity sim(nullptr);
sim.begin();
for (size_t i = 0; i < quant.d; i++) {
float x1 = quant.reconstruct_component(code1, i);
float x2 = quant.reconstruct_component(code2, i);
sim.add_component_2(x1, x2);
}
return sim.result();
}
void set_query (const float *x) final {
q = x;
}
/// compute distance of vector i to current query
float operator () (idx_t i) final {
return compute_distance (q, codes + i * code_size);
}
float symmetric_dis (idx_t i, idx_t j) override {
return compute_code_distance (codes + i * code_size,
codes + j * code_size);
}
float query_to_code (const uint8_t * code) const {
return compute_distance (q, code);
}
};
#ifdef USE_AVX
template<class Quantizer, class Similarity>
struct DCTemplate_avx<Quantizer, Similarity, 8> : SQDistanceComputer
{
using Sim = Similarity;
Quantizer quant;
DCTemplate_avx(size_t d, const std::vector<float> &trained):
quant(d, trained)
{}
float compute_distance(const float* x, const uint8_t* code) const {
Similarity sim(x);
sim.begin_8();
for (size_t i = 0; i < quant.d; i += 8) {
__m256 xi = quant.reconstruct_8_components(code, i);
sim.add_8_components(xi);
}
return sim.result_8();
}
float compute_code_distance(const uint8_t* code1, const uint8_t* code2)
const {
Similarity sim(nullptr);
sim.begin_8();
for (size_t i = 0; i < quant.d; i += 8) {
__m256 x1 = quant.reconstruct_8_components(code1, i);
__m256 x2 = quant.reconstruct_8_components(code2, i);
sim.add_8_components_2(x1, x2);
}
return sim.result_8();
}
void set_query (const float *x) final {
q = x;
}
/// compute distance of vector i to current query
float operator () (idx_t i) final {
return compute_distance (q, codes + i * code_size);
}
float symmetric_dis (idx_t i, idx_t j) override {
return compute_code_distance (codes + i * code_size,
codes + j * code_size);
}
float query_to_code (const uint8_t * code) const {
return compute_distance (q, code);
}
};
#endif
/*******************************************************************
* DistanceComputerByte: computes distances in the integer domain
*******************************************************************/
template<class Similarity, int SIMDWIDTH>
struct DistanceComputerByte_avx : SQDistanceComputer {};
template<class Similarity>
struct DistanceComputerByte_avx<Similarity, 1> : SQDistanceComputer {
using Sim = Similarity;
int d;
std::vector<uint8_t> tmp;
DistanceComputerByte_avx(int d, const std::vector<float> &): d(d), tmp(d) {
}
int compute_code_distance(const uint8_t* code1, const uint8_t* code2)
const {
int accu = 0;
for (int i = 0; i < d; i++) {
if (Sim::metric_type == METRIC_INNER_PRODUCT) {
accu += int(code1[i]) * code2[i];
} else {
int diff = int(code1[i]) - code2[i];
accu += diff * diff;
}
}
return accu;
}
void set_query (const float *x) final {
for (int i = 0; i < d; i++) {
tmp[i] = int(x[i]);
}
}
int compute_distance(const float* x, const uint8_t* code) {
set_query(x);
return compute_code_distance(tmp.data(), code);
}
/// compute distance of vector i to current query
float operator () (idx_t i) final {
return compute_distance (q, codes + i * code_size);
}
float symmetric_dis (idx_t i, idx_t j) override {
return compute_code_distance (codes + i * code_size,
codes + j * code_size);
}
float query_to_code (const uint8_t * code) const {
return compute_code_distance (tmp.data(), code);
}
};
#ifdef USE_AVX
template<class Similarity>
struct DistanceComputerByte_avx<Similarity, 8> : SQDistanceComputer {
using Sim = Similarity;
int d;
std::vector<uint8_t> tmp;
DistanceComputerByte_avx(int d, const std::vector<float> &): d(d), tmp(d) {
}
int compute_code_distance(const uint8_t* code1, const uint8_t* code2)
const {
// __m256i accu = _mm256_setzero_ps ();
__m256i accu = _mm256_setzero_si256 ();
for (int i = 0; i < d; i += 16) {
// load 16 bytes, convert to 16 uint16_t
__m256i c1 = _mm256_cvtepu8_epi16
(_mm_loadu_si128((__m128i*)(code1 + i)));
__m256i c2 = _mm256_cvtepu8_epi16
(_mm_loadu_si128((__m128i*)(code2 + i)));
__m256i prod32;
if (Sim::metric_type == METRIC_INNER_PRODUCT) {
prod32 = _mm256_madd_epi16(c1, c2);
} else {
__m256i diff = _mm256_sub_epi16(c1, c2);
prod32 = _mm256_madd_epi16(diff, diff);
}
accu = _mm256_add_epi32 (accu, prod32);
}
__m128i sum = _mm256_extractf128_si256(accu, 0);
sum = _mm_add_epi32 (sum, _mm256_extractf128_si256(accu, 1));
sum = _mm_hadd_epi32 (sum, sum);
sum = _mm_hadd_epi32 (sum, sum);
return _mm_cvtsi128_si32 (sum);
}
void set_query (const float *x) final {
/*
for (int i = 0; i < d; i += 8) {
__m256 xi = _mm256_loadu_ps (x + i);
__m256i ci = _mm256_cvtps_epi32(xi);
*/
for (int i = 0; i < d; i++) {
tmp[i] = int(x[i]);
}
}
int compute_distance(const float* x, const uint8_t* code) {
set_query(x);
return compute_code_distance(tmp.data(), code);
}
/// compute distance of vector i to current query
float operator () (idx_t i) final {
return compute_distance (q, codes + i * code_size);
}
float symmetric_dis (idx_t i, idx_t j) override {
return compute_code_distance (codes + i * code_size,
codes + j * code_size);
}
float query_to_code (const uint8_t * code) const {
return compute_code_distance (tmp.data(), code);
}
};
#endif
/*******************************************************************
* select_distance_computer: runtime selection of template
* specialization
*******************************************************************/
template<class Sim>
SQDistanceComputer *select_distance_computer_avx (
QuantizerType qtype,
size_t d, const std::vector<float> & trained)
{
constexpr int SIMDWIDTH = Sim::simdwidth;
switch(qtype) {
case QuantizerType::QT_8bit_uniform:
return new DCTemplate_avx<QuantizerTemplate_avx<Codec8bit_avx, true, SIMDWIDTH>,
Sim, SIMDWIDTH>(d, trained);
case QuantizerType::QT_4bit_uniform:
return new DCTemplate_avx<QuantizerTemplate_avx<Codec4bit_avx, true, SIMDWIDTH>,
Sim, SIMDWIDTH>(d, trained);
case QuantizerType::QT_8bit:
return new DCTemplate_avx<QuantizerTemplate_avx<Codec8bit_avx, false, SIMDWIDTH>,
Sim, SIMDWIDTH>(d, trained);
case QuantizerType::QT_6bit:
return new DCTemplate_avx<QuantizerTemplate_avx<Codec6bit_avx, false, SIMDWIDTH>,
Sim, SIMDWIDTH>(d, trained);
case QuantizerType::QT_4bit:
return new DCTemplate_avx<QuantizerTemplate_avx<Codec4bit_avx, false, SIMDWIDTH>,
Sim, SIMDWIDTH>(d, trained);
case QuantizerType::QT_fp16:
return new DCTemplate_avx
<QuantizerFP16_avx<SIMDWIDTH>, Sim, SIMDWIDTH>(d, trained);
case QuantizerType::QT_8bit_direct:
if (d % 16 == 0) {
return new DistanceComputerByte_avx<Sim, SIMDWIDTH>(d, trained);
} else {
return new DCTemplate_avx
<Quantizer8bitDirect_avx<SIMDWIDTH>, Sim, SIMDWIDTH>(d, trained);
}
}
FAISS_THROW_MSG ("unknown qtype");
return nullptr;
}
} // namespace faiss

View File

@ -12,52 +12,10 @@
namespace faiss {
#ifdef __AVX__
#define USE_AVX
#endif
/*******************************************************************
* ScalarQuantizer Distance Computer
********************************************************************/
/* AVX */
SQDistanceComputer *
sq_get_distance_computer_L2_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
#ifdef USE_AVX
if (dim % 8 == 0) {
return select_distance_computer<SimilarityL2<8>> (qtype, dim, trained);
} else
#endif
{
return select_distance_computer<SimilarityL2<1>> (qtype, dim, trained);
}
}
SQDistanceComputer *
sq_get_distance_computer_IP_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
#ifdef USE_AVX
if (dim % 8 == 0) {
return select_distance_computer<SimilarityIP<8>> (qtype, dim, trained);
} else
#endif
{
return select_distance_computer<SimilarityIP<1>> (qtype, dim, trained);
}
}
Quantizer *
sq_select_quantizer_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
#ifdef USE_AVX
if (dim % 8 == 0) {
return select_quantizer_1<8> (qtype, dim, trained);
} else
#endif
{
return select_quantizer_1<1> (qtype, dim, trained);
}
}
/* SSE */
SQDistanceComputer *
sq_get_distance_computer_L2_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {

View File

@ -13,24 +13,13 @@
namespace faiss {
SQDistanceComputer *
sq_get_distance_computer_L2_sse(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
SQDistanceComputer *
sq_get_distance_computer_L2_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
SQDistanceComputer *
sq_get_distance_computer_IP_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
sq_get_distance_computer_IP_sse(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
Quantizer *
sq_select_quantizer_avx (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
SQDistanceComputer *
sq_get_distance_computer_L2_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
SQDistanceComputer *
sq_get_distance_computer_IP_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
Quantizer *
sq_select_quantizer_sse (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
sq_select_quantizer_sse(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
} // namespace faiss

View File

@ -0,0 +1,46 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/impl/ScalarQuantizerDC_avx.h>
#include <faiss/impl/ScalarQuantizerCodec_avx.h>
namespace faiss {
/*******************************************************************
* ScalarQuantizer Distance Computer
********************************************************************/
SQDistanceComputer *
sq_get_distance_computer_L2_avx(QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
if (dim % 8 == 0) {
return select_distance_computer_avx<SimilarityL2_avx<8>>(qtype, dim, trained);
} else {
return select_distance_computer_avx<SimilarityL2_avx<1>>(qtype, dim, trained);
}
}
SQDistanceComputer *
sq_get_distance_computer_IP_avx(QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
if (dim % 8 == 0) {
return select_distance_computer_avx<SimilarityIP_avx<8>>(qtype, dim, trained);
} else {
return select_distance_computer_avx<SimilarityIP_avx<1>>(qtype, dim, trained);
}
}
Quantizer *
sq_select_quantizer_avx(QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
if (dim % 8 == 0) {
return select_quantizer_1_avx<8>(qtype, dim, trained);
} else {
return select_quantizer_1_avx<1> (qtype, dim, trained);
}
}
} // namespace faiss
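For context, a hedged sketch of how these factory functions are consumed; the quantizer type, dimension, trained parameters and code buffer are placeholders, and the member calls follow the SQDistanceComputer structs shown in ScalarQuantizerCodec_avx.h above:

#include <cstdint>
#include <vector>
#include <faiss/impl/ScalarQuantizerDC_avx.h>

// `trained` comes from an already-trained ScalarQuantizer; `code` is one
// encoded database vector of dimension `dim`.
float query_one_code_example(size_t dim,
                             const std::vector<float>& trained,
                             const float* query,
                             const uint8_t* code) {
    faiss::SQDistanceComputer* dc = faiss::sq_get_distance_computer_L2_avx(
            faiss::QuantizerType::QT_8bit, dim, trained);
    dc->set_query(query);                  // raw float query vector
    float dist = dc->query_to_code(code);  // distance to the encoded vector
    delete dc;
    return dist;
}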

View File

@ -0,0 +1,27 @@
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <faiss/impl/ScalarQuantizerOp.h>
namespace faiss {
SQDistanceComputer *
sq_get_distance_computer_L2_avx(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
SQDistanceComputer *
sq_get_distance_computer_IP_avx(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
Quantizer *
sq_select_quantizer_avx(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
} // namespace faiss

View File

@ -12,65 +12,39 @@
namespace faiss {
#ifdef __AVX__
#define USE_AVX
#endif
#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__))
#define USE_AVX_512
#endif
/*******************************************************************
* ScalarQuantizer Distance Computer
********************************************************************/
SQDistanceComputer *
sq_get_distance_computer_L2_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
#ifdef USE_AVX_512
if (dim % 16 == 0) {
return select_distance_computer_avx512<SimilarityL2_avx512<16>> (qtype, dim, trained);
} else
#endif
#ifdef USE_AVX
if (dim % 8 == 0) {
} else if (dim % 8 == 0) {
return select_distance_computer_avx512<SimilarityL2_avx512<8>> (qtype, dim, trained);
} else
#endif
{
} else {
return select_distance_computer_avx512<SimilarityL2_avx512<1>> (qtype, dim, trained);
}
}
SQDistanceComputer *
sq_get_distance_computer_IP_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
#ifdef USE_AVX_512
if (dim % 16 == 0) {
return select_distance_computer_avx512<SimilarityIP_avx512<16>> (qtype, dim, trained);
} else
#endif
#ifdef USE_AVX
if (dim % 8 == 0) {
} else if (dim % 8 == 0) {
return select_distance_computer_avx512<SimilarityIP_avx512<8>> (qtype, dim, trained);
} else
#endif
{
} else {
return select_distance_computer_avx512<SimilarityIP_avx512<1>> (qtype, dim, trained);
}
}
Quantizer *
sq_select_quantizer_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained) {
#ifdef USE_AVX_512
if (dim % 16 == 0) {
return select_quantizer_1_avx512<16> (qtype, dim, trained);
} else
#endif
#ifdef USE_AVX
if (dim % 8 == 0) {
} else if (dim % 8 == 0) {
return select_quantizer_1_avx512<8> (qtype, dim, trained);
} else
#endif
{
} else {
return select_quantizer_1_avx512<1> (qtype, dim, trained);
}
}

View File

@ -14,15 +14,13 @@
namespace faiss {
SQDistanceComputer *
sq_get_distance_computer_L2_avx512(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
SQDistanceComputer *
sq_get_distance_computer_L2_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
SQDistanceComputer *
sq_get_distance_computer_IP_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
sq_get_distance_computer_IP_avx512(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
Quantizer *
sq_select_quantizer_avx512 (QuantizerType qtype, size_t dim, const std::vector<float>& trained);
sq_select_quantizer_avx512(QuantizerType qtype, size_t dim, const std::vector<float>& trained);
} // namespace faiss

View File

@ -7,7 +7,7 @@ CXX = @CXX@
CXXCPP = @CXXCPP@
CPPFLAGS = -DFINTEGER=int @CPPFLAGS@ @OPENMP_CXXFLAGS@ @NVCC_CPPFLAGS@
CXXFLAGS = -fPIC @ARCH_CXXFLAGS@ -Wno-sign-compare @CXXFLAGS@
CPUFLAGS = -mavx2 -mf16c @ARCH_CPUFLAGS@
CPUFLAGS = @ARCH_CPUFLAGS@
LDFLAGS = @OPENMP_LDFLAGS@ @LDFLAGS@ @NVCC_LDFLAGS@
LIBS = @BLAS_LIBS@ @LAPACK_LIBS@ @LIBS@ @NVCC_LIBS@
PYTHONCFLAGS = @PYTHON_CFLAGS@ -I@NUMPY_INCLUDE@

View File

@ -24,31 +24,6 @@ namespace faiss {
* Optimized distance/norm/inner prod computations
*********************************************************/
#ifdef __AVX__
/// Squared L2 distance between two vectors
float fvec_L2sqr_avx (
const float * x,
const float * y,
size_t d);
/// inner product
float fvec_inner_product_avx (
const float * x,
const float * y,
size_t d);
/// L1 distance
float fvec_L1_avx (
const float * x,
const float * y,
size_t d);
float fvec_Linf_avx (
const float * x,
const float * y,
size_t d);
#endif
#ifdef __SSE__
float fvec_L2sqr_sse (
const float * x,

View File

@ -0,0 +1,32 @@
// -*- c++ -*-
/* All distance functions for L2 and IP distances.
* The actual functions are implemented in distances_avx.cpp */
#pragma once
#include <stddef.h>
namespace faiss {
/*********************************************************
* Optimized distance/norm/inner prod computations
*********************************************************/
/// Squared L2 distance between two vectors
float
fvec_L2sqr_avx(const float* x, const float* y, size_t d);
/// inner product
float
fvec_inner_product_avx(const float* x, const float* y, size_t d);
/// L1 distance
float
fvec_L1_avx(const float* x, const float* y, size_t d);
float
fvec_Linf_avx(const float* x, const float* y, size_t d);
} // namespace faiss
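A quick sanity check of what these kernels compute, using hand-picked values; it assumes an AVX-capable build (the fallback stubs in distances_avx.cpp below assert otherwise):

#include <faiss/utils/distances_avx.h>
#include <cassert>
#include <cmath>

void fvec_l2sqr_avx_example() {
    // ||x - y||^2 = (1-0)^2 + (2-2)^2 + (3-5)^2 = 1 + 0 + 4 = 5
    const float x[3] = {1.0f, 2.0f, 3.0f};
    const float y[3] = {0.0f, 2.0f, 5.0f};
    float d = faiss::fvec_L2sqr_avx(x, y, 3);
    assert(std::fabs(d - 5.0f) < 1e-5f);
}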

View File

@ -15,26 +15,18 @@ namespace faiss {
*********************************************************/
/// Squared L2 distance between two vectors
float fvec_L2sqr_avx512 (
const float * x,
const float * y,
size_t d);
float
fvec_L2sqr_avx512(const float* x, const float* y, size_t d);
/// inner product
float fvec_inner_product_avx512 (
const float * x,
const float * y,
size_t d);
float
fvec_inner_product_avx512(const float * x, const float * y, size_t d);
/// L1 distance
float fvec_L1_avx512 (
const float * x,
const float * y,
size_t d);
float
fvec_L1_avx512(const float* x, const float* y, size_t d);
float fvec_Linf_avx512 (
const float * x,
const float * y,
size_t d);
float
fvec_Linf_avx512(const float* x, const float* y, size_t d);
} // namespace faiss

View File

@ -27,10 +27,6 @@
namespace faiss {
#ifdef __AVX__
#define USE_AVX
#endif
/*********************************************************
* Optimized distance computations
*********************************************************/
@ -313,171 +309,6 @@ void fvec_L2sqr_ny (float * dis, const float * x,
#endif
#if defined(USE_AVX)
// reads 0 <= d < 8 floats as __m256
static inline __m256 masked_read_8 (int d, const float *x)
{
assert (0 <= d && d < 8);
if (d < 4) {
__m256 res = _mm256_setzero_ps ();
res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
return res;
} else {
__m256 res = _mm256_setzero_ps ();
res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
return res;
}
}
float fvec_inner_product_avx (const float * x,
const float * y,
size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my));
d -= 8;
}
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
}
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
float fvec_L2sqr_avx (const float * x,
const float * y,
size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
const __m256 a_m_b1 = mx - my;
msum1 += a_m_b1 * a_m_b1;
d -= 8;
}
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
const __m128 a_m_b1 = mx - my;
msum2 += a_m_b1 * a_m_b1;
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
__m128 a_m_b1 = mx - my;
msum2 += a_m_b1 * a_m_b1;
}
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
float fvec_L1_avx (const float * x, const float * y, size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
__m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL));
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
const __m256 a_m_b = mx - my;
msum1 += _mm256_and_ps(signmask, a_m_b);
d -= 8;
}
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
__m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL));
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
const __m128 a_m_b = mx - my;
msum2 += _mm_and_ps(signmask2, a_m_b);
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
__m128 a_m_b = mx - my;
msum2 += _mm_and_ps(signmask2, a_m_b);
}
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
float fvec_Linf_avx (const float * x, const float * y, size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
__m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL));
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
const __m256 a_m_b = mx - my;
msum1 = _mm256_max_ps(msum1, _mm256_and_ps(signmask, a_m_b));
d -= 8;
}
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 = _mm_max_ps (msum2, _mm256_extractf128_ps(msum1, 0));
__m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL));
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
const __m128 a_m_b = mx - my;
msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b));
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
__m128 a_m_b = mx - my;
msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b));
}
msum2 = _mm_max_ps(_mm_movehl_ps(msum2, msum2), msum2);
msum2 = _mm_max_ps(msum2, _mm_shuffle_ps (msum2, msum2, 1));
return _mm_cvtss_f32 (msum2);
}
#endif /* defined(USE_AVX) */
#if defined(__SSE__) // But not AVX
float fvec_L1_sse (const float * x, const float * y, size_t d)

View File

@ -0,0 +1,213 @@
// -*- c++ -*-
#include <faiss/utils/distances_avx.h>
#include <faiss/impl/FaissAssert.h>
#include <cstdio>
#include <cassert>
#include <cstring>
#include <cmath>
#include <immintrin.h>
namespace faiss {
#ifdef __SSE__
// reads 0 <= d < 4 floats as __m128
static inline __m128 masked_read (int d, const float *x) {
assert (0 <= d && d < 4);
__attribute__((__aligned__(16))) float buf[4] = {0, 0, 0, 0};
switch (d) {
case 3:
buf[2] = x[2];
case 2:
buf[1] = x[1];
case 1:
buf[0] = x[0];
}
return _mm_load_ps(buf);
// cannot use AVX2 _mm_mask_set1_epi32
}
#endif
#ifdef __AVX__
// reads 0 <= d < 8 floats as __m256
static inline __m256 masked_read_8 (int d, const float* x) {
assert (0 <= d && d < 8);
if (d < 4) {
__m256 res = _mm256_setzero_ps ();
res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
return res;
} else {
__m256 res = _mm256_setzero_ps ();
res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
return res;
}
}
float fvec_inner_product_avx (const float* x, const float* y, size_t d) {
__m256 msum1 = _mm256_setzero_ps();
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my));
d -= 8;
}
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
}
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
float fvec_L2sqr_avx (const float* x, const float* y, size_t d) {
__m256 msum1 = _mm256_setzero_ps();
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
const __m256 a_m_b1 = mx - my;
msum1 += a_m_b1 * a_m_b1;
d -= 8;
}
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
const __m128 a_m_b1 = mx - my;
msum2 += a_m_b1 * a_m_b1;
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
__m128 a_m_b1 = mx - my;
msum2 += a_m_b1 * a_m_b1;
}
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
float fvec_L1_avx (const float * x, const float * y, size_t d)
{
__m256 msum1 = _mm256_setzero_ps();
__m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL));
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
const __m256 a_m_b = mx - my;
msum1 += _mm256_and_ps(signmask, a_m_b);
d -= 8;
}
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 += _mm256_extractf128_ps(msum1, 0);
__m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL));
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
const __m128 a_m_b = mx - my;
msum2 += _mm_and_ps(signmask2, a_m_b);
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
__m128 a_m_b = mx - my;
msum2 += _mm_and_ps(signmask2, a_m_b);
}
msum2 = _mm_hadd_ps (msum2, msum2);
msum2 = _mm_hadd_ps (msum2, msum2);
return _mm_cvtss_f32 (msum2);
}
float fvec_Linf_avx (const float* x, const float* y, size_t d) {
__m256 msum1 = _mm256_setzero_ps();
__m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL));
while (d >= 8) {
__m256 mx = _mm256_loadu_ps (x); x += 8;
__m256 my = _mm256_loadu_ps (y); y += 8;
const __m256 a_m_b = mx - my;
msum1 = _mm256_max_ps(msum1, _mm256_and_ps(signmask, a_m_b));
d -= 8;
}
__m128 msum2 = _mm256_extractf128_ps(msum1, 1);
msum2 = _mm_max_ps (msum2, _mm256_extractf128_ps(msum1, 0));
__m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL));
if (d >= 4) {
__m128 mx = _mm_loadu_ps (x); x += 4;
__m128 my = _mm_loadu_ps (y); y += 4;
const __m128 a_m_b = mx - my;
msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b));
d -= 4;
}
if (d > 0) {
__m128 mx = masked_read (d, x);
__m128 my = masked_read (d, y);
__m128 a_m_b = mx - my;
msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b));
}
msum2 = _mm_max_ps(_mm_movehl_ps(msum2, msum2), msum2);
msum2 = _mm_max_ps(msum2, _mm_shuffle_ps (msum2, msum2, 1));
return _mm_cvtss_f32 (msum2);
}
#else
float fvec_inner_product_avx(const float* x, const float* y, size_t d) {
FAISS_ASSERT(false);
return 0.0;
}
float fvec_L2sqr_avx(const float* x, const float* y, size_t d) {
FAISS_ASSERT(false);
return 0.0;
}
float fvec_L1_avx(const float* x, const float* y, size_t d) {
FAISS_ASSERT(false);
return 0.0;
}
float fvec_Linf_avx (const float* x, const float* y, size_t d) {
FAISS_ASSERT(false);
return 0.0;
}
#endif
} // namespace faiss

View File

@ -2,7 +2,6 @@
// -*- c++ -*-
#include <faiss/utils/distances_avx512.h>
#include <faiss/utils/distances.h>
#include <faiss/impl/FaissAssert.h>
#include <cstdio>
@ -34,10 +33,8 @@ static inline __m128 masked_read (int d, const float *x) {
#if (defined(__AVX512F__) && defined(__AVX512DQ__))
float fvec_inner_product_avx512 (const float * x,
const float * y,
size_t d)
{
float
fvec_inner_product_avx512(const float* x, const float* y, size_t d) {
__m512 msum0 = _mm512_setzero_ps();
while (d >= 16) {
@ -78,10 +75,8 @@ float fvec_inner_product_avx512 (const float * x,
return _mm_cvtss_f32 (msum2);
}
float fvec_L2sqr_avx512 (const float * x,
const float * y,
size_t d)
{
float
fvec_L2sqr_avx512(const float* x, const float* y, size_t d) {
__m512 msum0 = _mm512_setzero_ps();
while (d >= 16) {
@ -126,8 +121,8 @@ float fvec_L2sqr_avx512 (const float * x,
return _mm_cvtss_f32 (msum2);
}
float fvec_L1_avx512 (const float * x, const float * y, size_t d)
{
float
fvec_L1_avx512(const float* x, const float* y, size_t d) {
__m512 msum0 = _mm512_setzero_ps();
__m512 signmask0 = __m512(_mm512_set1_epi32 (0x7fffffffUL));
@ -175,8 +170,8 @@ float fvec_L1_avx512 (const float * x, const float * y, size_t d)
return _mm_cvtss_f32 (msum2);
}
float fvec_Linf_avx512 (const float * x, const float * y, size_t d)
{
float
fvec_Linf_avx512(const float* x, const float* y, size_t d) {
__m512 msum0 = _mm512_setzero_ps();
__m512 signmask0 = __m512(_mm512_set1_epi32 (0x7fffffffUL));
@ -226,30 +221,26 @@ float fvec_Linf_avx512 (const float * x, const float * y, size_t d)
#else
float fvec_inner_product_avx512 (const float * x,
const float * y,
size_t d)
{
float
fvec_inner_product_avx512(const float* x, const float* y, size_t d) {
FAISS_ASSERT(false);
return 0.0;
}
float fvec_L2sqr_avx512 (const float * x,
const float * y,
size_t d)
{
float
fvec_L2sqr_avx512(const float* x, const float* y, size_t d) {
FAISS_ASSERT(false);
return 0.0;
}
float fvec_L1_avx512 (const float * x, const float * y, size_t d)
{
float
fvec_L1_avx512(const float* x, const float* y, size_t d) {
FAISS_ASSERT(false);
return 0.0;
}
float fvec_Linf_avx512 (const float * x, const float * y, size_t d)
{
float
fvec_Linf_avx512(const float* x, const float* y, size_t d) {
FAISS_ASSERT(false);
return 0.0;
}