mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-12-02 11:59:00 +08:00
Opt faiss builder (#3067)
Signed-off-by: shengjun.li <shengjun.li@zilliz.com>
This commit is contained in:
parent
68b9a033a2
commit
b1b1165c56
@ -7,12 +7,13 @@ Please mark all change in change log and use the issue from GitHub
|
|||||||
## Bug
|
## Bug
|
||||||
- \#2890 Fix the index size caculation in cache
|
- \#2890 Fix the index size caculation in cache
|
||||||
- \#2952 Fix the result merging of IVF_PQ IP
|
- \#2952 Fix the result merging of IVF_PQ IP
|
||||||
- \#2975 Fix config ut failed
|
- \#2975 Fix config UT failed
|
||||||
|
|
||||||
## Feature
|
## Feature
|
||||||
|
|
||||||
## Improvement
|
## Improvement
|
||||||
- \#2653 Improve IVF search performance when NQ and nProbe are both large
|
- \#2653 Improve IVF search performance when NQ and nProbe are both large
|
||||||
|
- \#2828 Let Faiss not to compile half float by default
|
||||||
|
|
||||||
## Task
|
## Task
|
||||||
|
|
||||||
|
@ -27,6 +27,7 @@ GpuIndexIVF::GpuIndexIVF(GpuResources* resources,
|
|||||||
nlist(nlistIn),
|
nlist(nlistIn),
|
||||||
nprobe(1),
|
nprobe(1),
|
||||||
quantizer(nullptr) {
|
quantizer(nullptr) {
|
||||||
|
|
||||||
init_();
|
init_();
|
||||||
|
|
||||||
// Only IP and L2 are supported for now
|
// Only IP and L2 are supported for now
|
||||||
|
@ -261,3 +261,4 @@ GpuIndexIVFFlat::searchImpl_(int n,
|
|||||||
|
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
|
||||||
|
@ -34,6 +34,10 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources,
|
|||||||
bitsPerCode_(0),
|
bitsPerCode_(0),
|
||||||
reserveMemoryVecs_(0),
|
reserveMemoryVecs_(0),
|
||||||
index_(nullptr) {
|
index_(nullptr) {
|
||||||
|
#ifndef FAISS_USE_FLOAT16
|
||||||
|
FAISS_ASSERT(!ivfpqConfig_.useFloat16LookupTables);
|
||||||
|
#endif
|
||||||
|
|
||||||
copyFrom(index);
|
copyFrom(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -55,6 +59,10 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources,
|
|||||||
bitsPerCode_(bitsPerCode),
|
bitsPerCode_(bitsPerCode),
|
||||||
reserveMemoryVecs_(0),
|
reserveMemoryVecs_(0),
|
||||||
index_(nullptr) {
|
index_(nullptr) {
|
||||||
|
#ifndef FAISS_USE_FLOAT16
|
||||||
|
FAISS_ASSERT(!config.useFloat16LookupTables);
|
||||||
|
#endif
|
||||||
|
|
||||||
verifySettings_();
|
verifySettings_();
|
||||||
|
|
||||||
// We haven't trained ourselves, so don't construct the PQ index yet
|
// We haven't trained ourselves, so don't construct the PQ index yet
|
||||||
@ -424,9 +432,11 @@ GpuIndexIVFPQ::verifySettings_() const {
|
|||||||
// We must have enough shared memory on the current device to store
|
// We must have enough shared memory on the current device to store
|
||||||
// our lookup distances
|
// our lookup distances
|
||||||
int lookupTableSize = sizeof(float);
|
int lookupTableSize = sizeof(float);
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (ivfpqConfig_.useFloat16LookupTables) {
|
if (ivfpqConfig_.useFloat16LookupTables) {
|
||||||
lookupTableSize = sizeof(half);
|
lookupTableSize = sizeof(half);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// 64 bytes per code is only supported with usage of float16, at 2^8
|
// 64 bytes per code is only supported with usage of float16, at 2^8
|
||||||
// codes per subquantizer
|
// codes per subquantizer
|
||||||
|
@ -262,11 +262,13 @@ void runSumAlongColumns(Tensor<float, 1, true>& input,
|
|||||||
runSumAlongColumns<float, float4>(input, output, stream);
|
runSumAlongColumns<float, float4>(input, output, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void runSumAlongColumns(Tensor<half, 1, true>& input,
|
void runSumAlongColumns(Tensor<half, 1, true>& input,
|
||||||
Tensor<half, 2, true>& output,
|
Tensor<half, 2, true>& output,
|
||||||
cudaStream_t stream) {
|
cudaStream_t stream) {
|
||||||
runSumAlongColumns<half, half2>(input, output, stream);
|
runSumAlongColumns<half, half2>(input, output, stream);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
template <typename T, typename TVec>
|
template <typename T, typename TVec>
|
||||||
void runAssignAlongColumns(Tensor<T, 1, true>& input,
|
void runAssignAlongColumns(Tensor<T, 1, true>& input,
|
||||||
@ -310,11 +312,13 @@ void runAssignAlongColumns(Tensor<float, 1, true>& input,
|
|||||||
runAssignAlongColumns<float, float4>(input, output, stream);
|
runAssignAlongColumns<float, float4>(input, output, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void runAssignAlongColumns(Tensor<half, 1, true>& input,
|
void runAssignAlongColumns(Tensor<half, 1, true>& input,
|
||||||
Tensor<half, 2, true>& output,
|
Tensor<half, 2, true>& output,
|
||||||
cudaStream_t stream) {
|
cudaStream_t stream) {
|
||||||
runAssignAlongColumns<half, half2>(input, output, stream);
|
runAssignAlongColumns<half, half2>(input, output, stream);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void runSumAlongRows(Tensor<T, 1, true>& input,
|
void runSumAlongRows(Tensor<T, 1, true>& input,
|
||||||
@ -344,11 +348,13 @@ void runSumAlongRows(Tensor<float, 1, true>& input,
|
|||||||
runSumAlongRows<float>(input, output, zeroClamp, stream);
|
runSumAlongRows<float>(input, output, zeroClamp, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void runSumAlongRows(Tensor<half, 1, true>& input,
|
void runSumAlongRows(Tensor<half, 1, true>& input,
|
||||||
Tensor<half, 2, true>& output,
|
Tensor<half, 2, true>& output,
|
||||||
bool zeroClamp,
|
bool zeroClamp,
|
||||||
cudaStream_t stream) {
|
cudaStream_t stream) {
|
||||||
runSumAlongRows<half>(input, output, zeroClamp, stream);
|
runSumAlongRows<half>(input, output, zeroClamp, stream);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -17,18 +17,22 @@ void runSumAlongColumns(Tensor<float, 1, true>& input,
|
|||||||
Tensor<float, 2, true>& output,
|
Tensor<float, 2, true>& output,
|
||||||
cudaStream_t stream);
|
cudaStream_t stream);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void runSumAlongColumns(Tensor<half, 1, true>& input,
|
void runSumAlongColumns(Tensor<half, 1, true>& input,
|
||||||
Tensor<half, 2, true>& output,
|
Tensor<half, 2, true>& output,
|
||||||
cudaStream_t stream);
|
cudaStream_t stream);
|
||||||
|
#endif
|
||||||
|
|
||||||
// output[x][i] = input[i] for all x
|
// output[x][i] = input[i] for all x
|
||||||
void runAssignAlongColumns(Tensor<float, 1, true>& input,
|
void runAssignAlongColumns(Tensor<float, 1, true>& input,
|
||||||
Tensor<float, 2, true>& output,
|
Tensor<float, 2, true>& output,
|
||||||
cudaStream_t stream);
|
cudaStream_t stream);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void runAssignAlongColumns(Tensor<half, 1, true>& input,
|
void runAssignAlongColumns(Tensor<half, 1, true>& input,
|
||||||
Tensor<half, 2, true>& output,
|
Tensor<half, 2, true>& output,
|
||||||
cudaStream_t stream);
|
cudaStream_t stream);
|
||||||
|
#endif
|
||||||
|
|
||||||
// output[i][x] += input[i] for all x
|
// output[i][x] += input[i] for all x
|
||||||
// If zeroClamp, output[i][x] = max(output[i][x] + input[i], 0) for all x
|
// If zeroClamp, output[i][x] = max(output[i][x] + input[i], 0) for all x
|
||||||
@ -37,9 +41,11 @@ void runSumAlongRows(Tensor<float, 1, true>& input,
|
|||||||
bool zeroClamp,
|
bool zeroClamp,
|
||||||
cudaStream_t stream);
|
cudaStream_t stream);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void runSumAlongRows(Tensor<half, 1, true>& input,
|
void runSumAlongRows(Tensor<half, 1, true>& input,
|
||||||
Tensor<half, 2, true>& output,
|
Tensor<half, 2, true>& output,
|
||||||
bool zeroClamp,
|
bool zeroClamp,
|
||||||
cudaStream_t stream);
|
cudaStream_t stream);
|
||||||
|
|
||||||
|
#endif
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -370,6 +370,7 @@ runIPDistance(GpuResources* resources,
|
|||||||
outIndices);
|
outIndices);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void
|
void
|
||||||
runIPDistance(GpuResources* resources,
|
runIPDistance(GpuResources* resources,
|
||||||
Tensor<half, 2, true>& vectors,
|
Tensor<half, 2, true>& vectors,
|
||||||
@ -390,6 +391,7 @@ runIPDistance(GpuResources* resources,
|
|||||||
outDistances,
|
outDistances,
|
||||||
outIndices);
|
outIndices);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void
|
void
|
||||||
runL2Distance(GpuResources* resources,
|
runL2Distance(GpuResources* resources,
|
||||||
@ -416,6 +418,7 @@ runL2Distance(GpuResources* resources,
|
|||||||
ignoreOutDistances);
|
ignoreOutDistances);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void
|
void
|
||||||
runL2Distance(GpuResources* resources,
|
runL2Distance(GpuResources* resources,
|
||||||
Tensor<half, 2, true>& vectors,
|
Tensor<half, 2, true>& vectors,
|
||||||
@ -440,5 +443,6 @@ runL2Distance(GpuResources* resources,
|
|||||||
outIndices,
|
outIndices,
|
||||||
ignoreOutDistances);
|
ignoreOutDistances);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -57,6 +57,7 @@ void runIPDistance(GpuResources* resources,
|
|||||||
Tensor<float, 2, true>& outDistances,
|
Tensor<float, 2, true>& outDistances,
|
||||||
Tensor<int, 2, true>& outIndices);
|
Tensor<int, 2, true>& outIndices);
|
||||||
|
|
||||||
|
|
||||||
void runL2Distance(GpuResources* resources,
|
void runL2Distance(GpuResources* resources,
|
||||||
Tensor<half, 2, true>& vectors,
|
Tensor<half, 2, true>& vectors,
|
||||||
bool vectorsRowMajor,
|
bool vectorsRowMajor,
|
||||||
|
@ -29,6 +29,9 @@ FlatIndex::FlatIndex(GpuResources* res,
|
|||||||
space_(space),
|
space_(space),
|
||||||
num_(0),
|
num_(0),
|
||||||
rawData_(space) {
|
rawData_(space) {
|
||||||
|
#ifndef FAISS_USE_FLOAT16
|
||||||
|
FAISS_ASSERT(!useFloat16_);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
@ -38,28 +41,40 @@ FlatIndex::getUseFloat16() const {
|
|||||||
|
|
||||||
/// Returns the number of vectors we contain
|
/// Returns the number of vectors we contain
|
||||||
int FlatIndex::getSize() const {
|
int FlatIndex::getSize() const {
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16_) {
|
if (useFloat16_) {
|
||||||
return vectorsHalf_.getSize(0);
|
return vectorsHalf_.getSize(0);
|
||||||
} else {
|
} else {
|
||||||
return vectors_.getSize(0);
|
return vectors_.getSize(0);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
return vectors_.getSize(0);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
int FlatIndex::getDim() const {
|
int FlatIndex::getDim() const {
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16_) {
|
if (useFloat16_) {
|
||||||
return vectorsHalf_.getSize(1);
|
return vectorsHalf_.getSize(1);
|
||||||
} else {
|
} else {
|
||||||
return vectors_.getSize(1);
|
return vectors_.getSize(1);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
return vectors_.getSize(1);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
FlatIndex::reserve(size_t numVecs, cudaStream_t stream) {
|
FlatIndex::reserve(size_t numVecs, cudaStream_t stream) {
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16_) {
|
if (useFloat16_) {
|
||||||
rawData_.reserve(numVecs * dim_ * sizeof(half), stream);
|
rawData_.reserve(numVecs * dim_ * sizeof(half), stream);
|
||||||
} else {
|
} else {
|
||||||
rawData_.reserve(numVecs * dim_ * sizeof(float), stream);
|
rawData_.reserve(numVecs * dim_ * sizeof(float), stream);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
rawData_.reserve(numVecs * dim_ * sizeof(float), stream);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@ -70,6 +85,7 @@ FlatIndex::getVectorsRef<float>() {
|
|||||||
return getVectorsFloat32Ref();
|
return getVectorsFloat32Ref();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
template <>
|
template <>
|
||||||
Tensor<half, 2, true>&
|
Tensor<half, 2, true>&
|
||||||
FlatIndex::getVectorsRef<half>() {
|
FlatIndex::getVectorsRef<half>() {
|
||||||
@ -77,6 +93,7 @@ FlatIndex::getVectorsRef<half>() {
|
|||||||
FAISS_ASSERT(useFloat16_);
|
FAISS_ASSERT(useFloat16_);
|
||||||
return getVectorsFloat16Ref();
|
return getVectorsFloat16Ref();
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
Tensor<float, 2, true>&
|
Tensor<float, 2, true>&
|
||||||
FlatIndex::getVectorsFloat32Ref() {
|
FlatIndex::getVectorsFloat32Ref() {
|
||||||
@ -86,6 +103,7 @@ FlatIndex::getVectorsFloat32Ref() {
|
|||||||
return vectors_;
|
return vectors_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
Tensor<half, 2, true>&
|
Tensor<half, 2, true>&
|
||||||
FlatIndex::getVectorsFloat16Ref() {
|
FlatIndex::getVectorsFloat16Ref() {
|
||||||
// Should not call this unless we are in float16 mode
|
// Should not call this unless we are in float16 mode
|
||||||
@ -93,6 +111,7 @@ FlatIndex::getVectorsFloat16Ref() {
|
|||||||
|
|
||||||
return vectorsHalf_;
|
return vectorsHalf_;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
DeviceTensor<float, 2, true>
|
DeviceTensor<float, 2, true>
|
||||||
FlatIndex::getVectorsFloat32Copy(cudaStream_t stream) {
|
FlatIndex::getVectorsFloat32Copy(cudaStream_t stream) {
|
||||||
@ -103,12 +122,16 @@ DeviceTensor<float, 2, true>
|
|||||||
FlatIndex::getVectorsFloat32Copy(int from, int num, cudaStream_t stream) {
|
FlatIndex::getVectorsFloat32Copy(int from, int num, cudaStream_t stream) {
|
||||||
DeviceTensor<float, 2, true> vecFloat32({num, dim_}, space_);
|
DeviceTensor<float, 2, true> vecFloat32({num, dim_}, space_);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16_) {
|
if (useFloat16_) {
|
||||||
auto halfNarrow = vectorsHalf_.narrowOutermost(from, num);
|
auto halfNarrow = vectorsHalf_.narrowOutermost(from, num);
|
||||||
convertTensor<half, float, 2>(stream, halfNarrow, vecFloat32);
|
convertTensor<half, float, 2>(stream, halfNarrow, vecFloat32);
|
||||||
} else {
|
} else {
|
||||||
vectors_.copyTo(vecFloat32, stream);
|
vectors_.copyTo(vecFloat32, stream);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
vectors_.copyTo(vecFloat32, stream);
|
||||||
|
#endif
|
||||||
|
|
||||||
return vecFloat32;
|
return vecFloat32;
|
||||||
}
|
}
|
||||||
@ -125,13 +148,16 @@ FlatIndex::query(Tensor<float, 2, true>& input,
|
|||||||
auto stream = resources_->getDefaultStreamCurrentDevice();
|
auto stream = resources_->getDefaultStreamCurrentDevice();
|
||||||
auto& mem = resources_->getMemoryManagerCurrentDevice();
|
auto& mem = resources_->getMemoryManagerCurrentDevice();
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16_) {
|
if (useFloat16_) {
|
||||||
// We need to convert the input to float16 for comparison to ourselves
|
// We need to convert the input to float16 for comparison to ourselves
|
||||||
|
|
||||||
auto inputHalf =
|
auto inputHalf =
|
||||||
convertTensor<float, half, 2>(resources_, stream, input);
|
convertTensor<float, half, 2>(resources_, stream, input);
|
||||||
|
|
||||||
query(inputHalf, bitset, k, metric, metricArg,
|
query(inputHalf, bitset, k, metric, metricArg,
|
||||||
outDistances, outIndices, exactDistance);
|
outDistances, outIndices, exactDistance);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
bfKnnOnDevice(resources_,
|
bfKnnOnDevice(resources_,
|
||||||
getCurrentDevice(),
|
getCurrentDevice(),
|
||||||
@ -149,8 +175,26 @@ FlatIndex::query(Tensor<float, 2, true>& input,
|
|||||||
outIndices,
|
outIndices,
|
||||||
!exactDistance);
|
!exactDistance);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
bfKnnOnDevice(resources_,
|
||||||
|
getCurrentDevice(),
|
||||||
|
stream,
|
||||||
|
storeTransposed_ ? vectorsTransposed_ : vectors_,
|
||||||
|
!storeTransposed_, // is vectors row major?
|
||||||
|
&norms_,
|
||||||
|
input,
|
||||||
|
true, // input is row major
|
||||||
|
bitset,
|
||||||
|
k,
|
||||||
|
metric,
|
||||||
|
metricArg,
|
||||||
|
outDistances,
|
||||||
|
outIndices,
|
||||||
|
!exactDistance);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void
|
void
|
||||||
FlatIndex::query(Tensor<half, 2, true>& input,
|
FlatIndex::query(Tensor<half, 2, true>& input,
|
||||||
Tensor<uint8_t, 1, true>& bitset,
|
Tensor<uint8_t, 1, true>& bitset,
|
||||||
@ -178,11 +222,13 @@ FlatIndex::query(Tensor<half, 2, true>& input,
|
|||||||
outIndices,
|
outIndices,
|
||||||
!exactDistance);
|
!exactDistance);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void
|
void
|
||||||
FlatIndex::computeResidual(Tensor<float, 2, true>& vecs,
|
FlatIndex::computeResidual(Tensor<float, 2, true>& vecs,
|
||||||
Tensor<int, 1, true>& listIds,
|
Tensor<int, 1, true>& listIds,
|
||||||
Tensor<float, 2, true>& residuals) {
|
Tensor<float, 2, true>& residuals) {
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16_) {
|
if (useFloat16_) {
|
||||||
runCalcResidual(vecs,
|
runCalcResidual(vecs,
|
||||||
getVectorsFloat16Ref(),
|
getVectorsFloat16Ref(),
|
||||||
@ -196,11 +242,19 @@ FlatIndex::computeResidual(Tensor<float, 2, true>& vecs,
|
|||||||
residuals,
|
residuals,
|
||||||
resources_->getDefaultStreamCurrentDevice());
|
resources_->getDefaultStreamCurrentDevice());
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
runCalcResidual(vecs,
|
||||||
|
getVectorsFloat32Ref(),
|
||||||
|
listIds,
|
||||||
|
residuals,
|
||||||
|
resources_->getDefaultStreamCurrentDevice());
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
FlatIndex::reconstruct(Tensor<int, 1, true>& listIds,
|
FlatIndex::reconstruct(Tensor<int, 1, true>& listIds,
|
||||||
Tensor<float, 2, true>& vecs) {
|
Tensor<float, 2, true>& vecs) {
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16_) {
|
if (useFloat16_) {
|
||||||
runReconstruct(listIds,
|
runReconstruct(listIds,
|
||||||
getVectorsFloat16Ref(),
|
getVectorsFloat16Ref(),
|
||||||
@ -212,8 +266,13 @@ FlatIndex::reconstruct(Tensor<int, 1, true>& listIds,
|
|||||||
vecs,
|
vecs,
|
||||||
resources_->getDefaultStreamCurrentDevice());
|
resources_->getDefaultStreamCurrentDevice());
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
runReconstruct(listIds,
|
||||||
|
getVectorsFloat32Ref(),
|
||||||
|
vecs,
|
||||||
|
resources_->getDefaultStreamCurrentDevice());
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
FlatIndex::reconstruct(Tensor<int, 2, true>& listIds,
|
FlatIndex::reconstruct(Tensor<int, 2, true>& listIds,
|
||||||
Tensor<float, 3, true>& vecs) {
|
Tensor<float, 3, true>& vecs) {
|
||||||
@ -229,6 +288,7 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16_) {
|
if (useFloat16_) {
|
||||||
// Make sure that `data` is on our device; we'll run the
|
// Make sure that `data` is on our device; we'll run the
|
||||||
// conversion on our device
|
// conversion on our device
|
||||||
@ -252,8 +312,15 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
|
|||||||
true /* reserve exactly */);
|
true /* reserve exactly */);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
rawData_.append((char*) data,
|
||||||
|
(size_t) dim_ * numVecs * sizeof(float),
|
||||||
|
stream,
|
||||||
|
true /* reserve exactly */);
|
||||||
|
#endif
|
||||||
num_ += numVecs;
|
num_ += numVecs;
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16_) {
|
if (useFloat16_) {
|
||||||
DeviceTensor<half, 2, true> vectorsHalf(
|
DeviceTensor<half, 2, true> vectorsHalf(
|
||||||
(half*) rawData_.data(), {(int) num_, dim_}, space_);
|
(half*) rawData_.data(), {(int) num_, dim_}, space_);
|
||||||
@ -263,8 +330,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
|
|||||||
(float*) rawData_.data(), {(int) num_, dim_}, space_);
|
(float*) rawData_.data(), {(int) num_, dim_}, space_);
|
||||||
vectors_ = std::move(vectors);
|
vectors_ = std::move(vectors);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
DeviceTensor<float, 2, true> vectors(
|
||||||
|
(float*) rawData_.data(), {(int) num_, dim_}, space_);
|
||||||
|
vectors_ = std::move(vectors);
|
||||||
|
#endif
|
||||||
|
|
||||||
if (storeTransposed_) {
|
if (storeTransposed_) {
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16_) {
|
if (useFloat16_) {
|
||||||
vectorsHalfTransposed_ =
|
vectorsHalfTransposed_ =
|
||||||
std::move(DeviceTensor<half, 2, true>({dim_, (int) num_}, space_));
|
std::move(DeviceTensor<half, 2, true>({dim_, (int) num_}, space_));
|
||||||
@ -274,9 +347,15 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
|
|||||||
std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}, space_));
|
std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}, space_));
|
||||||
runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
|
runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
vectorsTransposed_ =
|
||||||
|
std::move(DeviceTensor<float, 2, true>({dim_, (int) num_}, space_));
|
||||||
|
runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// Precompute L2 norms of our database
|
// Precompute L2 norms of our database
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16_) {
|
if (useFloat16_) {
|
||||||
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
|
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
|
||||||
runL2Norm(vectorsHalf_, true, norms, true, stream);
|
runL2Norm(vectorsHalf_, true, norms, true, stream);
|
||||||
@ -286,6 +365,11 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) {
|
|||||||
runL2Norm(vectors_, true, norms, true, stream);
|
runL2Norm(vectors_, true, norms, true, stream);
|
||||||
norms_ = std::move(norms);
|
norms_ = std::move(norms);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
DeviceTensor<float, 1, true> norms({(int) num_}, space_);
|
||||||
|
runL2Norm(vectors_, true, norms, true, stream);
|
||||||
|
norms_ = std::move(norms);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@ -293,8 +377,10 @@ FlatIndex::reset() {
|
|||||||
rawData_.clear();
|
rawData_.clear();
|
||||||
vectors_ = std::move(DeviceTensor<float, 2, true>());
|
vectors_ = std::move(DeviceTensor<float, 2, true>());
|
||||||
vectorsTransposed_ = std::move(DeviceTensor<float, 2, true>());
|
vectorsTransposed_ = std::move(DeviceTensor<float, 2, true>());
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
vectorsHalf_ = std::move(DeviceTensor<half, 2, true>());
|
vectorsHalf_ = std::move(DeviceTensor<half, 2, true>());
|
||||||
vectorsHalfTransposed_ = std::move(DeviceTensor<half, 2, true>());
|
vectorsHalfTransposed_ = std::move(DeviceTensor<half, 2, true>());
|
||||||
|
#endif
|
||||||
norms_ = std::move(DeviceTensor<float, 1, true>());
|
norms_ = std::move(DeviceTensor<float, 1, true>());
|
||||||
num_ = 0;
|
num_ = 0;
|
||||||
}
|
}
|
||||||
|
@ -47,7 +47,9 @@ class FlatIndex {
|
|||||||
Tensor<float, 2, true>& getVectorsFloat32Ref();
|
Tensor<float, 2, true>& getVectorsFloat32Ref();
|
||||||
|
|
||||||
/// Returns a reference to our vectors currently in use (useFloat16 mode)
|
/// Returns a reference to our vectors currently in use (useFloat16 mode)
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
Tensor<half, 2, true>& getVectorsFloat16Ref();
|
Tensor<half, 2, true>& getVectorsFloat16Ref();
|
||||||
|
#endif
|
||||||
|
|
||||||
/// Performs a copy of the vectors on the given device, converting
|
/// Performs a copy of the vectors on the given device, converting
|
||||||
/// as needed from float16
|
/// as needed from float16
|
||||||
@ -67,6 +69,7 @@ class FlatIndex {
|
|||||||
Tensor<int, 2, true>& outIndices,
|
Tensor<int, 2, true>& outIndices,
|
||||||
bool exactDistance);
|
bool exactDistance);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void query(Tensor<half, 2, true>& vecs,
|
void query(Tensor<half, 2, true>& vecs,
|
||||||
Tensor<uint8_t, 1, true>& bitset,
|
Tensor<uint8_t, 1, true>& bitset,
|
||||||
int k,
|
int k,
|
||||||
@ -75,6 +78,7 @@ class FlatIndex {
|
|||||||
Tensor<float, 2, true>& outDistances,
|
Tensor<float, 2, true>& outDistances,
|
||||||
Tensor<int, 2, true>& outIndices,
|
Tensor<int, 2, true>& outIndices,
|
||||||
bool exactDistance);
|
bool exactDistance);
|
||||||
|
#endif
|
||||||
|
|
||||||
/// Compute residual for set of vectors
|
/// Compute residual for set of vectors
|
||||||
void computeResidual(Tensor<float, 2, true>& vecs,
|
void computeResidual(Tensor<float, 2, true>& vecs,
|
||||||
@ -123,8 +127,10 @@ class FlatIndex {
|
|||||||
DeviceTensor<float, 2, true> vectorsTransposed_;
|
DeviceTensor<float, 2, true> vectorsTransposed_;
|
||||||
|
|
||||||
/// Vectors currently in rawData_, float16 form
|
/// Vectors currently in rawData_, float16 form
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
DeviceTensor<half, 2, true> vectorsHalf_;
|
DeviceTensor<half, 2, true> vectorsHalf_;
|
||||||
DeviceTensor<half, 2, true> vectorsHalfTransposed_;
|
DeviceTensor<half, 2, true> vectorsHalfTransposed_;
|
||||||
|
#endif
|
||||||
|
|
||||||
/// Precomputed L2 norms
|
/// Precomputed L2 norms
|
||||||
DeviceTensor<float, 1, true> norms_;
|
DeviceTensor<float, 1, true> norms_;
|
||||||
|
@ -60,6 +60,10 @@ IVFPQ::IVFPQ(GpuResources* resources,
|
|||||||
FAISS_ASSERT(dim_ % numSubQuantizers_ == 0);
|
FAISS_ASSERT(dim_ % numSubQuantizers_ == 0);
|
||||||
FAISS_ASSERT(isSupportedPQCodeLength(bytesPerVector_));
|
FAISS_ASSERT(isSupportedPQCodeLength(bytesPerVector_));
|
||||||
|
|
||||||
|
#ifndef FAISS_USE_FLOAT16
|
||||||
|
FAISS_ASSERT(!useFloat16LookupTables_);
|
||||||
|
#endif
|
||||||
|
|
||||||
setPQCentroids_(pqCentroidData);
|
setPQCentroids_(pqCentroidData);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -112,7 +116,9 @@ IVFPQ::setPrecomputedCodes(bool enable) {
|
|||||||
} else {
|
} else {
|
||||||
// Clear out old precomputed code data
|
// Clear out old precomputed code data
|
||||||
precomputedCode_ = std::move(DeviceTensor<float, 3, true>());
|
precomputedCode_ = std::move(DeviceTensor<float, 3, true>());
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
precomputedCodeHalf_ = std::move(DeviceTensor<half, 3, true>());
|
precomputedCodeHalf_ = std::move(DeviceTensor<half, 3, true>());
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -156,6 +162,7 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
|
|||||||
DeviceTensor<float, 2, true> residuals(
|
DeviceTensor<float, 2, true> residuals(
|
||||||
mem, {vecs.getSize(0), vecs.getSize(1)}, stream);
|
mem, {vecs.getSize(0), vecs.getSize(1)}, stream);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (quantizer_->getUseFloat16()) {
|
if (quantizer_->getUseFloat16()) {
|
||||||
auto& coarseCentroids = quantizer_->getVectorsFloat16Ref();
|
auto& coarseCentroids = quantizer_->getVectorsFloat16Ref();
|
||||||
runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
|
runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
|
||||||
@ -163,6 +170,10 @@ IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
|
|||||||
auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
|
auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
|
||||||
runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
|
runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
|
||||||
|
runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
|
||||||
|
#endif
|
||||||
|
|
||||||
// Residuals are in the form
|
// Residuals are in the form
|
||||||
// (vec x numSubQuantizer x dimPerSubQuantizer)
|
// (vec x numSubQuantizer x dimPerSubQuantizer)
|
||||||
@ -519,6 +530,7 @@ IVFPQ::precomputeCodesT_() {
|
|||||||
|
|
||||||
// We added into the view, so `coarsePQProductTransposed` is now our
|
// We added into the view, so `coarsePQProductTransposed` is now our
|
||||||
// precomputed term 2.
|
// precomputed term 2.
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16LookupTables_) {
|
if (useFloat16LookupTables_) {
|
||||||
precomputedCodeHalf_ =
|
precomputedCodeHalf_ =
|
||||||
convertTensor<float, half, 3>(resources_,
|
convertTensor<float, half, 3>(resources_,
|
||||||
@ -527,15 +539,23 @@ IVFPQ::precomputeCodesT_() {
|
|||||||
} else {
|
} else {
|
||||||
precomputedCode_ = std::move(coarsePQProductTransposed);
|
precomputedCode_ = std::move(coarsePQProductTransposed);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
precomputedCode_ = std::move(coarsePQProductTransposed);
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
IVFPQ::precomputeCodes_() {
|
IVFPQ::precomputeCodes_() {
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (quantizer_->getUseFloat16()) {
|
if (quantizer_->getUseFloat16()) {
|
||||||
precomputeCodesT_<half>();
|
precomputeCodesT_<half>();
|
||||||
} else {
|
} else {
|
||||||
precomputeCodesT_<float>();
|
precomputeCodesT_<float>();
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
precomputeCodesT_<float>();
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@ -678,6 +698,7 @@ IVFPQ::runPQPrecomputedCodes_(
|
|||||||
|
|
||||||
NoTypeTensor<3, true> term2;
|
NoTypeTensor<3, true> term2;
|
||||||
NoTypeTensor<3, true> term3;
|
NoTypeTensor<3, true> term3;
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
DeviceTensor<half, 3, true> term3Half;
|
DeviceTensor<half, 3, true> term3Half;
|
||||||
|
|
||||||
if (useFloat16LookupTables_) {
|
if (useFloat16LookupTables_) {
|
||||||
@ -686,7 +707,10 @@ IVFPQ::runPQPrecomputedCodes_(
|
|||||||
|
|
||||||
term2 = NoTypeTensor<3, true>(precomputedCodeHalf_);
|
term2 = NoTypeTensor<3, true>(precomputedCodeHalf_);
|
||||||
term3 = NoTypeTensor<3, true>(term3Half);
|
term3 = NoTypeTensor<3, true>(term3Half);
|
||||||
} else {
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (!useFloat16LookupTables_) {
|
||||||
term2 = NoTypeTensor<3, true>(precomputedCode_);
|
term2 = NoTypeTensor<3, true>(precomputedCode_);
|
||||||
term3 = NoTypeTensor<3, true>(term3Transposed);
|
term3 = NoTypeTensor<3, true>(term3Transposed);
|
||||||
}
|
}
|
||||||
@ -754,6 +778,7 @@ IVFPQ::runPQNoPrecomputedCodes_(
|
|||||||
int k,
|
int k,
|
||||||
Tensor<float, 2, true>& outDistances,
|
Tensor<float, 2, true>& outDistances,
|
||||||
Tensor<long, 2, true>& outIndices) {
|
Tensor<long, 2, true>& outIndices) {
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (quantizer_->getUseFloat16()) {
|
if (quantizer_->getUseFloat16()) {
|
||||||
runPQNoPrecomputedCodesT_<half>(queries,
|
runPQNoPrecomputedCodesT_<half>(queries,
|
||||||
bitset,
|
bitset,
|
||||||
@ -770,7 +795,17 @@ IVFPQ::runPQNoPrecomputedCodes_(
|
|||||||
k,
|
k,
|
||||||
outDistances,
|
outDistances,
|
||||||
outIndices);
|
outIndices);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
runPQNoPrecomputedCodesT_<float>(queries,
|
||||||
|
bitset,
|
||||||
|
coarseDistances,
|
||||||
|
coarseIndices,
|
||||||
|
k,
|
||||||
|
outDistances,
|
||||||
|
outIndices);
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -153,7 +153,9 @@ class IVFPQ : public IVFBase {
|
|||||||
DeviceTensor<float, 3, true> precomputedCode_;
|
DeviceTensor<float, 3, true> precomputedCode_;
|
||||||
|
|
||||||
/// Precomputed term 2 in half form
|
/// Precomputed term 2 in half form
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
DeviceTensor<half, 3, true> precomputedCodeHalf_;
|
DeviceTensor<half, 3, true> precomputedCodeHalf_;
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -309,6 +309,7 @@ void runL2Norm(Tensor<float, 2, true>& input,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void runL2Norm(Tensor<half, 2, true>& input,
|
void runL2Norm(Tensor<half, 2, true>& input,
|
||||||
bool inputRowMajor,
|
bool inputRowMajor,
|
||||||
Tensor<float, 1, true>& output,
|
Tensor<float, 1, true>& output,
|
||||||
@ -325,5 +326,6 @@ void runL2Norm(Tensor<half, 2, true>& input,
|
|||||||
inputCast, inputRowMajor, outputCast, normSquared, stream);
|
inputCast, inputRowMajor, outputCast, normSquared, stream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -18,10 +18,12 @@ void runL2Norm(Tensor<float, 2, true>& input,
|
|||||||
bool normSquared,
|
bool normSquared,
|
||||||
cudaStream_t stream);
|
cudaStream_t stream);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void runL2Norm(Tensor<half, 2, true>& input,
|
void runL2Norm(Tensor<half, 2, true>& input,
|
||||||
bool inputRowMajor,
|
bool inputRowMajor,
|
||||||
Tensor<float, 1, true>& output,
|
Tensor<float, 1, true>& output,
|
||||||
bool normSquared,
|
bool normSquared,
|
||||||
cudaStream_t stream);
|
cudaStream_t stream);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -438,6 +438,7 @@ runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
|
|||||||
|
|
||||||
runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream);
|
runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16Lookup) {
|
if (useFloat16Lookup) {
|
||||||
// Need to convert back
|
// Need to convert back
|
||||||
auto outCodeDistancesH = outCodeDistances.toTensor<half>();
|
auto outCodeDistancesH = outCodeDistances.toTensor<half>();
|
||||||
@ -445,6 +446,7 @@ runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
|
|||||||
outCodeDistancesF,
|
outCodeDistancesF,
|
||||||
outCodeDistancesH);
|
outCodeDistancesH);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename CentroidT>
|
template <typename CentroidT>
|
||||||
@ -477,6 +479,7 @@ runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
|
|||||||
auto smem = (3 * dimsPerSubQuantizer) * sizeof(float)
|
auto smem = (3 * dimsPerSubQuantizer) * sizeof(float)
|
||||||
+ topQueryToCentroid.getSize(1) * sizeof(int);
|
+ topQueryToCentroid.getSize(1) * sizeof(int);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
#define RUN_CODE(DIMS, L2) \
|
#define RUN_CODE(DIMS, L2) \
|
||||||
do { \
|
do { \
|
||||||
if (useFloat16Lookup) { \
|
if (useFloat16Lookup) { \
|
||||||
@ -495,6 +498,16 @@ runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
|
|||||||
topQueryToCentroid, outCodeDistancesT); \
|
topQueryToCentroid, outCodeDistancesT); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
#else
|
||||||
|
#define RUN_CODE(DIMS, L2) \
|
||||||
|
do { \
|
||||||
|
auto outCodeDistancesT = outCodeDistances.toTensor<float>(); \
|
||||||
|
pqCodeDistances<float, CentroidT, DIMS, L2><<<grid, block, smem, stream>>>( \
|
||||||
|
queries, kQueriesPerBlock, \
|
||||||
|
coarseCentroids, pqCentroids, \
|
||||||
|
topQueryToCentroid, outCodeDistancesT); \
|
||||||
|
} while (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
#define CODE_L2(DIMS) \
|
#define CODE_L2(DIMS) \
|
||||||
do { \
|
do { \
|
||||||
|
@ -26,10 +26,12 @@ template <typename T>
|
|||||||
struct Converter {
|
struct Converter {
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
template <>
|
template <>
|
||||||
struct Converter<half> {
|
struct Converter<half> {
|
||||||
inline static __device__ half to(float v) { return __float2half(v); }
|
inline static __device__ half to(float v) { return __float2half(v); }
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct Converter<float> {
|
struct Converter<float> {
|
||||||
@ -394,6 +396,7 @@ runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
|
|||||||
Tensor<float, 4, true> outCodeDistancesF;
|
Tensor<float, 4, true> outCodeDistancesF;
|
||||||
DeviceTensor<float, 4, true> outCodeDistancesFloatMem;
|
DeviceTensor<float, 4, true> outCodeDistancesFloatMem;
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16Lookup) {
|
if (useFloat16Lookup) {
|
||||||
outCodeDistancesFloatMem = DeviceTensor<float, 4, true>(
|
outCodeDistancesFloatMem = DeviceTensor<float, 4, true>(
|
||||||
mem, {outCodeDistances.getSize(0),
|
mem, {outCodeDistances.getSize(0),
|
||||||
@ -406,6 +409,9 @@ runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
|
|||||||
} else {
|
} else {
|
||||||
outCodeDistancesF = outCodeDistances.toTensor<float>();
|
outCodeDistancesF = outCodeDistances.toTensor<float>();
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
outCodeDistancesF = outCodeDistances.toTensor<float>();
|
||||||
|
#endif
|
||||||
|
|
||||||
// Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which
|
// Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which
|
||||||
// is where we build our output distances)
|
// is where we build our output distances)
|
||||||
@ -445,6 +451,7 @@ runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
|
|||||||
|
|
||||||
runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream);
|
runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
if (useFloat16Lookup) {
|
if (useFloat16Lookup) {
|
||||||
// Need to convert back
|
// Need to convert back
|
||||||
auto outCodeDistancesH = outCodeDistances.toTensor<half>();
|
auto outCodeDistancesH = outCodeDistances.toTensor<half>();
|
||||||
@ -452,6 +459,7 @@ runPQCodeDistancesMM(Tensor<float, 3, true>& pqCentroids,
|
|||||||
outCodeDistancesF,
|
outCodeDistancesF,
|
||||||
outCodeDistancesH);
|
outCodeDistancesH);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
@ -483,6 +491,7 @@ runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
|
|||||||
auto smem = (3 * dimsPerSubQuantizer) * sizeof(float)
|
auto smem = (3 * dimsPerSubQuantizer) * sizeof(float)
|
||||||
+ topQueryToCentroid.getSize(1) * sizeof(int);
|
+ topQueryToCentroid.getSize(1) * sizeof(int);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
#define RUN_CODE(DIMS, L2) \
|
#define RUN_CODE(DIMS, L2) \
|
||||||
do { \
|
do { \
|
||||||
if (useFloat16Lookup) { \
|
if (useFloat16Lookup) { \
|
||||||
@ -492,7 +501,19 @@ runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
|
|||||||
queries, kQueriesPerBlock, \
|
queries, kQueriesPerBlock, \
|
||||||
coarseCentroids, pqCentroids, \
|
coarseCentroids, pqCentroids, \
|
||||||
topQueryToCentroid, outCodeDistancesT); \
|
topQueryToCentroid, outCodeDistancesT); \
|
||||||
} else { \
|
} else { \
|
||||||
|
auto outCodeDistancesT = outCodeDistances.toTensor<float>(); \
|
||||||
|
\
|
||||||
|
pqCodeDistances<float, DIMS, L2><<<grid, block, smem, stream>>>( \
|
||||||
|
queries, kQueriesPerBlock, \
|
||||||
|
coarseCentroids, pqCentroids, \
|
||||||
|
topQueryToCentroid, outCodeDistancesT); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
#else
|
||||||
|
#define RUN_CODE(DIMS, L2) \
|
||||||
|
do { \
|
||||||
|
if(!useFloat16Lookup){ \
|
||||||
auto outCodeDistancesT = outCodeDistances.toTensor<float>(); \
|
auto outCodeDistancesT = outCodeDistances.toTensor<float>(); \
|
||||||
\
|
\
|
||||||
pqCodeDistances<float, DIMS, L2><<<grid, block, smem, stream>>>( \
|
pqCodeDistances<float, DIMS, L2><<<grid, block, smem, stream>>>( \
|
||||||
@ -501,6 +522,7 @@ runPQCodeDistances(Tensor<float, 3, true>& pqCentroids,
|
|||||||
topQueryToCentroid, outCodeDistancesT); \
|
topQueryToCentroid, outCodeDistancesT); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
#define CODE_L2(DIMS) \
|
#define CODE_L2(DIMS) \
|
||||||
do { \
|
do { \
|
||||||
|
@ -275,7 +275,12 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||||||
auto block = dim3(kThreadsPerBlock);
|
auto block = dim3(kThreadsPerBlock);
|
||||||
|
|
||||||
// pq centroid distances
|
// pq centroid distances
|
||||||
auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float);
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
|
auto smem = (sizeof(float)== useFloat16Lookup) ? sizeof(half) : sizeof(float);
|
||||||
|
#else
|
||||||
|
auto smem = sizeof(float);
|
||||||
|
#endif
|
||||||
|
|
||||||
smem *= numSubQuantizers * numSubQuantizerCodes;
|
smem *= numSubQuantizers * numSubQuantizerCodes;
|
||||||
FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
|
FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
|
||||||
@ -296,6 +301,7 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||||||
allDistances); \
|
allDistances); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
#define RUN_PQ(NUM_SUB_Q) \
|
#define RUN_PQ(NUM_SUB_Q) \
|
||||||
do { \
|
do { \
|
||||||
if (useFloat16Lookup) { \
|
if (useFloat16Lookup) { \
|
||||||
@ -304,6 +310,12 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||||||
RUN_PQ_OPT(NUM_SUB_Q, float, float4); \
|
RUN_PQ_OPT(NUM_SUB_Q, float, float4); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
#else
|
||||||
|
#define RUN_PQ(NUM_SUB_Q) \
|
||||||
|
do { \
|
||||||
|
RUN_PQ_OPT(NUM_SUB_Q, float, float4); \
|
||||||
|
} while (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
switch (bytesPerCode) {
|
switch (bytesPerCode) {
|
||||||
case 1:
|
case 1:
|
||||||
@ -499,7 +511,12 @@ runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
|
|||||||
sizeof(int),
|
sizeof(int),
|
||||||
stream));
|
stream));
|
||||||
|
|
||||||
int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float);
|
int codeDistanceTypeSize = sizeof(float);
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
|
if (useFloat16Lookup) {
|
||||||
|
codeDistanceTypeSize = sizeof(half);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
int totalCodeDistancesSize =
|
int totalCodeDistancesSize =
|
||||||
queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes *
|
queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes *
|
||||||
|
@ -248,8 +248,11 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||||||
metric == MetricType::METRIC_L2);
|
metric == MetricType::METRIC_L2);
|
||||||
|
|
||||||
bool l2Distance = metric == MetricType::METRIC_L2;
|
bool l2Distance = metric == MetricType::METRIC_L2;
|
||||||
|
|
||||||
// Calculate offset lengths, so we know where to write out
|
// Calculate offset lengths, so we know where to write out
|
||||||
|
#ifndef FAISS_USE_FLOAT16
|
||||||
|
FAISS_ASSERT(!useFloat16Lookup);
|
||||||
|
#endif
|
||||||
|
// Calculate offset lengths, so we know where to write out
|
||||||
// intermediate results
|
// intermediate results
|
||||||
runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets,
|
runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets,
|
||||||
thrustMem, stream);
|
thrustMem, stream);
|
||||||
@ -275,7 +278,13 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||||||
auto block = dim3(kThreadsPerBlock);
|
auto block = dim3(kThreadsPerBlock);
|
||||||
|
|
||||||
// pq centroid distances
|
// pq centroid distances
|
||||||
auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float);
|
//auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float);
|
||||||
|
auto smem = sizeof(float);
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
|
if (useFloat16Lookup) {
|
||||||
|
smem = sizeof(half);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
smem *= numSubQuantizers * numSubQuantizerCodes;
|
smem *= numSubQuantizers * numSubQuantizerCodes;
|
||||||
FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
|
FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
|
||||||
@ -296,6 +305,7 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||||||
allDistances); \
|
allDistances); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
#define RUN_PQ(NUM_SUB_Q) \
|
#define RUN_PQ(NUM_SUB_Q) \
|
||||||
do { \
|
do { \
|
||||||
if (useFloat16Lookup) { \
|
if (useFloat16Lookup) { \
|
||||||
@ -304,6 +314,12 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||||||
RUN_PQ_OPT(NUM_SUB_Q, float, float4); \
|
RUN_PQ_OPT(NUM_SUB_Q, float, float4); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
#else
|
||||||
|
#define RUN_PQ(NUM_SUB_Q) \
|
||||||
|
do { \
|
||||||
|
RUN_PQ_OPT(NUM_SUB_Q, float, float4); \
|
||||||
|
} while (0)
|
||||||
|
#endif // FAISS_USE_FLOAT16
|
||||||
|
|
||||||
switch (bytesPerCode) {
|
switch (bytesPerCode) {
|
||||||
case 1:
|
case 1:
|
||||||
@ -497,7 +513,14 @@ void runPQScanMultiPassNoPrecomputed(Tensor<float, 2, true>& queries,
|
|||||||
sizeof(int),
|
sizeof(int),
|
||||||
stream));
|
stream));
|
||||||
|
|
||||||
int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float);
|
int codeDistanceTypeSize = sizeof(float);
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
|
if (useFloat16Lookup) {
|
||||||
|
codeDistanceTypeSize = sizeof(half);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
FAISS_ASSERT(!useFloat16Lookup);
|
||||||
|
#endif
|
||||||
|
|
||||||
int totalCodeDistancesSize =
|
int totalCodeDistancesSize =
|
||||||
queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes *
|
queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes *
|
||||||
|
@ -252,7 +252,12 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||||||
auto block = dim3(kThreadsPerBlock);
|
auto block = dim3(kThreadsPerBlock);
|
||||||
|
|
||||||
// pq precomputed terms (2 + 3)
|
// pq precomputed terms (2 + 3)
|
||||||
auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float);
|
auto smem = sizeof(float);
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
|
if (useFloat16Lookup) {
|
||||||
|
smem = sizeof(half);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
smem *= numSubQuantizers * numSubQuantizerCodes;
|
smem *= numSubQuantizers * numSubQuantizerCodes;
|
||||||
FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
|
FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice());
|
||||||
@ -275,6 +280,7 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||||||
allDistances); \
|
allDistances); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
#define RUN_PQ(NUM_SUB_Q) \
|
#define RUN_PQ(NUM_SUB_Q) \
|
||||||
do { \
|
do { \
|
||||||
if (useFloat16Lookup) { \
|
if (useFloat16Lookup) { \
|
||||||
@ -283,6 +289,12 @@ runMultiPassTile(Tensor<float, 2, true>& queries,
|
|||||||
RUN_PQ_OPT(NUM_SUB_Q, float, float4); \
|
RUN_PQ_OPT(NUM_SUB_Q, float, float4); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
#else
|
||||||
|
#define RUN_PQ(NUM_SUB_Q) \
|
||||||
|
do { \
|
||||||
|
RUN_PQ_OPT(NUM_SUB_Q, float, float4); \
|
||||||
|
} while (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
switch (bytesPerCode) {
|
switch (bytesPerCode) {
|
||||||
case 1:
|
case 1:
|
||||||
|
@ -119,6 +119,7 @@ void runCalcResidual(Tensor<float, 2, true>& vecs,
|
|||||||
calcResidual<float>(vecs, centroids, vecToCentroid, residuals, stream);
|
calcResidual<float>(vecs, centroids, vecToCentroid, residuals, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void runCalcResidual(Tensor<float, 2, true>& vecs,
|
void runCalcResidual(Tensor<float, 2, true>& vecs,
|
||||||
Tensor<half, 2, true>& centroids,
|
Tensor<half, 2, true>& centroids,
|
||||||
Tensor<int, 1, true>& vecToCentroid,
|
Tensor<int, 1, true>& vecToCentroid,
|
||||||
@ -126,6 +127,7 @@ void runCalcResidual(Tensor<float, 2, true>& vecs,
|
|||||||
cudaStream_t stream) {
|
cudaStream_t stream) {
|
||||||
calcResidual<half>(vecs, centroids, vecToCentroid, residuals, stream);
|
calcResidual<half>(vecs, centroids, vecToCentroid, residuals, stream);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void runReconstruct(Tensor<int, 1, true>& listIds,
|
void runReconstruct(Tensor<int, 1, true>& listIds,
|
||||||
Tensor<float, 2, true>& vecs,
|
Tensor<float, 2, true>& vecs,
|
||||||
@ -134,11 +136,13 @@ void runReconstruct(Tensor<int, 1, true>& listIds,
|
|||||||
gatherReconstruct<float>(listIds, vecs, out, stream);
|
gatherReconstruct<float>(listIds, vecs, out, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void runReconstruct(Tensor<int, 1, true>& listIds,
|
void runReconstruct(Tensor<int, 1, true>& listIds,
|
||||||
Tensor<half, 2, true>& vecs,
|
Tensor<half, 2, true>& vecs,
|
||||||
Tensor<float, 2, true>& out,
|
Tensor<float, 2, true>& out,
|
||||||
cudaStream_t stream) {
|
cudaStream_t stream) {
|
||||||
gatherReconstruct<half>(listIds, vecs, out, stream);
|
gatherReconstruct<half>(listIds, vecs, out, stream);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -31,9 +31,11 @@ void runReconstruct(Tensor<int, 1, true>& listIds,
|
|||||||
Tensor<float, 2, true>& out,
|
Tensor<float, 2, true>& out,
|
||||||
cudaStream_t stream);
|
cudaStream_t stream);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void runReconstruct(Tensor<int, 1, true>& listIds,
|
void runReconstruct(Tensor<int, 1, true>& listIds,
|
||||||
Tensor<half, 2, true>& vecs,
|
Tensor<half, 2, true>& vecs,
|
||||||
Tensor<float, 2, true>& out,
|
Tensor<float, 2, true>& out,
|
||||||
cudaStream_t stream);
|
cudaStream_t stream);
|
||||||
|
# endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -10,6 +10,8 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
|
|
||||||
// warp Q to thread Q:
|
// warp Q to thread Q:
|
||||||
// 1, 1
|
// 1, 1
|
||||||
// 32, 2
|
// 32, 2
|
||||||
@ -143,4 +145,6 @@ void runBlockSelectPair(Tensor<half, 2, true>& inK,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif // FAISS_USE_FLOAT16
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -241,6 +241,7 @@ void runBlockSelectPair(Tensor<float, 2, true>& inKeys,
|
|||||||
Tensor<int, 2, true>& outIndices,
|
Tensor<int, 2, true>& outIndices,
|
||||||
bool dir, int k, cudaStream_t stream);
|
bool dir, int k, cudaStream_t stream);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void runBlockSelect(Tensor<half, 2, true>& in,
|
void runBlockSelect(Tensor<half, 2, true>& in,
|
||||||
Tensor<uint8_t, 1, true>& bitset,
|
Tensor<uint8_t, 1, true>& bitset,
|
||||||
Tensor<half, 2, true>& outKeys,
|
Tensor<half, 2, true>& outKeys,
|
||||||
@ -253,5 +254,6 @@ void runBlockSelectPair(Tensor<half, 2, true>& inKeys,
|
|||||||
Tensor<half, 2, true>& outKeys,
|
Tensor<half, 2, true>& outKeys,
|
||||||
Tensor<int, 2, true>& outIndices,
|
Tensor<int, 2, true>& outIndices,
|
||||||
bool dir, int k, cudaStream_t stream);
|
bool dir, int k, cudaStream_t stream);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -29,6 +29,7 @@ struct Convert {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
template <>
|
template <>
|
||||||
struct Convert<float, half> {
|
struct Convert<float, half> {
|
||||||
inline __device__ half operator()(float v) const {
|
inline __device__ half operator()(float v) const {
|
||||||
@ -42,6 +43,7 @@ struct Convert<half, float> {
|
|||||||
return __half2float(v);
|
return __half2float(v);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
struct ConvertTo {
|
struct ConvertTo {
|
||||||
@ -50,38 +52,50 @@ struct ConvertTo {
|
|||||||
template <>
|
template <>
|
||||||
struct ConvertTo<float> {
|
struct ConvertTo<float> {
|
||||||
static inline __device__ float to(float v) { return v; }
|
static inline __device__ float to(float v) { return v; }
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
static inline __device__ float to(half v) { return __half2float(v); }
|
static inline __device__ float to(half v) { return __half2float(v); }
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct ConvertTo<float2> {
|
struct ConvertTo<float2> {
|
||||||
static inline __device__ float2 to(float2 v) { return v; }
|
static inline __device__ float2 to(float2 v) { return v; }
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
static inline __device__ float2 to(half2 v) { return __half22float2(v); }
|
static inline __device__ float2 to(half2 v) { return __half22float2(v); }
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct ConvertTo<float4> {
|
struct ConvertTo<float4> {
|
||||||
static inline __device__ float4 to(float4 v) { return v; }
|
static inline __device__ float4 to(float4 v) { return v; }
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
static inline __device__ float4 to(Half4 v) { return half4ToFloat4(v); }
|
static inline __device__ float4 to(Half4 v) { return half4ToFloat4(v); }
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
template <>
|
template <>
|
||||||
struct ConvertTo<half> {
|
struct ConvertTo<half> {
|
||||||
static inline __device__ half to(float v) { return __float2half(v); }
|
static inline __device__ half to(float v) { return __float2half(v); }
|
||||||
static inline __device__ half to(half v) { return v; }
|
static inline __device__ half to(half v) { return v; }
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
template <>
|
template <>
|
||||||
struct ConvertTo<half2> {
|
struct ConvertTo<half2> {
|
||||||
static inline __device__ half2 to(float2 v) { return __float22half2_rn(v); }
|
static inline __device__ half2 to(float2 v) { return __float22half2_rn(v); }
|
||||||
static inline __device__ half2 to(half2 v) { return v; }
|
static inline __device__ half2 to(half2 v) { return v; }
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
template <>
|
template <>
|
||||||
struct ConvertTo<Half4> {
|
struct ConvertTo<Half4> {
|
||||||
static inline __device__ Half4 to(float4 v) { return float4ToHalf4(v); }
|
static inline __device__ Half4 to(float4 v) { return float4ToHalf4(v); }
|
||||||
static inline __device__ Half4 to(Half4 v) { return v; }
|
static inline __device__ Half4 to(Half4 v) { return v; }
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
// Tensor conversion
|
// Tensor conversion
|
||||||
template <typename From, typename To>
|
template <typename From, typename To>
|
||||||
|
@ -12,6 +12,8 @@
|
|||||||
#include <thrust/execution_policy.h>
|
#include <thrust/execution_policy.h>
|
||||||
#include <thrust/transform.h>
|
#include <thrust/transform.h>
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
bool getDeviceSupportsFloat16Math(int device) {
|
bool getDeviceSupportsFloat16Math(int device) {
|
||||||
@ -36,3 +38,5 @@ __half hostFloat2Half(float a) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
|
||||||
|
#endif // FAISS_USE_FLOAT16
|
||||||
|
@ -22,10 +22,14 @@
|
|||||||
#define FAISS_USE_FULL_FLOAT16 1
|
#define FAISS_USE_FULL_FLOAT16 1
|
||||||
#endif // __CUDA_ARCH__ types
|
#endif // __CUDA_ARCH__ types
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
|
|
||||||
// 64 bytes containing 4 half (float16) values
|
// 64 bytes containing 4 half (float16) values
|
||||||
struct Half4 {
|
struct Half4 {
|
||||||
half2 a;
|
half2 a;
|
||||||
@ -72,4 +76,6 @@ bool getDeviceSupportsFloat16Math(int device);
|
|||||||
|
|
||||||
__half hostFloat2Half(float v);
|
__half hostFloat2Half(float v);
|
||||||
|
|
||||||
|
#endif // FAISS_USE_FLOAT16
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -35,6 +35,8 @@ struct LoadStore {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct LoadStore<Half4> {
|
struct LoadStore<Half4> {
|
||||||
static inline __device__ Half4 load(void* p) {
|
static inline __device__ Half4 load(void* p) {
|
||||||
@ -87,4 +89,6 @@ struct LoadStore<Half8> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -217,6 +217,7 @@ struct Math<float4> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
template <>
|
template <>
|
||||||
struct Math<half> {
|
struct Math<half> {
|
||||||
typedef half ScalarType;
|
typedef half ScalarType;
|
||||||
@ -555,5 +556,6 @@ struct Math<Half8> {
|
|||||||
return h;
|
return h;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
#endif // FAISS_USE_FLOAT16
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -10,6 +10,8 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
|
|
||||||
// warp Q to thread Q:
|
// warp Q to thread Q:
|
||||||
// 1, 1
|
// 1, 1
|
||||||
// 32, 2
|
// 32, 2
|
||||||
@ -91,4 +93,6 @@ void runWarpSelect(Tensor<half, 2, true>& in,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif // FAISS_USE_FLOAT16
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -62,9 +62,11 @@ void runWarpSelect(Tensor<float, 2, true>& in,
|
|||||||
Tensor<int, 2, true>& outIndices,
|
Tensor<int, 2, true>& outIndices,
|
||||||
bool dir, int k, cudaStream_t stream);
|
bool dir, int k, cudaStream_t stream);
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
void runWarpSelect(Tensor<half, 2, true>& in,
|
void runWarpSelect(Tensor<half, 2, true>& in,
|
||||||
Tensor<half, 2, true>& outKeys,
|
Tensor<half, 2, true>& outKeys,
|
||||||
Tensor<int, 2, true>& outIndices,
|
Tensor<int, 2, true>& outIndices,
|
||||||
bool dir, int k, cudaStream_t stream);
|
bool dir, int k, cudaStream_t stream);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -91,6 +91,7 @@ inline __device__ T* shfl_xor(T* const val,
|
|||||||
return (T*) shfl_xor(v, laneMask, width);
|
return (T*) shfl_xor(v, laneMask, width);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
// CUDA 9.0+ has half shuffle
|
// CUDA 9.0+ has half shuffle
|
||||||
#if CUDA_VERSION < 9000
|
#if CUDA_VERSION < 9000
|
||||||
inline __device__ half shfl(half v,
|
inline __device__ half shfl(half v,
|
||||||
@ -113,5 +114,6 @@ inline __device__ half shfl_xor(half v,
|
|||||||
return h;
|
return h;
|
||||||
}
|
}
|
||||||
#endif // CUDA_VERSION
|
#endif // CUDA_VERSION
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,7 +9,9 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
BLOCK_SELECT_IMPL(half, true, 1, 1);
|
BLOCK_SELECT_IMPL(half, true, 1, 1);
|
||||||
BLOCK_SELECT_IMPL(half, false, 1, 1);
|
BLOCK_SELECT_IMPL(half, false, 1, 1);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,7 +9,9 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
BLOCK_SELECT_IMPL(half, true, 128, 3);
|
BLOCK_SELECT_IMPL(half, true, 128, 3);
|
||||||
BLOCK_SELECT_IMPL(half, false, 128, 3);
|
BLOCK_SELECT_IMPL(half, false, 128, 3);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,7 +9,9 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
BLOCK_SELECT_IMPL(half, true, 256, 4);
|
BLOCK_SELECT_IMPL(half, true, 256, 4);
|
||||||
BLOCK_SELECT_IMPL(half, false, 256, 4);
|
BLOCK_SELECT_IMPL(half, false, 256, 4);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,7 +9,9 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
BLOCK_SELECT_IMPL(half, true, 32, 2);
|
BLOCK_SELECT_IMPL(half, true, 32, 2);
|
||||||
BLOCK_SELECT_IMPL(half, false, 32, 2);
|
BLOCK_SELECT_IMPL(half, false, 32, 2);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,7 +9,9 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
BLOCK_SELECT_IMPL(half, true, 64, 3);
|
BLOCK_SELECT_IMPL(half, true, 64, 3);
|
||||||
BLOCK_SELECT_IMPL(half, false, 64, 3);
|
BLOCK_SELECT_IMPL(half, false, 64, 3);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
BLOCK_SELECT_IMPL(half, false, 1024, 8);
|
BLOCK_SELECT_IMPL(half, false, 1024, 8);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -11,7 +11,9 @@
|
|||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
#if GPU_MAX_SELECTION_K >= 2048
|
#if GPU_MAX_SELECTION_K >= 2048
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
BLOCK_SELECT_IMPL(half, false, 2048, 8);
|
BLOCK_SELECT_IMPL(half, false, 2048, 8);
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
BLOCK_SELECT_IMPL(half, false, 512, 8);
|
BLOCK_SELECT_IMPL(half, false, 512, 8);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
BLOCK_SELECT_IMPL(half, true, 1024, 8);
|
BLOCK_SELECT_IMPL(half, true, 1024, 8);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -11,7 +11,9 @@
|
|||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
#if GPU_MAX_SELECTION_K >= 2048
|
#if GPU_MAX_SELECTION_K >= 2048
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
BLOCK_SELECT_IMPL(half, true, 2048, 8);
|
BLOCK_SELECT_IMPL(half, true, 2048, 8);
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
BLOCK_SELECT_IMPL(half, true, 512, 8);
|
BLOCK_SELECT_IMPL(half, true, 512, 8);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,7 +9,9 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
WARP_SELECT_IMPL(half, true, 1, 1);
|
WARP_SELECT_IMPL(half, true, 1, 1);
|
||||||
WARP_SELECT_IMPL(half, false, 1, 1);
|
WARP_SELECT_IMPL(half, false, 1, 1);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,7 +9,9 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
WARP_SELECT_IMPL(half, true, 128, 3);
|
WARP_SELECT_IMPL(half, true, 128, 3);
|
||||||
WARP_SELECT_IMPL(half, false, 128, 3);
|
WARP_SELECT_IMPL(half, false, 128, 3);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,7 +9,9 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
WARP_SELECT_IMPL(half, true, 256, 4);
|
WARP_SELECT_IMPL(half, true, 256, 4);
|
||||||
WARP_SELECT_IMPL(half, false, 256, 4);
|
WARP_SELECT_IMPL(half, false, 256, 4);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,7 +9,9 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
WARP_SELECT_IMPL(half, true, 32, 2);
|
WARP_SELECT_IMPL(half, true, 32, 2);
|
||||||
WARP_SELECT_IMPL(half, false, 32, 2);
|
WARP_SELECT_IMPL(half, false, 32, 2);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,7 +9,9 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
WARP_SELECT_IMPL(half, true, 64, 3);
|
WARP_SELECT_IMPL(half, true, 64, 3);
|
||||||
WARP_SELECT_IMPL(half, false, 64, 3);
|
WARP_SELECT_IMPL(half, false, 64, 3);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
WARP_SELECT_IMPL(half, false, 1024, 8);
|
WARP_SELECT_IMPL(half, false, 1024, 8);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -11,7 +11,9 @@
|
|||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
#if GPU_MAX_SELECTION_K >= 2048
|
#if GPU_MAX_SELECTION_K >= 2048
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
WARP_SELECT_IMPL(half, false, 2048, 8);
|
WARP_SELECT_IMPL(half, false, 2048, 8);
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
WARP_SELECT_IMPL(half, false, 512, 8);
|
WARP_SELECT_IMPL(half, false, 512, 8);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
WARP_SELECT_IMPL(half, true, 1024, 8);
|
WARP_SELECT_IMPL(half, true, 1024, 8);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -11,7 +11,9 @@
|
|||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
#if GPU_MAX_SELECTION_K >= 2048
|
#if GPU_MAX_SELECTION_K >= 2048
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
WARP_SELECT_IMPL(half, true, 2048, 8);
|
WARP_SELECT_IMPL(half, true, 2048, 8);
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
|
|
||||||
namespace faiss { namespace gpu {
|
namespace faiss { namespace gpu {
|
||||||
|
|
||||||
|
#ifdef FAISS_USE_FLOAT16
|
||||||
WARP_SELECT_IMPL(half, true, 512, 8);
|
WARP_SELECT_IMPL(half, true, 512, 8);
|
||||||
|
#endif
|
||||||
|
|
||||||
} } // namespace
|
} } // namespace
|
||||||
|
Loading…
Reference in New Issue
Block a user