Skip to content

Commit

Permalink
fix bugs of the quantized object stream reducing the performance
Browse files Browse the repository at this point in the history
  • Loading branch information
masajiro committed Feb 14, 2021
1 parent ec54799 commit 9078fdc
Show file tree
Hide file tree
Showing 10 changed files with 211 additions and 121 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Neighborhood Graph and Tree for Indexing High-dimensional Data

News
----
- 01/18/2021 NGT v1.13.0 to provide the [quantized graph (NGTQG)](bin/ngtqg/README.md) is released.
- 01/15/2021 NGT v1.13.0 to provide the [quantized graph (NGTQG)](bin/ngtqg/README.md) is released.
- 11/04/2019 [NGT tutorial](https://github.com/yahoojapan/NGT/wiki) has been released.
- 06/26/2019 Jaccard distance is available. (v1.7.6)
- 06/10/2019 PyPI NGT package v1.7.5 is now available.
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.13.3
1.13.4
2 changes: 1 addition & 1 deletion bin/ngtqg/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ Quantize the objects of the specified index and build a quantized graph into the
Specify the name of the directory for the existing index such as ANNG or ONNG to be quantized. Only an index built with L2 distance or normalized L2 distance can be quantized. You should build the ANNG or ONNG with normalized L2 distance in order to use cosine similarity for the quantized graph.

**-E** *max_no_of_edges*
Specify the maximum number of edges to build a quantized graph. Since every 64 or 32 objects that are associated with edges of each node are processed, the number should be a multiple of 64 for avx512 or 32 for avx2.
Specify the maximum number of edges to build a quantized graph. Since every 16 objects that are associated with edges of each node are processed, the number should be a multiple of 16.

### SEARCH

Expand Down
22 changes: 14 additions & 8 deletions lib/NGT/Command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -923,9 +923,11 @@ using namespace std;
if (removedIDs.find(id) == removedIDs.end() && id < objSize) {
std::cerr << "Not found an object in the tree. However, it might be a duplicated object. " << id << std::endl;
uninsertedTreeObjectCount++;
try {
graphIndex.repository.remove(id);
} catch(...) {}
if (repair) {
try {
graphIndex.repository.remove(id);
} catch(...) {}
}
}
}
}
Expand All @@ -952,14 +954,18 @@ using namespace std;
}
invalidGraphObjectCount++;
}
} catch (...) {
} catch(NGT::Exception &err) {
if (removedIDs.find(id) == removedIDs.end() && id < objSize) {
std::cerr << "Not found an object in the graph. It should be inserted into the graph. " << id << std::endl;
std::cerr << "Not found an object in the graph. It should be inserted into the graph. " << err.what() << " ID=" << id << std::endl;
uninsertedGraphObjectCount++;
try {
graphAndTreeIndex.DVPTree::removeNaively(id);
} catch(...) {}
if (repair) {
try {
graphAndTreeIndex.DVPTree::removeNaively(id);
} catch(...) {}
}
}
} catch(...) {
std::cerr << "Unexpected error!" << std::endl;
}
}

Expand Down
11 changes: 10 additions & 1 deletion lib/NGT/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -856,6 +856,11 @@ NGT::GraphIndex::showStatisticsOfGraph(NGT::GraphIndex &outGraph, char mode, siz
#else
NGT::ObjectDistance &n = (*node)[i];
#endif
if (std::isnan(n.distance)) {
stringstream msg;
msg << "Index::showStatisticsOfGraph: Fatal inner error! The graph has a node with nan distance. " << id << ":" << n.id << ":" << n.distance;
NGTThrowException(msg);
}
if (n.id == 0) {
std::cerr << "ngt info: Warning. id is zero." << std::endl;
valid = false;
Expand Down Expand Up @@ -978,7 +983,11 @@ NGT::GraphIndex::showStatisticsOfGraph(NGT::GraphIndex &outGraph, char mode, siz
}
std::sort(node.begin(), node.end());
for (size_t i = 0; i < node.size(); i++) {
assert(i == 0 || node[i - 1] <= node[i]);
if (i > 0 && node[i - 1] > node[i]) {
stringstream msg;
msg << "Index::showStatisticsOfGraph: Fatal inner error! Wrong distance order " << node[i - 1] << ":" << node[i];
NGTThrowException(msg);
}
if (i >= dcsize) {
break;
}
Expand Down
70 changes: 21 additions & 49 deletions lib/NGT/NGTQ/QuantizedGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,24 +63,10 @@ namespace NGTQG {
return PARENT::at(id).ids;
}


// Rounds an edge count up to a whole number of processing batches.
// Returns 0 when there are no edges; otherwise the smallest multiple of
// (NGTQ_SIMD_BLOCK_SIZE * NGTQ_BATCH_SIZE) that is not less than noEdges.
size_t getNumOfPaddedUint8Objects(size_t noEdges) {
  if (noEdges == 0) {
    return 0;
  }
  // Number of objects handled by one batch of SIMD blocks.
  const size_t batchUnit = NGTQ_SIMD_BLOCK_SIZE * NGTQ_BATCH_SIZE;
  // Ceiling division written as (n - 1) / unit + 1; safe because n >= 1 here.
  const size_t batchCount = (noEdges - 1) / batchUnit + 1;
  return batchCount * batchUnit;
}
// Converts a padded uint8 object count into the byte size of the packed
// stream: (noUint8Objects * numOfSubspaces) codes at two codes per byte,
// plus one extra byte. Returns 0 when there are no objects.
size_t getNumOfPaddedUint4Objects(size_t noUint8Objects) {
  return noUint8Objects == 0 ? 0 : (noUint8Objects * numOfSubspaces) / 2 + 1;
}

void construct(NGT::Index &ngtindex, NGTQ::Index &quantizedIndex, size_t maxNoOfEdges) {
NGTQ::InvertedIndexEntry<uint16_t> invertedIndexObjects(numOfSubspaces);
quantizedIndex.getQuantizer().extractInvertedIndexObject(invertedIndexObjects);

NGT::GraphAndTreeIndex &index = static_cast<NGT::GraphAndTreeIndex&>(ngtindex.getIndex());
NGT::NeighborhoodGraph &graph = static_cast<NGT::NeighborhoodGraph&>(index);

Expand All @@ -89,12 +75,11 @@ namespace NGTQG {

for (size_t id = 1; id < graphRepository.size(); id++) {
NGT::GraphNode &node = *graphRepository.VECTOR::get(id);
size_t noEdges = node.size() < maxNoOfEdges ? node.size() : maxNoOfEdges;
(*this)[id].ids.reserve(noEdges);
size_t noObjects = getNumOfPaddedUint8Objects(noEdges);
uint8_t *objectData = new uint8_t[noObjects * numOfSubspaces]();
size_t numOfEdges = node.size() < maxNoOfEdges ? node.size() : maxNoOfEdges;
(*this)[id].ids.reserve(numOfEdges);
NGTQ::QuantizedObjectProcessingStream quantizedStream(quantizedIndex.getQuantizer(), numOfEdges);
for (auto i = node.begin(); i != node.end(); i++) {
if (distance(node.begin(), i) >= static_cast<int64_t>(noEdges)) {
if (distance(node.begin(), i) >= static_cast<int64_t>(numOfEdges)) {
break;
}
if ((*i).id == 0) {
Expand All @@ -105,79 +90,66 @@ namespace NGTQG {
(*this)[id].ids.push_back((*i).id);
for (size_t idx = 0; idx < numOfSubspaces; idx++) {
size_t dataNo = distance(node.begin(), i);
size_t blkNo = dataNo / NGTQ_SIMD_BLOCK_SIZE;
size_t oft = dataNo - blkNo * NGTQ_SIMD_BLOCK_SIZE;
#if defined(NGT_SHARED_MEMORY_ALLOCATOR)
abort();
#else
objectData[blkNo * (NGTQ_SIMD_BLOCK_SIZE * numOfSubspaces) + NGTQ_SIMD_BLOCK_SIZE * idx + oft] = invertedIndexObjects[(*i).id].localID[idx] - 1;
quantizedStream.arrangeQuantizedObject(dataNo, idx, invertedIndexObjects[(*i).id].localID[idx] - 1);
#endif
}
}

{
size_t idx = 0;
uint8_t *uint4Objects = new uint8_t[getNumOfPaddedUint4Objects(noObjects)]();
for (size_t nidx = 0; nidx < noObjects; nidx += NGTQ_SIMD_BLOCK_SIZE * NGTQ_BATCH_SIZE) {
for (size_t bcnt = 0; bcnt < NGTQ_BATCH_SIZE; bcnt++) {
for (size_t lidx = 0; lidx < numOfSubspaces; lidx++) {
for (size_t bidx = 0; bidx < NGTQ_SIMD_BLOCK_SIZE; bidx++) {
if (idx % 2 == 0) {
uint4Objects[idx / 2] = objectData[idx];
} else {
uint4Objects[idx / 2] |= (objectData[idx] << 4);
}
idx++;
}
}
}
}
delete[] objectData;
(*this)[id].objects = uint4Objects;
}

(*this)[id].objects = quantizedStream.compressIntoUint4();
}
}

void serialize(std::ofstream &os, NGT::ObjectSpace *objspace = 0) {
NGTQ::QuantizedObjectProcessingStream quantizedObjectProcessingStream(numOfSubspaces);
uint64_t n = numOfSubspaces;
NGT::Serializer::write(os, n);
n = PARENT::size();
NGT::Serializer::write(os, n);
for (auto i = PARENT::begin(); i != PARENT::end(); ++i) {
NGT::Serializer::write(os, (*i).ids);
size_t noObjects = getNumOfPaddedUint4Objects(getNumOfPaddedUint8Objects((*i).ids.size()));
NGT::Serializer::write(os, static_cast<uint8_t*>((*i).objects), noObjects);
size_t streamSize = quantizedObjectProcessingStream.getUint4StreamSize((*i).ids.size());
NGT::Serializer::write(os, static_cast<uint8_t*>((*i).objects), streamSize);
}
}

void deserialize(std::ifstream &is, NGT::ObjectSpace *objectspace = 0) {
try {
NGTQ::QuantizedObjectProcessingStream quantizedObjectProcessingStream(numOfSubspaces);
uint64_t n;
NGT::Serializer::read(is, n);
numOfSubspaces = n;
NGT::Serializer::read(is, n);
PARENT::resize(n);
for (auto i = PARENT::begin(); i != PARENT::end(); ++i) {
NGT::Serializer::read(is, (*i).ids);
size_t noObjects = getNumOfPaddedUint4Objects(getNumOfPaddedUint8Objects((*i).ids.size()));
uint8_t *objects = new uint8_t[noObjects];
NGT::Serializer::read(is, objects, noObjects);
(*i).objects = objects;
size_t streamSize = quantizedObjectProcessingStream.getUint4StreamSize((*i).ids.size());
uint8_t *objectStream = new uint8_t[streamSize];
NGT::Serializer::read(is, objectStream, streamSize);
(*i).objects = objectStream;
}
} catch(NGT::Exception &err) {
std::stringstream msg;
msg << "QuantizedGraph::deserialize: Fatal error. " << err.what();
NGTThrowException(msg);
}
}

void save(const string &path) {
const std::string p(path + "/grp");
std::ofstream os(p);
serialize(os);
}

void load(const string &path) {
const std::string p(path + "/grp");
std::ifstream is(p);
deserialize(is);
}

size_t numOfSubspaces;
};

Expand Down
Loading

0 comments on commit 9078fdc

Please sign in to comment.