Skip to content

Commit

Permalink
GH-528 cache the locations in the super blocks (#529)
Browse files Browse the repository at this point in the history
* we need to specify the version of jackson

* GH-528 improve binary search in super blocks by keeping track of the estimated location of values in the underlying long array. This assumes that the values are ordered.

* improved bit shifting in select1 by using Long.numberOfTrailingZeros

* only create the new Values if we are going to query the native store

* reduce memory pressure

* revert some changes based on review
  • Loading branch information
hmottestad authored Dec 12, 2024
1 parent edd147d commit 2492a84
Show file tree
Hide file tree
Showing 10 changed files with 393 additions and 62 deletions.
11 changes: 11 additions & 0 deletions qendpoint-backend/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,23 @@
<rdf4j.version>5.0.2</rdf4j.version>
<spring.version>3.4.0</spring.version>
<logback.version>1.5.6</logback.version>
<jackson.version>2.18.1</jackson.version>

<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>

<dependencies>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-core</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import com.the_qa_company.qendpoint.core.util.io.CloseSuppressPath;
import com.the_qa_company.qendpoint.core.util.io.Closer;
import com.the_qa_company.qendpoint.core.util.io.IOUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Closeable;
import java.io.IOException;
Expand All @@ -39,14 +41,31 @@
* @author mario.arias
*/
public class Bitmap375Big extends Bitmap64Big {

private static final Logger logger = LoggerFactory.getLogger(Bitmap375Big.class);

/**
 * When {@code true}, {@code select1} locates the super block with
 * {@code binarySearch}; otherwise it uses {@code binarySearchNew}.
 */
private static final boolean oldBinarySearch;

static {
	// The legacy search is opted into with the system property
	// "useOldBinarySearch" set to "true" (case-insensitive). A missing
	// property or any other value selects the new search.
	oldBinarySearch = Boolean.parseBoolean(System.getProperty("useOldBinarySearch"));
	logger.debug(oldBinarySearch ? "Using old binary search" : "Using new binary search");
}

/**
 * Create a disk version of the bitmap with an in memory super index.
 * Delegates to the three-argument {@code disk} overload, passing
 * {@code false} as its last argument.
 *
 * @param location location
 * @param nbits    number of bits
 * @return bitmap
 */
public static Bitmap375Big disk(Path location, long nbits) {
	final Bitmap375Big bitmap = disk(location, nbits, false);
	return bitmap;
}
Expand Down Expand Up @@ -181,6 +200,7 @@ public void updateIndex() {
}
pop = countSuperBlock + countBlock;
indexUpToDate = true;
superBlocks.recalculateEstimatedValueLocation();
}

/*
Expand All @@ -189,8 +209,9 @@ public void updateIndex() {
*/
@Override
public boolean access(long bitIndex) {
if (bitIndex < 0)
if (bitIndex < 0) {
throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex);
}

long wordIndex = wordIndex(bitIndex);
if (wordIndex >= words.length()) {
Expand Down Expand Up @@ -324,15 +345,14 @@ public long select1(long x) {
return 0;
}
// Search superblock (binary Search)
long superBlockIndex = binarySearch(superBlocks, x);
long superBlockIndex = oldBinarySearch ? binarySearch(superBlocks, x) : binarySearchNew(superBlocks, x);

// If there is a run of many zeros, two correlative superblocks may have
// the same value,
// We need to position at the first of them.

while (superBlockIndex > 0 && (superBlocks.get(superBlockIndex) >= x)) {
superBlockIndex--;

}

long countdown = x - superBlocks.get(superBlockIndex);
Expand Down Expand Up @@ -444,6 +464,7 @@ public static long binarySearch0(LongArray arr, long fromIndex, long toIndex, lo
* @param val val
* @return index
*/

public static long binarySearch(LongArray arr, long val) {
long min = 0, max = arr.length(), mid;

Expand All @@ -460,11 +481,52 @@ public static long binarySearch(LongArray arr, long val) {
return min;
}

/**
 * Search {@code arr} for {@code val}, starting from the bounds and the
 * initial probe position that the array's estimated-location cache
 * suggests instead of from the whole array. The search keeps the
 * invariant that the answer lies in {@code [low, high)} and stops once the
 * window holds a single candidate.
 * <p>
 * The position found is written back to the cache through
 * {@link LongArray#updateEstimatedValueLocation(long, long)}.
 *
 * @param arr array to search, its values are expected to be ordered
 * @param val value to look for
 * @return the lower end of the final search window
 */
public static long binarySearchNew(LongArray arr, long val) {
	long low = arr.getEstimatedLocationLowerBound(val);
	long high = arr.getEstimatedLocationUpperBound(val);
	long probe = arr.getEstimatedLocation(val, low, high);

	for (int step = 0; low + 1 < high; step++) {
		// After the first probe the target usually sits right next to the
		// lower bound, so the second and third steps test the direct
		// neighbour of the lower bound (linear step) rather than the
		// midpoint; every other step probes the midpoint / estimate.
		final boolean linearStep = step == 1 || step == 2;
		final long candidate = linearStep ? low + 1 : probe;

		if (arr.get(candidate) >= val) {
			high = candidate;
		} else {
			low = candidate;
		}
		probe = (low + high) / 2;
	}

	arr.updateEstimatedValueLocation(val, low);
	return low;
}

/**
 * @return the value of the {@code blocksPath} field
 */
public CloseSuppressPath getBlocksPath() {
	return this.blocksPath;
}

/**
 * @return the value of the {@code superBlocksPath} field
 */
public CloseSuppressPath getSuperBlocksPath() {
	return this.superBlocksPath;
}

/**
 * @return a fixed label for this bitmap type; no instance state is
 *         included
 */
@Override
public String toString() {
	final String label = "Bitmap375Big{}";
	return label;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,10 @@ public static void writeLowerBitsByteAligned(long value, long numbits, OutputStr
/**
 * Find the position of the {@code rank}-th set bit of {@code value},
 * counting set bits from the least significant bit.
 * <p>
 * Positions are 1-based: the least significant bit is position 1 and the
 * most significant bit is position 64.
 *
 * @param value word to inspect
 * @param rank  which set bit to locate (1 = lowest set bit)
 * @return the 1-based position of the requested set bit; {@code 0} when
 *         {@code rank <= 0} or {@code value == 0}; when {@code value} has
 *         fewer than {@code rank} set bits, the position of its highest set
 *         bit
 */
public static int select1(long value, int rank) {
	int bitpos = 0;
	while (rank > 0 && value != 0) {
		// Jump straight to the next set bit instead of testing bit by bit.
		int trailingZeros = Long.numberOfTrailingZeros(value);
		bitpos += trailingZeros + 1;
		// Shift in two steps: Java only uses the low 6 bits of a long shift
		// distance, so a single ">>> 64" (trailingZeros == 63) would leave
		// the value unchanged and the same bit would be counted again.
		value = (value >>> trailingZeros) >>> 1;
		rank--;
	}
	return bitpos;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package com.the_qa_company.qendpoint.core.util.disk;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Base class for {@link LongArray} implementations that maintains a coarse
 * "estimated location" cache: the value range {@code [0, maxValue]} is cut
 * into power-of-two sized buckets and, for every bucket, the lowest, highest
 * and a representative array position holding a value of that bucket are
 * remembered. The cache is built by
 * {@link #recalculateEstimatedValueLocation()} and is only meaningful when
 * the stored values are ordered.
 * <p>
 * A value {@code v} belongs to bucket {@code v / bucketSize + 1}. Position
 * {@code 0} doubles as the "no entry" marker in the three arrays.
 */
public abstract class AbstractLongArray implements LongArray {

	private final Logger logger = LoggerFactory.getLogger(getClass());

	/** Largest power of two representable as an {@code int}. */
	private static final int MAX_BUCKET_SIZE = 1 << 30;

	/** Number of buckets, chosen once from the JVM's maximum heap size. */
	private static final int ESTIMATED_LOCATION_ARRAY_SIZE;

	static {
		// get total amount of memory that this java program is allowed to use
		long maxMemory = Runtime.getRuntime().maxMemory();

		if (maxMemory >= 1024 * 1024 * 512) {
			ESTIMATED_LOCATION_ARRAY_SIZE = 1024 * 128;
		} else if (maxMemory >= 1024 * 1024 * 256) {
			ESTIMATED_LOCATION_ARRAY_SIZE = 1024 * 64;
		} else if (maxMemory >= 1024 * 1024 * 128) {
			ESTIMATED_LOCATION_ARRAY_SIZE = 1024 * 32;
		} else {
			ESTIMATED_LOCATION_ARRAY_SIZE = 1024 * 16;
		}

	}

	/** Highest array position seen for each bucket. */
	private final long[] estimatedLocationMax = new long[ESTIMATED_LOCATION_ARRAY_SIZE];
	/** Lowest array position seen for each bucket (0 = none recorded). */
	private final long[] estimatedLocationMin = new long[ESTIMATED_LOCATION_ARRAY_SIZE];
	/** Representative (midpoint) array position for each bucket. */
	private final long[] estimatedLocation = new long[ESTIMATED_LOCATION_ARRAY_SIZE];

	private int estimatedLocationBucketSize;

	/** Largest value passed to {@link #set(long, long)} so far, at least 1. */
	long maxValue = 1;

	@Override
	public int getEstimatedLocationArrayBucketSize() {
		return estimatedLocationBucketSize;
	}

	/**
	 * Pick the smallest power-of-two bucket size for which every value up to
	 * {@link #maxValue} maps to a valid bucket index, i.e.
	 * {@code maxValue / bucketSize + 1 < ESTIMATED_LOCATION_ARRAY_SIZE}.
	 */
	private void updateEstimatedLocationArrayBucketSize() {
		// bucketSize > maxValue / (SIZE - 1) guarantees
		// maxValue / bucketSize <= SIZE - 2, so the "+ 1" bucket index of the
		// largest value is still inside the arrays. Dividing by SIZE (and
		// rounding down) would let values in the top range overflow them.
		long threshold = maxValue / (ESTIMATED_LOCATION_ARRAY_SIZE - 1);
		int next = 1;
		// The cap keeps the doubling from overflowing int; values that do
		// not fit because of the cap are skipped when the cache is built.
		while (next <= threshold && next < MAX_BUCKET_SIZE) {
			next <<= 1;
		}
		this.estimatedLocationBucketSize = next;
	}

	@Override
	public long[] getEstimatedLocationArray() {
		return estimatedLocation;
	}

	@Override
	public long[] getEstimatedLocationArrayMin() {
		return estimatedLocationMin;
	}

	@Override
	public long[] getEstimatedLocationArrayMax() {
		return estimatedLocationMax;
	}

	/**
	 * Rebuild the estimated location cache from the current content of the
	 * array. Zero values are ignored, as are values whose bucket index falls
	 * outside the cache (possible when the content was not written through
	 * {@link #set(long, long)} and {@link #maxValue} is therefore too low).
	 */
	@Override
	public void recalculateEstimatedValueLocation() {
		updateEstimatedLocationArrayBucketSize();
		int estimatedLocationBucketSize = getEstimatedLocationArrayBucketSize();

		// Start from a clean cache: entries of a previous run (possibly made
		// with another bucket size or other content) would otherwise survive
		// and yield wrong search bounds.
		java.util.Arrays.fill(estimatedLocationMax, 0L);
		java.util.Arrays.fill(estimatedLocationMin, 0L);
		java.util.Arrays.fill(estimatedLocation, 0L);

		long len = length();
		boolean shouldLog = len > 1024 * 1024 * 2;
		if (shouldLog) {
			logger.info("Recalculating estimated location array 0%");
		}

		// long counter: length() is a long and may exceed Integer.MAX_VALUE
		for (long i = 0; i < len; i++) {
			long val = get(i);
			if (val == 0) {
				continue;
			}

			long bucket = val / estimatedLocationBucketSize + 1;
			if (bucket < 0 || bucket >= ESTIMATED_LOCATION_ARRAY_SIZE) {
				// value cannot be represented in the cache
				continue;
			}

			int index = (int) bucket;
			estimatedLocationMax[index] = Math.max(estimatedLocationMax[index], i);
			if (estimatedLocationMin[index] == 0) {
				estimatedLocationMin[index] = i;
			} else {
				estimatedLocationMin[index] = Math.min(estimatedLocationMin[index], i);
			}
			estimatedLocation[index] = (estimatedLocationMax[index] + estimatedLocationMin[index]) / 2;

			if (shouldLog && i % (1024 * 1024) == 0) {
				logger.info("Recalculating estimated location array {}%", (int) Math.floor(100.0 / len * i));
			}
		}

		if (shouldLog) {
			logger.info("Recalculating estimated location array 100%");
		}
	}

	/**
	 * Store {@code value} at {@code index} via {@link #innerSet(long, long)}
	 * and keep track of the largest value written so far.
	 */
	@Override
	public final void set(long index, long value) {
		maxValue = Math.max(maxValue, value);
		innerSet(index, value);
	}

	/**
	 * Write {@code value} at {@code index} in the underlying storage.
	 *
	 * @param index position to write
	 * @param value value to store
	 */
	abstract protected void innerSet(long index, long value);

}
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,5 @@ public void resize(long newSize) throws IOException {
public void clear() {
array.clear();
}

}
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package com.the_qa_company.qendpoint.core.util.disk;

import com.the_qa_company.qendpoint.core.util.io.IOUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Iterator;
Expand All @@ -10,6 +12,10 @@
* Describe a large array of longs
*/
public interface LongArray extends Iterable<Long> {

Logger logger = LoggerFactory.getLogger(LongArray.class);
long[] EMPTY_ARRAY = new long[0];

/**
* create an in memory long array
*
Expand Down Expand Up @@ -208,4 +214,84 @@ public Long next() {
}
};
}

/**
 * By default this is the same array as {@link #getEstimatedLocationArray()}.
 *
 * @return the estimated location array that contains the highest location
 *         for a given value
 */
default long[] getEstimatedLocationArrayMax() {
	final long[] locations = getEstimatedLocationArray();
	return locations;
}

/**
 * By default this is the same array as {@link #getEstimatedLocationArray()}.
 *
 * @return the estimated location array that contains the lowest location
 *         for a given value
 */
default long[] getEstimatedLocationArrayMin() {
	final long[] locations = getEstimatedLocationArray();
	return locations;
}

/**
 * @return the estimated location array; the shared empty array unless an
 *         implementation overrides this method
 */
default long[] getEstimatedLocationArray() {
	final long[] none = EMPTY_ARRAY;
	return none;
}

/**
 * @return the divisor used to map a value to its index in the estimated
 *         location arrays; 65536 unless overridden
 */
default int getEstimatedLocationArrayBucketSize() {
	final int defaultBucketSize = 65536;
	return defaultBucketSize;
}

/**
 * Estimate a lower bound for the position of {@code val}: the highest
 * position recorded for the bucket just below the bucket of {@code val}.
 *
 * @param val value that is going to be searched
 * @return the recorded position when it is greater than 0, otherwise
 *         {@code 0} (also when the bucket lies outside the array returned
 *         by {@link #getEstimatedLocationArrayMax()}, e.g. the default
 *         empty array)
 */
default long getEstimatedLocationLowerBound(long val) {
	// The bucket of val is "val / bucketSize + 1", so the previous bucket is
	// simply "val / bucketSize". Kept as a long so that very large values
	// cannot wrap around in an int cast.
	long previousBucket = val / getEstimatedLocationArrayBucketSize();
	long[] estimatedLocationMax = getEstimatedLocationArrayMax();
	// Check both ends of the range: without the upper check any lookup on
	// an implementation that keeps the default empty array would throw.
	if (previousBucket >= 0 && previousBucket < estimatedLocationMax.length) {
		long t = estimatedLocationMax[(int) previousBucket];
		if (t > 0) {
			return t;
		}
	}
	return 0;
}

/**
 * Estimate an upper bound for the position of {@code val}: the lowest
 * position recorded for the bucket just above the bucket of {@code val},
 * never more than {@link #length()}.
 *
 * @param val value that is going to be searched
 * @return the recorded position capped at {@code length()}, or
 *         {@code length()} when nothing usable is recorded
 */
default long getEstimatedLocationUpperBound(long val) {
	final int nextBucket = (int) (val / getEstimatedLocationArrayBucketSize() + 1) + 1;
	final long[] firstLocations = getEstimatedLocationArrayMin();
	if (nextBucket >= firstLocations.length) {
		return length();
	}

	final long firstLocation = firstLocations[nextBucket];
	// 0 means that no position was recorded for that bucket
	return firstLocation > 0 ? Math.min(length(), firstLocation) : length();
}

/**
 * Pick the first position to probe when searching for {@code val} inside
 * the open interval {@code (min, max)}.
 *
 * @param val value that is going to be searched
 * @param min lower end of the search window
 * @param max upper end of the search window
 * @return the position cached for the bucket of {@code val} when it lies
 *         strictly between {@code min} and {@code max}, otherwise the
 *         midpoint of the window
 */
default long getEstimatedLocation(long val, long min, long max) {
	final int bucket = (int) (val / getEstimatedLocationArrayBucketSize() + 1);
	final long[] cached = getEstimatedLocationArray();

	if (bucket < cached.length) {
		final long guess = cached[bucket];
		if (min < guess && guess < max) {
			return guess;
		}
	}
	return (min + max) / 2;
}

/**
 * Default implementation: does not compute anything, it only logs that the
 * implementing class has no support for this operation.
 */
default void recalculateEstimatedValueLocation() {
	final String implementation = this.getClass().getCanonicalName();
	logger.info("Class {} does not support recalculateEstimatedValueLocation()", implementation);
}

/**
 * Remember {@code min} as the estimated location for the bucket of
 * {@code val}. Does nothing when that bucket is past the end of the array
 * returned by {@link #getEstimatedLocationArray()}.
 *
 * @param val value that was searched
 * @param min position to store for the bucket of {@code val}
 */
default void updateEstimatedValueLocation(long val, long min) {
	final int bucket = (int) (val / getEstimatedLocationArrayBucketSize() + 1);
	final long[] cached = getEstimatedLocationArray();
	if (bucket < cached.length) {
		cached[bucket] = min;
	}
}
}
Loading

0 comments on commit 2492a84

Please sign in to comment.