Skip to content

Commit

Permalink
Buffer writes (IGD), which makes the bit-vector algorithm simpler
Browse files Browse the repository at this point in the history
IGD->IGD conversion is about twice as faster with this change, on
simulated datasets that have a lot of HF variants.
  • Loading branch information
dcdehaas committed Dec 23, 2024
1 parent e6a67c9 commit 897f4bd
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 52 deletions.
49 changes: 13 additions & 36 deletions picovcf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1543,8 +1543,7 @@ class IGDData {
}
return std::move(sampleList);
} else {
const bool phased = (bool)(m_header.flags & IGD_PHASED);
const SampleT numSamples = phased ? (m_header.ploidy * m_header.numIndividuals) : m_header.numIndividuals;
const SampleT numSamples = this->numSamples();
const SampleT readAmount = picovcf_div_ceiling<SampleT, 8>(numSamples);
PICOVCF_RELEASE_ASSERT(readAmount > 0);
std::unique_ptr<uint8_t> buffer(new uint8_t[readAmount]);
Expand Down Expand Up @@ -1672,37 +1671,15 @@ static inline void writeString(const std::string& value, std::ostream& outStream
outStream.write(value.c_str(), value.size());
}

inline void writeAllelesAsOnes(std::ostream& outStream, const IGDSampleList& orderedSampleList, SampleT numSamples) {
// XXX this can be majorly optimized.
uint8_t buffer = 0x0;
SampleT i = 0;
SampleT j = 0;
SampleT nextSample = SAMPLE_INDEX_NOT_SET;
if (i < orderedSampleList.size()) {
nextSample = orderedSampleList[i];
i++;
}
while (j < numSamples) {
if (j != 0 && j % 8 == 0) {
writeScalar<uint8_t>(buffer, outStream);
}
buffer <<= 1;
if (j == nextSample) {
buffer |= 0x1;
if (i < orderedSampleList.size()) {
nextSample = orderedSampleList[i];
i++;
} else {
nextSample = SAMPLE_INDEX_NOT_SET;
}
}
j++;
inline void
writeAllelesAsOnes(std::vector<uint8_t>& outData, const IGDSampleList& orderedSampleList, SampleT numSamples) {
const SampleT writeAmount = picovcf_div_ceiling<SampleT, 8>(numSamples);
outData.resize(writeAmount);
for (const auto& sampleId : orderedSampleList) {
const size_t byteIndex = sampleId / 8;
const size_t bitIndex = 7 - (sampleId % 8);
outData[byteIndex] |= (1 << bitIndex);
}
SampleT leftover = numSamples % 8;
if (leftover > 0) {
buffer <<= (8 - leftover);
}
writeScalar<uint8_t>(buffer, outStream);
}

/**
Expand Down Expand Up @@ -1792,7 +1769,9 @@ class IGDWriter {
writeSparse(outStream, sampleList);
m_sparseCount++;
} else {
writeAllelesAsOnes(outStream, sampleList, numSamples);
std::vector<uint8_t> writeBuffer;
writeAllelesAsOnes(writeBuffer, sampleList, numSamples);
outStream.write((const char*)writeBuffer.data(), writeBuffer.size());
}
m_header.numVariants++;
m_totalCount++;
Expand Down Expand Up @@ -1888,9 +1867,7 @@ class IGDWriter {
private:
void writeSparse(std::ostream& outStream, const IGDSampleList& sampleList) {
writeScalar<SampleT>(sampleList.size(), outStream);
for (SampleT i = 0; i < sampleList.size(); i++) {
writeScalar<SampleT>(sampleList[i], outStream);
}
outStream.write(reinterpret_cast<const char*>(sampleList.data()), sizeof(SampleT) * sampleList.size());
}

IGDData::IndexEntry makeEntry(VariantT genomePosition,
Expand Down
30 changes: 14 additions & 16 deletions test/test_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,27 +56,25 @@ TEST(Helpers, Split) {


TEST(Helper, SerializeAlleleBits) {
InMemBuffer buffer(DEFAULT_BUFFER_SIZE);
std::ostream outStream(&buffer);
std::vector<uint8_t> buffer;

writeAllelesAsOnes(outStream, {0, 1, 2, 3, 4, 5, 6, 7}, 8);
ASSERT_EQ((uint8_t)buffer.m_buffer[0], 0xFF);
ASSERT_EQ((uint8_t)buffer.m_buffer[1], 0x00);
auto sampleSet = getSamplesWithAlt((const uint8_t*)&buffer.m_buffer[0], 8);
writeAllelesAsOnes(buffer, {0, 1, 2, 3, 4, 5, 6, 7}, 8);
ASSERT_EQ((uint8_t)buffer.at(0), 0xFF);
auto sampleSet = getSamplesWithAlt((const uint8_t*)buffer.data(), 8);
ASSERT_EQ(sampleSet, IGDSampleList({0, 1, 2, 3, 4, 5, 6, 7}));

buffer.reset(DEFAULT_BUFFER_SIZE);
writeAllelesAsOnes(outStream, {0, 1, 2, 5, 6, 7, 8}, 10);
ASSERT_EQ((uint8_t)buffer.m_buffer[0], 0xE7);
ASSERT_EQ((uint8_t)buffer.m_buffer[1], 0x80);
sampleSet = getSamplesWithAlt((const uint8_t*)&buffer.m_buffer[0], 10);
buffer.clear();
writeAllelesAsOnes(buffer, {0, 1, 2, 5, 6, 7, 8}, 10);
ASSERT_EQ((uint8_t)buffer.at(0), 0xE7);
ASSERT_EQ((uint8_t)buffer.at(1), 0x80);
sampleSet = getSamplesWithAlt((const uint8_t*)buffer.data(), 10);
ASSERT_EQ(sampleSet, IGDSampleList({0, 1, 2, 5, 6, 7, 8}));

buffer.reset(DEFAULT_BUFFER_SIZE);
writeAllelesAsOnes(outStream, {15}, 16);
ASSERT_EQ((uint8_t)buffer.m_buffer[0], 0x00);
ASSERT_EQ((uint8_t)buffer.m_buffer[1], 0x01);
sampleSet = getSamplesWithAlt((const uint8_t*)&buffer.m_buffer[0], 16);
buffer.clear();
writeAllelesAsOnes(buffer, {15}, 16);
ASSERT_EQ((uint8_t)buffer.at(0), 0x00);
ASSERT_EQ((uint8_t)buffer.at(1), 0x01);
sampleSet = getSamplesWithAlt((const uint8_t*)buffer.data(), 16);
ASSERT_EQ(sampleSet, IGDSampleList({15}));
}

Expand Down

0 comments on commit 897f4bd

Please sign in to comment.