Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow passing different values for nucleotide and aminoacid for spacedKmers #867

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ Parameters::Parameters():
scoringMatrixFile(NuclAA<std::string>("INVALID", "INVALID")),
seedScoringMatrixFile(NuclAA<std::string>("INVALID", "INVALID")),
alphabetSize(NuclAA<int>(INT_MAX,INT_MAX)),
spacedKmer(NuclAA<int>(1,0)),
PARAM_S(PARAM_S_ID, "-s", "Sensitivity", "Sensitivity: 1.0 faster; 4.0 fast; 7.5 sensitive", typeid(float), (void *) &sensitivity, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PREFILTER),
PARAM_K(PARAM_K_ID, "-k", "k-mer length", "k-mer length (0: automatically set to optimum)", typeid(int), (void *) &kmerSize, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
PARAM_TARGET_SEARCH_MODE(PARAM_TARGET_SEARCH_MODE_ID, "--target-search-mode", "Target search mode", "target search mode (0: regular k-mer, 1: similar k-mer)", typeid(int), (void *) &targetSearchMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
Expand All @@ -60,7 +61,7 @@ Parameters::Parameters():
PARAM_NO_COMP_BIAS_CORR(PARAM_NO_COMP_BIAS_CORR_ID, "--comp-bias-corr", "Compositional bias", "Correct for locally biased amino acid composition (range 0-1)", typeid(int), (void *) &compBiasCorrection, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
PARAM_NO_COMP_BIAS_CORR_SCALE(PARAM_NO_COMP_BIAS_CORR_SCALE_ID, "--comp-bias-corr-scale", "Compositional bias", "Correct for locally biased amino acid composition (range 0-1)", typeid(float), (void *) &compBiasCorrectionScale, "^0(\\.[0-9]+)?|^1(\\.0+)?$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),

PARAM_SPACED_KMER_MODE(PARAM_SPACED_KMER_MODE_ID, "--spaced-kmer-mode", "Spaced k-mers", "0: use consecutive positions in k-mers; 1: use spaced k-mers", typeid(int), (void *) &spacedKmer, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
PARAM_SPACED_KMER_MODE(PARAM_SPACED_KMER_MODE_ID, "--spaced-kmer-mode", "Spaced k-mers", "0: use consecutive positions in k-mers; 1: use spaced k-mers", typeid(MultiParam<NuclAA<int>>), (void *) &spacedKmer, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
PARAM_REMOVE_TMP_FILES(PARAM_REMOVE_TMP_FILES_ID, "--remove-tmp-files", "Remove temporary files", "Delete temporary files", typeid(bool), (void *) &removeTmpFiles, "", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
PARAM_INCLUDE_IDENTITY(PARAM_INCLUDE_IDENTITY_ID, "--add-self-matches", "Include identical seq. id.", "Artificially add entries of queries with themselves (for clustering)", typeid(bool), (void *) &includeIdentity, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
PARAM_PRELOAD_MODE(PARAM_PRELOAD_MODE_ID, "--db-load-mode", "Preload mode", "Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch", typeid(int), (void *) &preloadMode, "[0-3]{1}", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
Expand Down Expand Up @@ -2299,7 +2300,7 @@ void Parameters::setDefaults() {
maskProb = 0.9;
maskLowerCaseMode = 0;
minDiagScoreThr = 15;
spacedKmer = true;
spacedKmer = MultiParam<NuclAA<int>>(NuclAA<int>(1,0));;
includeIdentity = false;
alignmentMode = ALIGNMENT_MODE_FAST_AUTO;
alignmentOutputMode = ALIGNMENT_OUTPUT_ALIGNMENT;
Expand Down
2 changes: 1 addition & 1 deletion src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ class Parameters {
int maskLowerCaseMode; // mask lowercase letters in prefilter and kmermatchers

int minDiagScoreThr; // min diagonal score
int spacedKmer; // Spaced Kmers
MultiParam<NuclAA<int>> spacedKmer; // Spaced Kmers
int split; // Split database in n equal chunks
int splitMode; // Split by query or target DB
size_t splitMemoryLimit; // Maximum memory in bytes a split can use
Expand Down
8 changes: 7 additions & 1 deletion src/linclust/LinsearchIndexReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,12 @@ void LinsearchIndexReader::writeKmerIndexToDisk(std::string fileName, KmerPositi

std::string LinsearchIndexReader::findIncompatibleParameter(DBReader<unsigned int> & index, Parameters &par, int dbtype) {
PrefilteringIndexData meta = PrefilteringIndexReader::getMetadata(&index);
int spacedKmer = 0;
if (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_NUCLEOTIDES)) {
spacedKmer = par.spacedKmer.values.nucleotide();
} else {
spacedKmer = par.spacedKmer.values.aminoacid();
}
if (meta.maxSeqLength != static_cast<int>(par.maxSeqLen))
return "maxSeqLen";
if (meta.seqType != dbtype)
Expand All @@ -266,7 +272,7 @@ std::string LinsearchIndexReader::findIncompatibleParameter(DBReader<unsigned in
return "kmerSize";
if (meta.mask != (par.maskMode > 0))
return "maskMode";
if (meta.spacedKmer != par.spacedKmer)
if (meta.spacedKmer != spacedKmer)
return "spacedKmer";
if (BaseMatrix::unserializeName(par.seedScoringMatrixFile.values.aminoacid().c_str()) != PrefilteringIndexReader::getSubstitutionMatrixName(&index) &&
BaseMatrix::unserializeName(par.seedScoringMatrixFile.values.nucleotide().c_str()) != PrefilteringIndexReader::getSubstitutionMatrixName(&index))
Expand Down
7 changes: 6 additions & 1 deletion src/linclust/kmerindexdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,12 @@ int kmerindexdb(int argc, const char **argv, const Command &command) {

Debug(Debug::INFO) << "Write META (" << PrefilteringIndexReader::META << ")\n";
const int mask = par.maskMode > 0;
const int spacedKmer = (par.spacedKmer) ? 1 : 0;
int spacedKmer = 0;
if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) {
spacedKmer = par.spacedKmer.values.nucleotide();
} else {
spacedKmer = par.spacedKmer.values.aminoacid();
}
const bool sameDB = (par.db1 == par.db2);
const int headers1 = 1;
const int headers2 = (sameDB) ? 1 : 0;
Expand Down
11 changes: 8 additions & 3 deletions src/linclust/kmermatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,14 @@ std::pair<size_t, size_t> fillKmerPositionArray(KmerPosition<T> * kmerArray, siz
#endif
unsigned short * scoreDist= new unsigned short[65536];
unsigned int * hierarchicalScoreDist= new unsigned int[128];

int spacedKmer = 0;
if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) {
spacedKmer = par.spacedKmer.values.nucleotide();
} else {
spacedKmer = par.spacedKmer.values.aminoacid();
}
const int adjustedKmerSize = (par.adjustKmerLength) ? std::min( par.kmerSize+5, 23) : par.kmerSize;
Sequence seq(par.maxSeqLen, querySeqType, subMat, adjustedKmerSize, par.spacedKmer, false, true, par.spacedKmerPattern);
Sequence seq(par.maxSeqLen, querySeqType, subMat, adjustedKmerSize, spacedKmer, false, true, par.spacedKmerPattern);
KmerGenerator* generator;
if (TYPE == Parameters::DBTYPE_HMM_PROFILE) {
generator = new KmerGenerator( par.kmerSize, subMat->alphabetSize, 150);
Expand Down Expand Up @@ -648,7 +653,7 @@ template size_t assignGroup<1, int>(KmerPosition<int> *kmers, size_t splitKmerCo
void setLinearFilterDefault(Parameters *p) {
p->covThr = 0.8;
p->maskMode = 0;
p->spacedKmer = 0;
p->spacedKmer = MultiParam<NuclAA<int>>(NuclAA<int>(0, 0));
p->kmerSize = Parameters::CLUST_LINEAR_DEFAULT_K;
p->alphabetSize = MultiParam<NuclAA<int>>(NuclAA<int>(Parameters::CLUST_LINEAR_DEFAULT_ALPH_SIZE, 5));
p->kmersPerSequence = Parameters::CLUST_LINEAR_KMER_PER_SEQ;
Expand Down
14 changes: 10 additions & 4 deletions src/linclust/kmersearch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,16 +160,22 @@ int kmersearch(int argc, const char **argv, const Command &command) {
}
}
if(par.PARAM_SPACED_KMER_MODE.wasSet){
if(data.spacedKmer != par.spacedKmer){
Debug(Debug::ERROR) << "Index was created with --spaced-kmer-mode " << data.spacedKmer << " but the prefilter was called with --spaced-kmer-mode " << par.spacedKmer << "!\n";
Debug(Debug::ERROR) << "createindex --spaced-kmer-mode " << par.spacedKmer << "\n";
bool isSpaced = false;
if (Parameters::isEqualDbtype(data.seqType, Parameters::DBTYPE_NUCLEOTIDES)) {
isSpaced = par.spacedKmer.values.nucleotide();
} else {
isSpaced = par.spacedKmer.values.aminoacid();
}
if(data.spacedKmer != isSpaced){
Debug(Debug::ERROR) << "Index was created with --spaced-kmer-mode " << data.spacedKmer << " but the prefilter was called with --spaced-kmer-mode " << isSpaced << "!\n";
Debug(Debug::ERROR) << "createindex --spaced-kmer-mode " << isSpaced << "\n";
EXIT(EXIT_FAILURE);
}
}
par.kmerSize = data.kmerSize;
par.alphabetSize = data.alphabetSize;
targetSeqType = data.seqType;
par.spacedKmer = (data.spacedKmer == 1) ? true : false;
par.spacedKmer = data.spacedKmer;
par.maxSeqLen = data.maxSeqLength;
// Reuse the compBiasCorr field to store the adjustedKmerSize, It is not needed in the linsearch
adjustedKmerSize = data.compBiasCorr;
Expand Down
8 changes: 6 additions & 2 deletions src/prefiltering/Prefiltering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ Prefiltering::Prefiltering(const std::string &queryDB,
kmerSize(par.kmerSize),
spacedKmerPattern(par.spacedKmerPattern),
localTmp(par.localTmp),
spacedKmer(par.spacedKmer != 0),
maskMode(par.maskMode),
maskLowerCaseMode(par.maskLowerCaseMode),
maskProb(par.maskProb),
Expand Down Expand Up @@ -79,6 +78,12 @@ Prefiltering::Prefiltering(const std::string &queryDB,
EXIT(EXIT_FAILURE);
}

if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) {
spacedKmer = par.spacedKmer.values.nucleotide();
} else {
spacedKmer = par.spacedKmer.values.aminoacid();
}

if (Parameters::isEqualDbtype(FileUtil::parseDbType(targetDB.c_str()), Parameters::DBTYPE_INDEX_DB)) {
if (preloadMode == Parameters::PRELOAD_MODE_AUTO) {
if (sensitivity > 6.0) {
Expand Down Expand Up @@ -135,7 +140,6 @@ Prefiltering::Prefiltering(const std::string &queryDB,
kmerSize = data.kmerSize;
alphabetSize = data.alphabetSize;
targetSeqType = data.seqType;
spacedKmer = data.spacedKmer == 1 ? true : false;
// the query database could have longer sequences than the target database, do not cut them short
maxSeqLen = std::max(maxSeqLen, (size_t)data.maxSeqLength);
aaBiasCorrection = data.compBiasCorr;
Expand Down
10 changes: 8 additions & 2 deletions src/util/alignbykmer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,12 @@ int alignbykmer(int argc, const char **argv, const Command &command) {
int pathScore;
};

int spacedKmer = 0;
if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) {
spacedKmer = par.spacedKmer.values.nucleotide();
} else {
spacedKmer = par.spacedKmer.values.aminoacid();
}

size_t totalMemory = Util::getTotalSystemMemory();
size_t flushSize = 100000000;
Expand All @@ -165,8 +171,8 @@ int alignbykmer(int argc, const char **argv, const Command &command) {

#pragma omp parallel
{
Sequence query(par.maxSeqLen, querySeqType, subMat, par.kmerSize, par.spacedKmer, false, true, par.spacedKmerPattern);
Sequence target(par.maxSeqLen, targetSeqType, subMat, par.kmerSize, par.spacedKmer, false, true, par.spacedKmerPattern);
Sequence query(par.maxSeqLen, querySeqType, subMat, par.kmerSize, spacedKmer, false, true, par.spacedKmerPattern);
Sequence target(par.maxSeqLen, targetSeqType, subMat, par.kmerSize, spacedKmer, false, true, par.spacedKmerPattern);
KmerGenerator kmerGenerator(par.kmerSize, subMat->alphabetSize, 70.0);
kmerGenerator.setDivideStrategy(NULL, &_2merSubMatrix);
size_t lookupSize = MathUtil::ipow<size_t>(subMat->alphabetSize, par.kmerSize);
Expand Down
9 changes: 7 additions & 2 deletions src/util/countkmer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ int countkmer(int argc, const char **argv, const Command& command) {
int seqType = reader.sequenceReader->getDbtype();
BaseMatrix * subMat;
size_t isNucl=Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES);
int spacedKmer = 0;
if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES)) {
spacedKmer = par.spacedKmer.values.nucleotide();
} else {
spacedKmer = par.spacedKmer.values.aminoacid();
}
if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES)) {
subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0);
} else {
Expand All @@ -41,8 +47,7 @@ int countkmer(int argc, const char **argv, const Command& command) {
#pragma omp parallel
{
Indexer idx(subMat->alphabetSize-1, par.kmerSize);
Sequence s(maxLen, seqType, subMat,
par.kmerSize, par.spacedKmer, false);
Sequence s(maxLen, seqType, subMat, par.kmerSize, spacedKmer, false);

#pragma omp for schedule(dynamic, 1)
for (size_t i = 0; i < reader.sequenceReader->getSize(); i++) {
Expand Down
16 changes: 14 additions & 2 deletions src/util/indexdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ void setIndexDbDefaults(Parameters *p) {

std::string findIncompatibleParameter(DBReader<unsigned int>& index, const Parameters& par, int kmerScore, const int dbtype) {
PrefilteringIndexData meta = PrefilteringIndexReader::getMetadata(&index);
int spacedKmer = 0;
if (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_NUCLEOTIDES)) {
spacedKmer = par.spacedKmer.values.nucleotide();
} else {
spacedKmer = par.spacedKmer.values.aminoacid();
}
if (meta.compBiasCorr != par.compBiasCorrection)
return "compBiasCorrection";
if (meta.maxSeqLength != static_cast<int>(par.maxSeqLen))
Expand All @@ -29,7 +35,7 @@ std::string findIncompatibleParameter(DBReader<unsigned int>& index, const Param
return "maskMode";
if (meta.kmerThr != kmerScore)
return "kmerScore";
if (meta.spacedKmer != par.spacedKmer)
if (meta.spacedKmer != spacedKmer)
return "spacedKmer";
if (BaseMatrix::unserializeName(par.seedScoringMatrixFile.values.aminoacid().c_str()) != PrefilteringIndexReader::getSubstitutionMatrixName(&index) &&
BaseMatrix::unserializeName(par.seedScoringMatrixFile.values.nucleotide().c_str()) != PrefilteringIndexReader::getSubstitutionMatrixName(&index))
Expand Down Expand Up @@ -174,8 +180,14 @@ int indexdb(int argc, const char **argv, const Command &command) {
}

DBReader<unsigned int>::removeDb(indexDB);
int spacedKmer = 0;
if (db1IsNucl) {
spacedKmer = par.spacedKmer.values.nucleotide();
} else {
spacedKmer = par.spacedKmer.values.aminoacid();
}
PrefilteringIndexReader::createIndexFile(indexDB, &dbr, dbr2, hdbr1, hdbr2, alndbr, seedSubMat, par.maxSeqLen,
par.spacedKmer, par.spacedKmerPattern, par.compBiasCorrection,
spacedKmer, par.spacedKmerPattern, par.compBiasCorrection,
seedSubMat->alphabetSize, par.kmerSize, par.maskMode, par.maskLowerCaseMode,
par.maskProb, kmerScore, par.targetSearchMode, par.split, par.indexSubset);

Expand Down
2 changes: 1 addition & 1 deletion src/workflow/Cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <cassert>

void setWorkflowDefaults(Parameters *p) {
p->spacedKmer = true;
p->spacedKmer = MultiParam<NuclAA<int>>(NuclAA<int>(1, 0));
p->covThr = 0.8;
p->evalThr = 0.001;
p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
Expand Down
2 changes: 1 addition & 1 deletion src/workflow/EasyCluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


void setEasyClusterDefaults(Parameters *p) {
p->spacedKmer = true;
p->spacedKmer = MultiParam<NuclAA<int>>(NuclAA<int>(1, 0));
p->removeTmpFiles = true;
p->covThr = 0.8;
p->evalThr = 0.001;
Expand Down
2 changes: 1 addition & 1 deletion src/workflow/EasyLinclust.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ namespace linclust {
}

void setEasyLinclustDefaults(Parameters *p) {
p->spacedKmer = false;
p->spacedKmer = MultiParam<NuclAA<int>>(NuclAA<int>(0, 0));
p->removeTmpFiles = true;
p->covThr = 0.8;
p->evalThr = 0.001;
Expand Down
2 changes: 1 addition & 1 deletion src/workflow/Linclust.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include <cassert>

void setLinclustWorkflowDefaults(Parameters *p) {
p->spacedKmer = false;
p->spacedKmer = MultiParam<NuclAA<int>>(NuclAA<int>(0, 0));
p->covThr = 0.8;
p->maskMode = 0;
p->evalThr = 0.001;
Expand Down
2 changes: 1 addition & 1 deletion src/workflow/Linsearch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace Linsearch {
#include <cassert>

void setLinsearchDefaults(Parameters *p) {
p->spacedKmer = false;
p->spacedKmer = MultiParam<NuclAA<int>>(NuclAA<int>(0, 0));
p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV;
p->sensitivity = 5.7;
p->evalThr = 0.001;
Expand Down
2 changes: 1 addition & 1 deletion src/workflow/Search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@


void setSearchDefaults(Parameters *p) {
p->spacedKmer = true;
p->spacedKmer = MultiParam<NuclAA<int>>(NuclAA<int>(1, 0));
p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV;
p->sensitivity = 5.7;
p->evalThr = 0.001;
Expand Down
2 changes: 1 addition & 1 deletion src/workflow/Taxonomy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
extern int computeSearchMode(int queryDbType, int targetDbType, int targetSrcDbType, int searchType);

void setTaxonomyDefaults(Parameters *p) {
p->spacedKmer = true;
p->spacedKmer = MultiParam<NuclAA<int>>(NuclAA<int>(1, 0));
p->sensitivity = 2;
p->evalThr = 1;
p->maxAccept = 30;
Expand Down
Loading