From 68991706c91dceeabc864ef5ebab66e0bb94862f Mon Sep 17 00:00:00 2001 From: Kim Soo Hyun Date: Thu, 18 Jul 2024 15:45:10 +0900 Subject: [PATCH 1/2] Allow passing different values for nucleotide and aminoacid for spacedKmer --- src/commons/Parameters.cpp | 5 +++-- src/commons/Parameters.h | 2 +- src/linclust/LinsearchIndexReader.cpp | 8 +++++++- src/linclust/kmerindexdb.cpp | 7 ++++++- src/linclust/kmermatcher.cpp | 11 ++++++++--- src/linclust/kmersearch.cpp | 14 ++++++++++---- src/prefiltering/Prefiltering.cpp | 8 ++++++-- src/util/alignbykmer.cpp | 10 ++++++++-- src/util/countkmer.cpp | 9 +++++++-- src/util/indexdb.cpp | 16 ++++++++++++++-- src/workflow/Cluster.cpp | 2 +- src/workflow/EasyCluster.cpp | 2 +- src/workflow/Search.cpp | 2 +- src/workflow/Taxonomy.cpp | 2 +- 14 files changed, 74 insertions(+), 24 deletions(-) diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index fc289a8e0..2e337da07 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -35,6 +35,7 @@ Parameters::Parameters(): scoringMatrixFile(NuclAA("INVALID", "INVALID")), seedScoringMatrixFile(NuclAA("INVALID", "INVALID")), alphabetSize(NuclAA(INT_MAX,INT_MAX)), + spacedKmer(NuclAA(1,0)), PARAM_S(PARAM_S_ID, "-s", "Sensitivity", "Sensitivity: 1.0 faster; 4.0 fast; 7.5 sensitive", typeid(float), (void *) &sensitivity, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PREFILTER), PARAM_K(PARAM_K_ID, "-k", "k-mer length", "k-mer length (0: automatically set to optimum)", typeid(int), (void *) &kmerSize, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_TARGET_SEARCH_MODE(PARAM_TARGET_SEARCH_MODE_ID, "--target-search-mode", "Target search mode", "target search mode (0: regular k-mer, 1: similar k-mer)", typeid(int), (void *) &targetSearchMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), @@ -60,7 +61,7 @@ Parameters::Parameters(): PARAM_NO_COMP_BIAS_CORR(PARAM_NO_COMP_BIAS_CORR_ID, "--comp-bias-corr", "Compositional bias", "Correct for locally biased amino acid composition (range 0-1)", typeid(int), (void *) &compBiasCorrection, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), PARAM_NO_COMP_BIAS_CORR_SCALE(PARAM_NO_COMP_BIAS_CORR_SCALE_ID, "--comp-bias-corr-scale", "Compositional bias", "Correct for locally biased amino acid composition (range 0-1)", typeid(float), (void *) &compBiasCorrectionScale, "^0(\\.[0-9]+)?|^1(\\.0+)?$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), - PARAM_SPACED_KMER_MODE(PARAM_SPACED_KMER_MODE_ID, "--spaced-kmer-mode", "Spaced k-mers", "0: use consecutive positions in k-mers; 1: use spaced k-mers", typeid(int), (void *) &spacedKmer, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), + PARAM_SPACED_KMER_MODE(PARAM_SPACED_KMER_MODE_ID, "--spaced-kmer-mode", "Spaced k-mers", "0: use consecutive positions in k-mers; 1: use spaced k-mers", typeid(MultiParam>), (void *) &spacedKmer, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), PARAM_REMOVE_TMP_FILES(PARAM_REMOVE_TMP_FILES_ID, "--remove-tmp-files", "Remove temporary files", "Delete temporary files", typeid(bool), (void *) &removeTmpFiles, "", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT), PARAM_INCLUDE_IDENTITY(PARAM_INCLUDE_IDENTITY_ID, "--add-self-matches", "Include identical seq. id.", "Artificially add entries of queries with themselves (for clustering)", typeid(bool), (void *) &includeIdentity, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), PARAM_PRELOAD_MODE(PARAM_PRELOAD_MODE_ID, "--db-load-mode", "Preload mode", "Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch", typeid(int), (void *) &preloadMode, "[0-3]{1}", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT), @@ -2299,7 +2300,7 @@ void Parameters::setDefaults() { maskProb = 0.9; maskLowerCaseMode = 0; minDiagScoreThr = 15; - spacedKmer = true; + spacedKmer = MultiParam>(NuclAA(1,0));; includeIdentity = false; alignmentMode = ALIGNMENT_MODE_FAST_AUTO; alignmentOutputMode = ALIGNMENT_OUTPUT_ALIGNMENT; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 6db7ddbfc..194c2e6bf 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -407,7 +407,7 @@ class Parameters { int maskLowerCaseMode; // mask lowercase letters in prefilter and kmermatchers int minDiagScoreThr; // min diagonal score - int spacedKmer; // Spaced Kmers + MultiParam> spacedKmer; // Spaced Kmers int split; // Split database in n equal chunks int splitMode; // Split by query or target DB size_t splitMemoryLimit; // Maximum memory in bytes a split can use diff --git a/src/linclust/LinsearchIndexReader.cpp b/src/linclust/LinsearchIndexReader.cpp index 8f303c87e..1f8423704 100644 --- a/src/linclust/LinsearchIndexReader.cpp +++ b/src/linclust/LinsearchIndexReader.cpp @@ -256,6 +256,12 @@ void LinsearchIndexReader::writeKmerIndexToDisk(std::string fileName, KmerPositi std::string LinsearchIndexReader::findIncompatibleParameter(DBReader & index, Parameters &par, int dbtype) { PrefilteringIndexData meta = PrefilteringIndexReader::getMetadata(&index); + int spacedKmer = 0; + if (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } if (meta.maxSeqLength != static_cast(par.maxSeqLen)) return "maxSeqLen"; if (meta.seqType != dbtype) @@ -266,7 +272,7 @@ std::string LinsearchIndexReader::findIncompatibleParameter(DBReader 0)) return "maskMode"; - if (meta.spacedKmer != par.spacedKmer) + if (meta.spacedKmer != spacedKmer) return "spacedKmer"; if (BaseMatrix::unserializeName(par.seedScoringMatrixFile.values.aminoacid().c_str()) != PrefilteringIndexReader::getSubstitutionMatrixName(&index) && BaseMatrix::unserializeName(par.seedScoringMatrixFile.values.nucleotide().c_str()) != PrefilteringIndexReader::getSubstitutionMatrixName(&index)) diff --git a/src/linclust/kmerindexdb.cpp b/src/linclust/kmerindexdb.cpp index e652bdc85..aee6da5c8 100644 --- a/src/linclust/kmerindexdb.cpp +++ b/src/linclust/kmerindexdb.cpp @@ -175,7 +175,12 @@ int kmerindexdb(int argc, const char **argv, const Command &command) { Debug(Debug::INFO) << "Write META (" << PrefilteringIndexReader::META << ")\n"; const int mask = par.maskMode > 0; - const int spacedKmer = (par.spacedKmer) ? 1 : 0; + int spacedKmer = 0; + if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } const bool sameDB = (par.db1 == par.db2); const int headers1 = 1; const int headers2 = (sameDB) ? 1 : 0; diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index 3e0740c59..f88fb15ee 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -103,9 +103,14 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz #endif unsigned short * scoreDist= new unsigned short[65536]; unsigned int * hierarchicalScoreDist= new unsigned int[128]; - + int spacedKmer = 0; + if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } const int adjustedKmerSize = (par.adjustKmerLength) ? std::min( par.kmerSize+5, 23) : par.kmerSize; - Sequence seq(par.maxSeqLen, querySeqType, subMat, adjustedKmerSize, par.spacedKmer, false, true, par.spacedKmerPattern); + Sequence seq(par.maxSeqLen, querySeqType, subMat, adjustedKmerSize, spacedKmer, false, true, par.spacedKmerPattern); KmerGenerator* generator; if (TYPE == Parameters::DBTYPE_HMM_PROFILE) { generator = new KmerGenerator( par.kmerSize, subMat->alphabetSize, 150); @@ -648,7 +653,7 @@ template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCo void setLinearFilterDefault(Parameters *p) { p->covThr = 0.8; p->maskMode = 0; - p->spacedKmer = 0; + p->spacedKmer = false; p->kmerSize = Parameters::CLUST_LINEAR_DEFAULT_K; p->alphabetSize = MultiParam>(NuclAA(Parameters::CLUST_LINEAR_DEFAULT_ALPH_SIZE, 5)); p->kmersPerSequence = Parameters::CLUST_LINEAR_KMER_PER_SEQ; diff --git a/src/linclust/kmersearch.cpp b/src/linclust/kmersearch.cpp index aa4102f48..ec8c8d2c7 100644 --- a/src/linclust/kmersearch.cpp +++ b/src/linclust/kmersearch.cpp @@ -160,16 +160,22 @@ int kmersearch(int argc, const char **argv, const Command &command) { } } if(par.PARAM_SPACED_KMER_MODE.wasSet){ - if(data.spacedKmer != par.spacedKmer){ - Debug(Debug::ERROR) << "Index was created with --spaced-kmer-mode " << data.spacedKmer << " but the prefilter was called with --spaced-kmer-mode " << par.spacedKmer << "!\n"; - Debug(Debug::ERROR) << "createindex --spaced-kmer-mode " << par.spacedKmer << "\n"; + bool isSpaced = false; + if (Parameters::isEqualDbtype(data.seqType, Parameters::DBTYPE_NUCLEOTIDES)) { + isSpaced = par.spacedKmer.values.nucleotide(); + } else { + isSpaced = par.spacedKmer.values.aminoacid(); + } + if(data.spacedKmer != isSpaced){ + Debug(Debug::ERROR) << "Index was created with --spaced-kmer-mode " << data.spacedKmer << " but the prefilter was called with --spaced-kmer-mode " << isSpaced << "!\n"; + Debug(Debug::ERROR) << "createindex --spaced-kmer-mode " << isSpaced << "\n"; EXIT(EXIT_FAILURE); } } par.kmerSize = data.kmerSize; par.alphabetSize = data.alphabetSize; targetSeqType = data.seqType; - par.spacedKmer = (data.spacedKmer == 1) ? true : false; + par.spacedKmer = data.spacedKmer; par.maxSeqLen = data.maxSeqLength; // Reuse the compBiasCorr field to store the adjustedKmerSize, It is not needed in the linsearch adjustedKmerSize = data.compBiasCorr; diff --git a/src/prefiltering/Prefiltering.cpp b/src/prefiltering/Prefiltering.cpp index 9efabdb30..8911d2e90 100644 --- a/src/prefiltering/Prefiltering.cpp +++ b/src/prefiltering/Prefiltering.cpp @@ -33,7 +33,6 @@ Prefiltering::Prefiltering(const std::string &queryDB, kmerSize(par.kmerSize), spacedKmerPattern(par.spacedKmerPattern), localTmp(par.localTmp), - spacedKmer(par.spacedKmer != 0), maskMode(par.maskMode), maskLowerCaseMode(par.maskLowerCaseMode), maskProb(par.maskProb), @@ -79,6 +78,12 @@ Prefiltering::Prefiltering(const std::string &queryDB, EXIT(EXIT_FAILURE); } + if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } + if (Parameters::isEqualDbtype(FileUtil::parseDbType(targetDB.c_str()), Parameters::DBTYPE_INDEX_DB)) { if (preloadMode == Parameters::PRELOAD_MODE_AUTO) { if (sensitivity > 6.0) { @@ -135,7 +140,6 @@ Prefiltering::Prefiltering(const std::string &queryDB, kmerSize = data.kmerSize; alphabetSize = data.alphabetSize; targetSeqType = data.seqType; - spacedKmer = data.spacedKmer == 1 ? true : false; // the query database could have longer sequences than the target database, do not cut them short maxSeqLen = std::max(maxSeqLen, (size_t)data.maxSeqLength); aaBiasCorrection = data.compBiasCorr; diff --git a/src/util/alignbykmer.cpp b/src/util/alignbykmer.cpp index 596902f52..90dd0e811 100644 --- a/src/util/alignbykmer.cpp +++ b/src/util/alignbykmer.cpp @@ -151,6 +151,12 @@ int alignbykmer(int argc, const char **argv, const Command &command) { int pathScore; }; + int spacedKmer = 0; + if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } size_t totalMemory = Util::getTotalSystemMemory(); size_t flushSize = 100000000; @@ -165,8 +171,8 @@ int alignbykmer(int argc, const char **argv, const Command &command) { #pragma omp parallel { - Sequence query(par.maxSeqLen, querySeqType, subMat, par.kmerSize, par.spacedKmer, false, true, par.spacedKmerPattern); - Sequence target(par.maxSeqLen, targetSeqType, subMat, par.kmerSize, par.spacedKmer, false, true, par.spacedKmerPattern); + Sequence query(par.maxSeqLen, querySeqType, subMat, par.kmerSize, spacedKmer, false, true, par.spacedKmerPattern); + Sequence target(par.maxSeqLen, targetSeqType, subMat, par.kmerSize, spacedKmer, false, true, par.spacedKmerPattern); KmerGenerator kmerGenerator(par.kmerSize, subMat->alphabetSize, 70.0); kmerGenerator.setDivideStrategy(NULL, &_2merSubMatrix); size_t lookupSize = MathUtil::ipow(subMat->alphabetSize, par.kmerSize); diff --git a/src/util/countkmer.cpp b/src/util/countkmer.cpp index 91ab8249e..5ab3dd6f2 100644 --- a/src/util/countkmer.cpp +++ b/src/util/countkmer.cpp @@ -26,6 +26,12 @@ int countkmer(int argc, const char **argv, const Command& command) { int seqType = reader.sequenceReader->getDbtype(); BaseMatrix * subMat; size_t isNucl=Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES); + int spacedKmer = 0; + if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES)) { subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0); } else { @@ -41,8 +47,7 @@ int countkmer(int argc, const char **argv, const Command& command) { #pragma omp parallel { Indexer idx(subMat->alphabetSize-1, par.kmerSize); - Sequence s(maxLen, seqType, subMat, - par.kmerSize, par.spacedKmer, false); + Sequence s(maxLen, seqType, subMat, par.kmerSize, spacedKmer, false); #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < reader.sequenceReader->getSize(); i++) { diff --git a/src/util/indexdb.cpp b/src/util/indexdb.cpp index c9c6470a7..71fa3f8e8 100644 --- a/src/util/indexdb.cpp +++ b/src/util/indexdb.cpp @@ -15,6 +15,12 @@ void setIndexDbDefaults(Parameters *p) { std::string findIncompatibleParameter(DBReader& index, const Parameters& par, int kmerScore, const int dbtype) { PrefilteringIndexData meta = PrefilteringIndexReader::getMetadata(&index); + int spacedKmer = 0; + if (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } if (meta.compBiasCorr != par.compBiasCorrection) return "compBiasCorrection"; if (meta.maxSeqLength != static_cast(par.maxSeqLen)) @@ -29,7 +35,7 @@ std::string findIncompatibleParameter(DBReader& index, const Param return "maskMode"; if (meta.kmerThr != kmerScore) return "kmerScore"; - if (meta.spacedKmer != par.spacedKmer) + if (meta.spacedKmer != spacedKmer) return "spacedKmer"; if (BaseMatrix::unserializeName(par.seedScoringMatrixFile.values.aminoacid().c_str()) != PrefilteringIndexReader::getSubstitutionMatrixName(&index) && BaseMatrix::unserializeName(par.seedScoringMatrixFile.values.nucleotide().c_str()) != PrefilteringIndexReader::getSubstitutionMatrixName(&index)) @@ -174,8 +180,14 @@ int indexdb(int argc, const char **argv, const Command &command) { } DBReader::removeDb(indexDB); + int spacedKmer = 0; + if (db1IsNucl) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } PrefilteringIndexReader::createIndexFile(indexDB, &dbr, dbr2, hdbr1, hdbr2, alndbr, seedSubMat, par.maxSeqLen, - par.spacedKmer, par.spacedKmerPattern, par.compBiasCorrection, + spacedKmer, par.spacedKmerPattern, par.compBiasCorrection, seedSubMat->alphabetSize, par.kmerSize, par.maskMode, par.maskLowerCaseMode, par.maskProb, kmerScore, par.targetSearchMode, par.split, par.indexSubset); diff --git a/src/workflow/Cluster.cpp b/src/workflow/Cluster.cpp index 6293eb4a0..07a1d4f0b 100644 --- a/src/workflow/Cluster.cpp +++ b/src/workflow/Cluster.cpp @@ -12,7 +12,7 @@ #include void setWorkflowDefaults(Parameters *p) { - p->spacedKmer = true; + p->spacedKmer = MultiParam>(NuclAA(1, 0)); p->covThr = 0.8; p->evalThr = 0.001; p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; diff --git a/src/workflow/EasyCluster.cpp b/src/workflow/EasyCluster.cpp index aca357bfe..02b131764 100644 --- a/src/workflow/EasyCluster.cpp +++ b/src/workflow/EasyCluster.cpp @@ -10,7 +10,7 @@ void setEasyClusterDefaults(Parameters *p) { - p->spacedKmer = true; + p->spacedKmer = MultiParam>(NuclAA(1, 0)); p->removeTmpFiles = true; p->covThr = 0.8; p->evalThr = 0.001; diff --git a/src/workflow/Search.cpp b/src/workflow/Search.cpp index 62ec132e2..fe35568f7 100644 --- a/src/workflow/Search.cpp +++ b/src/workflow/Search.cpp @@ -19,7 +19,7 @@ void setSearchDefaults(Parameters *p) { - p->spacedKmer = true; + p->spacedKmer = MultiParam>(NuclAA(1, 0)); p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV; p->sensitivity = 5.7; p->evalThr = 0.001; diff --git a/src/workflow/Taxonomy.cpp b/src/workflow/Taxonomy.cpp index 4dbd34c73..5d0c8a61f 100644 --- a/src/workflow/Taxonomy.cpp +++ b/src/workflow/Taxonomy.cpp @@ -11,7 +11,7 @@ extern int computeSearchMode(int queryDbType, int targetDbType, int targetSrcDbType, int searchType); void setTaxonomyDefaults(Parameters *p) { - p->spacedKmer = true; + p->spacedKmer = MultiParam>(NuclAA(1, 0)); p->sensitivity = 2; p->evalThr = 1; p->maxAccept = 30; From 6572c418a9b475a96da8a81e7e94238ca9f2a87d Mon Sep 17 00:00:00 2001 From: Kim Soo Hyun Date: Thu, 18 Jul 2024 15:45:10 +0900 Subject: [PATCH 2/2] Allow passing different values for nucleotide and aminoacid for spacedKmer --- src/commons/Parameters.cpp | 5 +++-- src/commons/Parameters.h | 2 +- src/linclust/LinsearchIndexReader.cpp | 8 +++++++- src/linclust/kmerindexdb.cpp | 7 ++++++- src/linclust/kmermatcher.cpp | 11 ++++++++--- src/linclust/kmersearch.cpp | 14 ++++++++++---- src/prefiltering/Prefiltering.cpp | 8 ++++++-- src/util/alignbykmer.cpp | 10 ++++++++-- src/util/countkmer.cpp | 9 +++++++-- src/util/indexdb.cpp | 16 ++++++++++++++-- src/workflow/Cluster.cpp | 2 +- src/workflow/EasyCluster.cpp | 2 +- src/workflow/EasyLinclust.cpp | 2 +- src/workflow/Linclust.cpp | 2 +- src/workflow/Linsearch.cpp | 2 +- src/workflow/Search.cpp | 2 +- src/workflow/Taxonomy.cpp | 2 +- 17 files changed, 77 insertions(+), 27 deletions(-) diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index fc289a8e0..2e337da07 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -35,6 +35,7 @@ Parameters::Parameters(): scoringMatrixFile(NuclAA("INVALID", "INVALID")), seedScoringMatrixFile(NuclAA("INVALID", "INVALID")), alphabetSize(NuclAA(INT_MAX,INT_MAX)), + spacedKmer(NuclAA(1,0)), PARAM_S(PARAM_S_ID, "-s", "Sensitivity", "Sensitivity: 1.0 faster; 4.0 fast; 7.5 sensitive", typeid(float), (void *) &sensitivity, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PREFILTER), PARAM_K(PARAM_K_ID, "-k", "k-mer length", "k-mer length (0: automatically set to optimum)", typeid(int), (void *) &kmerSize, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), PARAM_TARGET_SEARCH_MODE(PARAM_TARGET_SEARCH_MODE_ID, "--target-search-mode", "Target search mode", "target search mode (0: regular k-mer, 1: similar k-mer)", typeid(int), (void *) &targetSearchMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT), @@ -60,7 +61,7 @@ Parameters::Parameters(): PARAM_NO_COMP_BIAS_CORR(PARAM_NO_COMP_BIAS_CORR_ID, "--comp-bias-corr", "Compositional bias", "Correct for locally biased amino acid composition (range 0-1)", typeid(int), (void *) &compBiasCorrection, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), PARAM_NO_COMP_BIAS_CORR_SCALE(PARAM_NO_COMP_BIAS_CORR_SCALE_ID, "--comp-bias-corr-scale", "Compositional bias", "Correct for locally biased amino acid composition (range 0-1)", typeid(float), (void *) &compBiasCorrectionScale, "^0(\\.[0-9]+)?|^1(\\.0+)?$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), - PARAM_SPACED_KMER_MODE(PARAM_SPACED_KMER_MODE_ID, "--spaced-kmer-mode", "Spaced k-mers", "0: use consecutive positions in k-mers; 1: use spaced k-mers", typeid(int), (void *) &spacedKmer, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), + PARAM_SPACED_KMER_MODE(PARAM_SPACED_KMER_MODE_ID, "--spaced-kmer-mode", "Spaced k-mers", "0: use consecutive positions in k-mers; 1: use spaced k-mers", typeid(MultiParam>), (void *) &spacedKmer, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT), PARAM_REMOVE_TMP_FILES(PARAM_REMOVE_TMP_FILES_ID, "--remove-tmp-files", "Remove temporary files", "Delete temporary files", typeid(bool), (void *) &removeTmpFiles, "", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT), PARAM_INCLUDE_IDENTITY(PARAM_INCLUDE_IDENTITY_ID, "--add-self-matches", "Include identical seq. id.", "Artificially add entries of queries with themselves (for clustering)", typeid(bool), (void *) &includeIdentity, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), PARAM_PRELOAD_MODE(PARAM_PRELOAD_MODE_ID, "--db-load-mode", "Preload mode", "Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch", typeid(int), (void *) &preloadMode, "[0-3]{1}", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT), @@ -2299,7 +2300,7 @@ void Parameters::setDefaults() { maskProb = 0.9; maskLowerCaseMode = 0; minDiagScoreThr = 15; - spacedKmer = true; + spacedKmer = MultiParam>(NuclAA(1,0));; includeIdentity = false; alignmentMode = ALIGNMENT_MODE_FAST_AUTO; alignmentOutputMode = ALIGNMENT_OUTPUT_ALIGNMENT; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 6db7ddbfc..194c2e6bf 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -407,7 +407,7 @@ class Parameters { int maskLowerCaseMode; // mask lowercase letters in prefilter and kmermatchers int minDiagScoreThr; // min diagonal score - int spacedKmer; // Spaced Kmers + MultiParam> spacedKmer; // Spaced Kmers int split; // Split database in n equal chunks int splitMode; // Split by query or target DB size_t splitMemoryLimit; // Maximum memory in bytes a split can use diff --git a/src/linclust/LinsearchIndexReader.cpp b/src/linclust/LinsearchIndexReader.cpp index 8f303c87e..1f8423704 100644 --- a/src/linclust/LinsearchIndexReader.cpp +++ b/src/linclust/LinsearchIndexReader.cpp @@ -256,6 +256,12 @@ void LinsearchIndexReader::writeKmerIndexToDisk(std::string fileName, KmerPositi std::string LinsearchIndexReader::findIncompatibleParameter(DBReader & index, Parameters &par, int dbtype) { PrefilteringIndexData meta = PrefilteringIndexReader::getMetadata(&index); + int spacedKmer = 0; + if (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } if (meta.maxSeqLength != static_cast(par.maxSeqLen)) return "maxSeqLen"; if (meta.seqType != dbtype) @@ -266,7 +272,7 @@ std::string LinsearchIndexReader::findIncompatibleParameter(DBReader 0)) return "maskMode"; - if (meta.spacedKmer != par.spacedKmer) + if (meta.spacedKmer != spacedKmer) return "spacedKmer"; if (BaseMatrix::unserializeName(par.seedScoringMatrixFile.values.aminoacid().c_str()) != PrefilteringIndexReader::getSubstitutionMatrixName(&index) && BaseMatrix::unserializeName(par.seedScoringMatrixFile.values.nucleotide().c_str()) != PrefilteringIndexReader::getSubstitutionMatrixName(&index)) diff --git a/src/linclust/kmerindexdb.cpp b/src/linclust/kmerindexdb.cpp index e652bdc85..aee6da5c8 100644 --- a/src/linclust/kmerindexdb.cpp +++ b/src/linclust/kmerindexdb.cpp @@ -175,7 +175,12 @@ int kmerindexdb(int argc, const char **argv, const Command &command) { Debug(Debug::INFO) << "Write META (" << PrefilteringIndexReader::META << ")\n"; const int mask = par.maskMode > 0; - const int spacedKmer = (par.spacedKmer) ? 1 : 0; + int spacedKmer = 0; + if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } const bool sameDB = (par.db1 == par.db2); const int headers1 = 1; const int headers2 = (sameDB) ? 1 : 0; diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index 3e0740c59..ea762a0b6 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -103,9 +103,14 @@ std::pair fillKmerPositionArray(KmerPosition * kmerArray, siz #endif unsigned short * scoreDist= new unsigned short[65536]; unsigned int * hierarchicalScoreDist= new unsigned int[128]; - + int spacedKmer = 0; + if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } const int adjustedKmerSize = (par.adjustKmerLength) ? std::min( par.kmerSize+5, 23) : par.kmerSize; - Sequence seq(par.maxSeqLen, querySeqType, subMat, adjustedKmerSize, par.spacedKmer, false, true, par.spacedKmerPattern); + Sequence seq(par.maxSeqLen, querySeqType, subMat, adjustedKmerSize, spacedKmer, false, true, par.spacedKmerPattern); KmerGenerator* generator; if (TYPE == Parameters::DBTYPE_HMM_PROFILE) { generator = new KmerGenerator( par.kmerSize, subMat->alphabetSize, 150); @@ -648,7 +653,7 @@ template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCo void setLinearFilterDefault(Parameters *p) { p->covThr = 0.8; p->maskMode = 0; - p->spacedKmer = 0; + p->spacedKmer = MultiParam>(NuclAA(0, 0)); p->kmerSize = Parameters::CLUST_LINEAR_DEFAULT_K; p->alphabetSize = MultiParam>(NuclAA(Parameters::CLUST_LINEAR_DEFAULT_ALPH_SIZE, 5)); p->kmersPerSequence = Parameters::CLUST_LINEAR_KMER_PER_SEQ; diff --git a/src/linclust/kmersearch.cpp b/src/linclust/kmersearch.cpp index aa4102f48..ec8c8d2c7 100644 --- a/src/linclust/kmersearch.cpp +++ b/src/linclust/kmersearch.cpp @@ -160,16 +160,22 @@ int kmersearch(int argc, const char **argv, const Command &command) { } } if(par.PARAM_SPACED_KMER_MODE.wasSet){ - if(data.spacedKmer != par.spacedKmer){ - Debug(Debug::ERROR) << "Index was created with --spaced-kmer-mode " << data.spacedKmer << " but the prefilter was called with --spaced-kmer-mode " << par.spacedKmer << "!\n"; - Debug(Debug::ERROR) << "createindex --spaced-kmer-mode " << par.spacedKmer << "\n"; + bool isSpaced = false; + if (Parameters::isEqualDbtype(data.seqType, Parameters::DBTYPE_NUCLEOTIDES)) { + isSpaced = par.spacedKmer.values.nucleotide(); + } else { + isSpaced = par.spacedKmer.values.aminoacid(); + } + if(data.spacedKmer != isSpaced){ + Debug(Debug::ERROR) << "Index was created with --spaced-kmer-mode " << data.spacedKmer << " but the prefilter was called with --spaced-kmer-mode " << isSpaced << "!\n"; + Debug(Debug::ERROR) << "createindex --spaced-kmer-mode " << isSpaced << "\n"; EXIT(EXIT_FAILURE); } } par.kmerSize = data.kmerSize; par.alphabetSize = data.alphabetSize; targetSeqType = data.seqType; - par.spacedKmer = (data.spacedKmer == 1) ? true : false; + par.spacedKmer = data.spacedKmer; par.maxSeqLen = data.maxSeqLength; // Reuse the compBiasCorr field to store the adjustedKmerSize, It is not needed in the linsearch adjustedKmerSize = data.compBiasCorr; diff --git a/src/prefiltering/Prefiltering.cpp b/src/prefiltering/Prefiltering.cpp index 9efabdb30..8911d2e90 100644 --- a/src/prefiltering/Prefiltering.cpp +++ b/src/prefiltering/Prefiltering.cpp @@ -33,7 +33,6 @@ Prefiltering::Prefiltering(const std::string &queryDB, kmerSize(par.kmerSize), spacedKmerPattern(par.spacedKmerPattern), localTmp(par.localTmp), - spacedKmer(par.spacedKmer != 0), maskMode(par.maskMode), maskLowerCaseMode(par.maskLowerCaseMode), maskProb(par.maskProb), @@ -79,6 +78,12 @@ Prefiltering::Prefiltering(const std::string &queryDB, EXIT(EXIT_FAILURE); } + if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } + if (Parameters::isEqualDbtype(FileUtil::parseDbType(targetDB.c_str()), Parameters::DBTYPE_INDEX_DB)) { if (preloadMode == Parameters::PRELOAD_MODE_AUTO) { if (sensitivity > 6.0) { @@ -135,7 +140,6 @@ Prefiltering::Prefiltering(const std::string &queryDB, kmerSize = data.kmerSize; alphabetSize = data.alphabetSize; targetSeqType = data.seqType; - spacedKmer = data.spacedKmer == 1 ? true : false; // the query database could have longer sequences than the target database, do not cut them short maxSeqLen = std::max(maxSeqLen, (size_t)data.maxSeqLength); aaBiasCorrection = data.compBiasCorr; diff --git a/src/util/alignbykmer.cpp b/src/util/alignbykmer.cpp index 596902f52..90dd0e811 100644 --- a/src/util/alignbykmer.cpp +++ b/src/util/alignbykmer.cpp @@ -151,6 +151,12 @@ int alignbykmer(int argc, const char **argv, const Command &command) { int pathScore; }; + int spacedKmer = 0; + if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } size_t totalMemory = Util::getTotalSystemMemory(); size_t flushSize = 100000000; @@ -165,8 +171,8 @@ int alignbykmer(int argc, const char **argv, const Command &command) { #pragma omp parallel { - Sequence query(par.maxSeqLen, querySeqType, subMat, par.kmerSize, par.spacedKmer, false, true, par.spacedKmerPattern); - Sequence target(par.maxSeqLen, targetSeqType, subMat, par.kmerSize, par.spacedKmer, false, true, par.spacedKmerPattern); + Sequence query(par.maxSeqLen, querySeqType, subMat, par.kmerSize, spacedKmer, false, true, par.spacedKmerPattern); + Sequence target(par.maxSeqLen, targetSeqType, subMat, par.kmerSize, spacedKmer, false, true, par.spacedKmerPattern); KmerGenerator kmerGenerator(par.kmerSize, subMat->alphabetSize, 70.0); kmerGenerator.setDivideStrategy(NULL, &_2merSubMatrix); size_t lookupSize = MathUtil::ipow(subMat->alphabetSize, par.kmerSize); diff --git a/src/util/countkmer.cpp b/src/util/countkmer.cpp index 91ab8249e..5ab3dd6f2 100644 --- a/src/util/countkmer.cpp +++ b/src/util/countkmer.cpp @@ -26,6 +26,12 @@ int countkmer(int argc, const char **argv, const Command& command) { int seqType = reader.sequenceReader->getDbtype(); BaseMatrix * subMat; size_t isNucl=Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES); + int spacedKmer = 0; + if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES)) { subMat = new NucleotideMatrix(par.scoringMatrixFile.values.nucleotide().c_str(), 1.0, 0.0); } else { @@ -41,8 +47,7 @@ int countkmer(int argc, const char **argv, const Command& command) { #pragma omp parallel { Indexer idx(subMat->alphabetSize-1, par.kmerSize); - Sequence s(maxLen, seqType, subMat, - par.kmerSize, par.spacedKmer, false); + Sequence s(maxLen, seqType, subMat, par.kmerSize, spacedKmer, false); #pragma omp for schedule(dynamic, 1) for (size_t i = 0; i < reader.sequenceReader->getSize(); i++) { diff --git a/src/util/indexdb.cpp b/src/util/indexdb.cpp index c9c6470a7..71fa3f8e8 100644 --- a/src/util/indexdb.cpp +++ b/src/util/indexdb.cpp @@ -15,6 +15,12 @@ void setIndexDbDefaults(Parameters *p) { std::string findIncompatibleParameter(DBReader& index, const Parameters& par, int kmerScore, const int dbtype) { PrefilteringIndexData meta = PrefilteringIndexReader::getMetadata(&index); + int spacedKmer = 0; + if (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_NUCLEOTIDES)) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } if (meta.compBiasCorr != par.compBiasCorrection) return "compBiasCorrection"; if (meta.maxSeqLength != static_cast(par.maxSeqLen)) @@ -29,7 +35,7 @@ std::string findIncompatibleParameter(DBReader& index, const Param return "maskMode"; if (meta.kmerThr != kmerScore) return "kmerScore"; - if (meta.spacedKmer != par.spacedKmer) + if (meta.spacedKmer != spacedKmer) return "spacedKmer"; if (BaseMatrix::unserializeName(par.seedScoringMatrixFile.values.aminoacid().c_str()) != PrefilteringIndexReader::getSubstitutionMatrixName(&index) && BaseMatrix::unserializeName(par.seedScoringMatrixFile.values.nucleotide().c_str()) != PrefilteringIndexReader::getSubstitutionMatrixName(&index)) @@ -174,8 +180,14 @@ int indexdb(int argc, const char **argv, const Command &command) { } DBReader::removeDb(indexDB); + int spacedKmer = 0; + if (db1IsNucl) { + spacedKmer = par.spacedKmer.values.nucleotide(); + } else { + spacedKmer = par.spacedKmer.values.aminoacid(); + } PrefilteringIndexReader::createIndexFile(indexDB, &dbr, dbr2, hdbr1, hdbr2, alndbr, seedSubMat, par.maxSeqLen, - par.spacedKmer, par.spacedKmerPattern, par.compBiasCorrection, + spacedKmer, par.spacedKmerPattern, par.compBiasCorrection, seedSubMat->alphabetSize, par.kmerSize, par.maskMode, par.maskLowerCaseMode, par.maskProb, kmerScore, par.targetSearchMode, par.split, par.indexSubset); diff --git a/src/workflow/Cluster.cpp b/src/workflow/Cluster.cpp index 6293eb4a0..07a1d4f0b 100644 --- a/src/workflow/Cluster.cpp +++ b/src/workflow/Cluster.cpp @@ -12,7 +12,7 @@ #include void setWorkflowDefaults(Parameters *p) { - p->spacedKmer = true; + p->spacedKmer = MultiParam>(NuclAA(1, 0)); p->covThr = 0.8; p->evalThr = 0.001; p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; diff --git a/src/workflow/EasyCluster.cpp b/src/workflow/EasyCluster.cpp index aca357bfe..02b131764 100644 --- a/src/workflow/EasyCluster.cpp +++ b/src/workflow/EasyCluster.cpp @@ -10,7 +10,7 @@ void setEasyClusterDefaults(Parameters *p) { - p->spacedKmer = true; + p->spacedKmer = MultiParam>(NuclAA(1, 0)); p->removeTmpFiles = true; p->covThr = 0.8; p->evalThr = 0.001; diff --git a/src/workflow/EasyLinclust.cpp b/src/workflow/EasyLinclust.cpp index 375ff1610..eddc281bd 100644 --- a/src/workflow/EasyLinclust.cpp +++ b/src/workflow/EasyLinclust.cpp @@ -11,7 +11,7 @@ namespace linclust { } void setEasyLinclustDefaults(Parameters *p) { - p->spacedKmer = false; + p->spacedKmer = MultiParam>(NuclAA(0, 0)); p->removeTmpFiles = true; p->covThr = 0.8; p->evalThr = 0.001; diff --git a/src/workflow/Linclust.cpp b/src/workflow/Linclust.cpp index 5a5dedce1..68010d780 100644 --- a/src/workflow/Linclust.cpp +++ b/src/workflow/Linclust.cpp @@ -10,7 +10,7 @@ #include void setLinclustWorkflowDefaults(Parameters *p) { - p->spacedKmer = false; + p->spacedKmer = MultiParam>(NuclAA(0, 0)); p->covThr = 0.8; p->maskMode = 0; p->evalThr = 0.001; diff --git a/src/workflow/Linsearch.cpp b/src/workflow/Linsearch.cpp index 5e0fdd991..674bf9be6 100644 --- a/src/workflow/Linsearch.cpp +++ b/src/workflow/Linsearch.cpp @@ -16,7 +16,7 @@ namespace Linsearch { #include void setLinsearchDefaults(Parameters *p) { - p->spacedKmer = false; + p->spacedKmer = MultiParam>(NuclAA(0, 0)); p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV; p->sensitivity = 5.7; p->evalThr = 0.001; diff --git a/src/workflow/Search.cpp b/src/workflow/Search.cpp index 62ec132e2..fe35568f7 100644 --- a/src/workflow/Search.cpp +++ b/src/workflow/Search.cpp @@ -19,7 +19,7 @@ void setSearchDefaults(Parameters *p) { - p->spacedKmer = true; + p->spacedKmer = MultiParam>(NuclAA(1, 0)); p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV; p->sensitivity = 5.7; p->evalThr = 0.001; diff --git a/src/workflow/Taxonomy.cpp b/src/workflow/Taxonomy.cpp index 4dbd34c73..5d0c8a61f 100644 --- a/src/workflow/Taxonomy.cpp +++ b/src/workflow/Taxonomy.cpp @@ -11,7 +11,7 @@ extern int computeSearchMode(int queryDbType, int targetDbType, int targetSrcDbType, int searchType); void setTaxonomyDefaults(Parameters *p) { - p->spacedKmer = true; + p->spacedKmer = MultiParam>(NuclAA(1, 0)); p->sensitivity = 2; p->evalThr = 1; p->maxAccept = 30;