From db7a51264bcdada5b7b115698ee950612426fd4d Mon Sep 17 00:00:00 2001 From: Clint Valentine Date: Sun, 28 Jul 2024 11:27:59 -0400 Subject: [PATCH] feat: honor original algorithm, increase test coverage (#7) --- .github/workflows/unit-tests.yml | 18 +++-- .../cvbio/neodisambiguate/Disambiguate.scala | 4 +- .../io/cvbio/neodisambiguate/MathUtil.scala | 10 --- .../neodisambiguate/TemplateOrdering.scala | 15 ++-- .../io/cvbio/neodisambiguate/bam/Bams.scala | 1 - .../neodisambiguate/metric/MetricPair.scala | 6 ++ .../neodisambiguate/DisambiguateTest.scala | 75 ++++++++++++++++++- .../cvbio/neodisambiguate/bam/BamsTest.scala | 6 ++ 8 files changed, 111 insertions(+), 24 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 182cabb..35926f7 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -8,14 +8,15 @@ jobs: strategy: matrix: suite: [ neodisambiguate ] - java-version: [ 1.8, 11, 17, 21 ] + java-version: [ 8, 11, 17, 21 ] steps: - - uses: actions/checkout@v2 - - uses: actions/setup-java@v1 + - uses: actions/checkout@v4 + - uses: actions/setup-java@v4 with: + distribution: 'temurin' java-version: ${{ matrix.java-version }} - name: Cache for Scala Dependencies - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: | ~/.mill/download @@ -33,5 +34,10 @@ jobs: - name: Create Code Coverage Report if: matrix.java-version == '11' run: | - ./mill --no-server --disable-ticker ${{ matrix.suite }}.scoverage.xmlReport - # TODO: build an HTML report and upload as an artifact + ./mill --no-server --disable-ticker ${{ matrix.suite }}.scoverage.htmlReport + - name: Upload Code Coverage Report + uses: actions/upload-artifact@v4 + if: matrix.java-version == '11' + with: + name: code-coverage + path: out/neodisambiguate/scoverage/htmlReport.dest/ diff --git a/neodisambiguate/src/io/cvbio/neodisambiguate/Disambiguate.scala b/neodisambiguate/src/io/cvbio/neodisambiguate/Disambiguate.scala index f350f1f..f7aae82 100644 --- a/neodisambiguate/src/io/cvbio/neodisambiguate/Disambiguate.scala +++ b/neodisambiguate/src/io/cvbio/neodisambiguate/Disambiguate.scala @@ -87,7 +87,7 @@ object Disambiguate { } /** Command line configuration. */ - private class NeodisambiguateConf(args: Seq[String]) extends ScallopConf(args) { + private[neodisambiguate] class NeodisambiguateConf(args: Seq[String]) extends ScallopConf(args) { private val packageName: String = Option(this.getClass.getPackage.getImplementationTitle).getOrElse("neodisambiguate") private val version: String = Option(this.getClass.getPackage.getImplementationVersion).getOrElse("UNKNOWN") version(s"$packageName $version\n") @@ -168,10 +168,12 @@ object Disambiguate { IOUtil.setCompressionLevel(conf.compression()) Io.compressionLevel = conf.compression() + // $COVERAGE-OFF$ if (IntelCompressionLibrarySupported) { BlockCompressedOutputStream.setDefaultDeflaterFactory(new IntelDeflaterFactory) BlockGunzipper.setDefaultInflaterFactory(new IntelInflaterFactory) } + // $COVERAGE-ON$ Io.tmpDir = conf.tmpDir() System.setProperty("java.io.tmpdir", conf.tmpDir().toAbsolutePath.toString) diff --git a/neodisambiguate/src/io/cvbio/neodisambiguate/MathUtil.scala b/neodisambiguate/src/io/cvbio/neodisambiguate/MathUtil.scala index f6daa75..09097eb 100644 --- a/neodisambiguate/src/io/cvbio/neodisambiguate/MathUtil.scala +++ b/neodisambiguate/src/io/cvbio/neodisambiguate/MathUtil.scala @@ -13,16 +13,6 @@ object MathUtil { def countMinBy[B](fn: A => B)(implicit cmp: Ordering[B]): Int = MathUtil.countMin(self.map(fn)) } - /** Implicits for optionally finding the maximum and minimum items in a collection. */ - implicit class WithMaxMinOption[A](self: IterableOnce[A]) { - - /** Optionally return the maximum item from a container. */ - def maxOption(implicit cmp: Ordering[A]): Option[A] = if (self.iterator.isEmpty) None else Some(self.iterator.max) - - /** Optionally return the minimum item from a container. */ - def minOption(implicit cmp: Ordering[A]): Option[A] = if (self.iterator.isEmpty) None else Some(self.iterator.min) - } - /** Implicits picking the single maximum or single minimum item in a collection. */ implicit class WithPickMaxMinBy[A](self: Seq[A]) { diff --git a/neodisambiguate/src/io/cvbio/neodisambiguate/TemplateOrdering.scala b/neodisambiguate/src/io/cvbio/neodisambiguate/TemplateOrdering.scala index c532a0e..a6dce14 100644 --- a/neodisambiguate/src/io/cvbio/neodisambiguate/TemplateOrdering.scala +++ b/neodisambiguate/src/io/cvbio/neodisambiguate/TemplateOrdering.scala @@ -27,14 +27,19 @@ object TemplateOrdering extends FgBioEnum[TemplateOrdering] { * If neither template is clearly better, then the templates are equivalent. */ case object ClassicOrdering extends TemplateOrdering { + // $COVERAGE-OFF$ + private def bestAlignmentScore(template: Template): MetricPair[Int] = MetricPair[Int](template, AS)(_ max _) + private def worstAlignmentScore(template: Template): MetricPair[Int] = MetricPair[Int](template, AS)(_ min _) + private def bestNumMismatches(template: Template): MetricPair[Int] = MetricPair[Int](template, NM)(_ min _) + private def worstNumMismatches(template: Template): MetricPair[Int] = MetricPair[Int](template, NM)(_ max _) + // $COVERAGE-ON$ /** Compare two templates using the original published algorithm. */ override def compare(x: Template, y: Template): Int = { - val alignmentScore = (template: Template) => MetricPair[Int](template, AS.toString)(_ max _) - val numMismatches = (template: Template) => MetricPair[Int](template, NM.toString)(_ min _) - - var compare = alignmentScore(x).compare(alignmentScore(y)) - if (compare == 0) compare = -numMismatches(x).compare(numMismatches(y)) // Negate because less is better. + var compare = bestAlignmentScore(x).compare(bestAlignmentScore(y)) + if (compare == 0) worstAlignmentScore(x).compare(worstAlignmentScore(y)) + if (compare == 0) compare = -bestNumMismatches(x).compare(bestNumMismatches(y)) // Negate because less is better. + if (compare == 0) compare = -worstNumMismatches(x).compare(worstNumMismatches(y)) // Negate because less is better. compare } } diff --git a/neodisambiguate/src/io/cvbio/neodisambiguate/bam/Bams.scala b/neodisambiguate/src/io/cvbio/neodisambiguate/bam/Bams.scala index 04849a0..2a4d1a4 100644 --- a/neodisambiguate/src/io/cvbio/neodisambiguate/bam/Bams.scala +++ b/neodisambiguate/src/io/cvbio/neodisambiguate/bam/Bams.scala @@ -64,7 +64,6 @@ object Bams { /** Advance to the next sequence of templates. */ override def next(): Seq[Template] = { - require(hasNext, "next() called on empty iterator") val templates = iterators.map(_.next()) require( templates.map(_.name).distinct.length <= 1, diff --git a/neodisambiguate/src/io/cvbio/neodisambiguate/metric/MetricPair.scala b/neodisambiguate/src/io/cvbio/neodisambiguate/metric/MetricPair.scala index 4383c48..27ed199 100644 --- a/neodisambiguate/src/io/cvbio/neodisambiguate/metric/MetricPair.scala +++ b/neodisambiguate/src/io/cvbio/neodisambiguate/metric/MetricPair.scala @@ -1,6 +1,7 @@ package io.cvbio.neodisambiguate.metric import com.fulcrumgenomics.bam.Template +import htsjdk.samtools.SAMTag import io.cvbio.neodisambiguate.CommonsDef.SamTag import io.cvbio.neodisambiguate.bam.Bams.ReadOrdinal.{Read1, Read2} import io.cvbio.neodisambiguate.bam.Bams.TemplateUtil @@ -33,6 +34,11 @@ object MetricPair { ) } + /** Build a [[MetricPair]] from a [[Template]]. A function is required to reduce the tag values to one canonical value. */ + def apply[T](template: Template, tag: SAMTag)(fn: (T, T) => T)(implicit cmp: Ordering[T]): MetricPair[T] = { + apply(template = template, tag = tag.toString)(fn = fn)(cmp = cmp) + } + /** Build an empty [[MetricPair]]. */ def empty[T](implicit cmp: Ordering[T]): MetricPair[T] = new MetricPair[T](read1 = None, read2 = None) } diff --git a/neodisambiguate/test/src/io/cvbio/neodisambiguate/DisambiguateTest.scala b/neodisambiguate/test/src/io/cvbio/neodisambiguate/DisambiguateTest.scala index 1f1085f..24a4426 100644 --- a/neodisambiguate/test/src/io/cvbio/neodisambiguate/DisambiguateTest.scala +++ b/neodisambiguate/test/src/io/cvbio/neodisambiguate/DisambiguateTest.scala @@ -9,6 +9,7 @@ import htsjdk.samtools.SAMSequenceRecord import htsjdk.samtools.SAMTag.{AS, NM} import io.cvbio.neodisambiguate.CommonsDef.BamExtension import io.cvbio.neodisambiguate.DisambiguationStrategy.Classic +import io.cvbio.neodisambiguate.Disambiguate.NeodisambiguateConf import io.cvbio.neodisambiguate.testing.{TemplateBuilder, UnitSpec} class DisambiguateTest extends UnitSpec { @@ -113,7 +114,7 @@ class DisambiguateTest extends UnitSpec { val input = builder.toTempFile(deleteOnExit = true) val prefix = PathUtil.pathTo(dir.toString, more = "insilico") - val disambiguate = new Disambiguate(input = Seq(input), prefix = prefix) + val disambiguate = new Disambiguate(input = Seq(input), prefix = prefix, referenceNames = Seq("hg38")) disambiguate.execute() val source = SamSource(PathUtil.pathTo(Seq(prefix, assembly).mkString(".") + BamExtension)) @@ -152,4 +153,76 @@ class DisambiguateTest extends UnitSpec { Io.assertWritableDirectory(PathUtil.pathTo(dir.toString, more = "ambiguous-alignments")) // will raise exception if non-existent. } + + it should "write out ambiguous templates to ambiguous-specific BAM files" in { + val humanAssembly: String = "hg38" + val mouseAssembly: String = "mm10" + val dir: DirPath = Io.makeTempDir("disambiguate") + //dir.toFile.deleteOnExit() + + val humanBuilder = new SamBuilder(sort = Some(SamOrder.Queryname)) + val mouseBuilder = new SamBuilder(sort = Some(SamOrder.Queryname)) + + val humanPair = humanBuilder.addPair(name = "pair", attrs = Map(NM.toString -> 2, AS.toString -> 32), start1 = 2, start2 = 101) + val mousePair = mouseBuilder.addPair(name = "pair", attrs = Map(NM.toString -> 2, AS.toString -> 32), start1 = 2, start2 = 101) + + humanBuilder.header.getSequenceDictionary.getSequences.forEach { seq: SAMSequenceRecord => val _ = seq.setAssembly(humanAssembly) } + mouseBuilder.header.getSequenceDictionary.getSequences.forEach { seq: SAMSequenceRecord => val _ = seq.setAssembly(mouseAssembly) } + + val prefix = PathUtil.pathTo(dir.toString, more = "insilico") + + val humanBam = humanBuilder.toTempFile(deleteOnExit = false) + val mouseBam = mouseBuilder.toTempFile(deleteOnExit = false) + + val disambiguate = new Disambiguate(input = Seq(mouseBam, humanBam), prefix = prefix) + disambiguate.execute() + + val humanSource = SamSource(PathUtil.pathTo(Seq(prefix, humanAssembly).mkString(".") + BamExtension)) + val mouseSource = SamSource(PathUtil.pathTo(Seq(prefix, mouseAssembly).mkString(".") + BamExtension)) + + humanSource.iterator.toSeq shouldBe empty + mouseSource.iterator.toSeq shouldBe empty + + val ambiguousAlignments = PathUtil.pathTo(dir.toString, more = "ambiguous-alignments") + + Io.assertWritableDirectory(ambiguousAlignments) // will raise exception if non-existent. + + val ambiguousHumanSource = SamSource(ambiguousAlignments.resolve(PathUtil.replaceExtension(humanBam.getFileName, ".ambiguous" + BamExtension))) + val ambiguousMouseSource = SamSource(ambiguousAlignments.resolve(PathUtil.replaceExtension(mouseBam.getFileName, ".ambiguous" + BamExtension))) + + ambiguousHumanSource.map(_.name).toSeq should contain theSameElementsInOrderAs humanPair.map(_.name) + ambiguousMouseSource.map(_.name).toSeq should contain theSameElementsInOrderAs mousePair.map(_.name) + } + + "NeoNeodisambiguate" should "run end-to-end via the CLI entrypoint" in { + val humanAssembly: String = "hg38" + val mouseAssembly: String = "mm10" + val dir: DirPath = Io.makeTempDir("disambiguate") + dir.toFile.deleteOnExit() + + val humanBuilder = new SamBuilder(sort = Some(SamOrder.Queryname)) + val mouseBuilder = new SamBuilder(sort = Some(SamOrder.Queryname)) + + val _ = humanBuilder.addPair(name = "pair", attrs = Map(NM.toString -> 6, AS.toString -> 32), start1 = 2, start2 = 101) + val mousePair = mouseBuilder.addPair(name = "pair", attrs = Map(NM.toString -> 2, AS.toString -> 32), start1 = 2, start2 = 101) + + humanBuilder.header.getSequenceDictionary.getSequences.forEach { seq: SAMSequenceRecord => val _ = seq.setAssembly(humanAssembly) } + mouseBuilder.header.getSequenceDictionary.getSequences.forEach { seq: SAMSequenceRecord => val _ = seq.setAssembly(mouseAssembly) } + + val prefix = PathUtil.pathTo(dir.toString, more = "insilico") + + val mouseBam = mouseBuilder.toTempFile(deleteOnExit = true) + val humanBam = humanBuilder.toTempFile(deleteOnExit = true) + + val conf = new NeodisambiguateConf(args = Seq("--input", mouseBam.toString, humanBam.toString, "--output", prefix.toString)) + Disambiguate.main(Array.from(conf.args)) + + val humanSource = SamSource(PathUtil.pathTo(Seq(prefix, humanAssembly).mkString(".") + BamExtension)) + val mouseSource = SamSource(PathUtil.pathTo(Seq(prefix, mouseAssembly).mkString(".") + BamExtension)) + + humanSource.iterator.toSeq shouldBe empty + mouseSource.map(_.name).toSeq should contain theSameElementsInOrderAs mousePair.map(_.name) + + Io.assertWritableDirectory(PathUtil.pathTo(dir.toString, more = "ambiguous-alignments")) // will raise exception if non-existent. + } } diff --git a/neodisambiguate/test/src/io/cvbio/neodisambiguate/bam/BamsTest.scala b/neodisambiguate/test/src/io/cvbio/neodisambiguate/bam/BamsTest.scala index b5d2bce..47eec14 100644 --- a/neodisambiguate/test/src/io/cvbio/neodisambiguate/bam/BamsTest.scala +++ b/neodisambiguate/test/src/io/cvbio/neodisambiguate/bam/BamsTest.scala @@ -33,6 +33,12 @@ class BamsTest extends UnitSpec { iterator.hasNext shouldBe false } + it should "raise an exception when empty and next() is called" in { + val builder = new SamBuilder(sort = Some(SamOrder.Queryname)) + val iterator = Bams.templatesIterator(builder.toSource) + an[NoSuchElementException] shouldBe thrownBy { iterator.next() } + } + it should "accept a single valid SamSource with more than zero alignments" in { val builder = new SamBuilder(sort = Some(SamOrder.Queryname)) builder.addPair(name = "pair1", start1 = 1, start2 = 2)