diff --git a/README.rst b/README.rst index b384309..6ca0a38 100644 --- a/README.rst +++ b/README.rst @@ -38,7 +38,9 @@ Basic tombo installation (python 2.7 and 3.4+ support) Quick Start =========== -Call 5mC and 6mA sites from raw nanopore read files. Then output genome browser `wiggle format file `_ for 5mA calls and plot raw signal around most significant 6mA sites. +Re-squiggle raw nanopore read files and call 5mC and 6mA sites. + +Then, for 5mC calls, output a genome browser `wiggle format file `_ and, for 6mA calls, plot raw signal around most significant locations. :: @@ -47,22 +49,22 @@ Call 5mC and 6mA sites from raw nanopore read files. Then output genome browser --fastq-filenames basecalls1.fastq basecalls2.fastq \ --sequencing-summary-filenames seq_summary1.txt seq_summary2.txt \ --processes 4 - - tombo resquiggle path/to/fast5s/ genome.fasta --processes 4 + + tombo resquiggle path/to/fast5s/ genome.fasta --processes 4 --num-most-common-errors 5 tombo detect_modifications alternative_model --fast5-basedirs path/to/fast5s/ \ --statistics-file-basename sample.alt_modified_base_detection \ --per-read-statistics-basename sample.alt_modified_base_detection \ --alternate-bases 5mC 6mA --processes 4 - + # produces "estimated fraction of modified reads" genome browser files # for 5mC testing tombo text_output browser_files --statistics-filename sample.alt_modified_base_detection.5mC.tombo.stats \ --file-types dampened_fraction --browser-file-basename sample.alt_modified_base_detection.5mC # and 6mA testing (along with coverage bedgraphs) tombo text_output browser_files --statistics-filename sample.alt_modified_base_detection.6mA.tombo.stats \ - --fast5-basedirs path/to/fast5s/ --file-types dampened_fraction coverage\ + --fast5-basedirs path/to/fast5s/ --file-types dampened_fraction coverage \ --browser-file-basename sample.alt_modified_base_detection.6mA - + # plot raw signal at most significant 6mA locations tombo plot most_significant --fast5-basedirs path/to/fast5s/ \ --statistics-filename sample.alt_modified_base_detection.6mA.tombo.stats \ @@ -73,19 +75,23 @@ Detect any deviations from expected signal levels for canonical bases to investi :: - tombo resquiggle path/to/fast5s/ genome.fasta --processes 4 + tombo resquiggle path/to/fast5s/ genome.fasta --processes 4 --num-most-common-errors 5 tombo detect_modifications de_novo --fast5-basedirs path/to/fast5s/ \ --statistics-file-basename sample.de_novo_modified_base_detection \ --per-read-statistics-basename sample.de_novo_modified_base_detection \ --processes 4 - + # produces "estimated fraction of modified reads" genome browser files from de novo testing tombo text_output browser_files --statistics-filename sample.de_novo_modified_base_detection.tombo.stats \ --browser-file-basename sample.de_novo_modified_base_detection --file-types dampened_fraction -.. - - All of these commands work for RNA data as well, but a transcriptome reference sequence must be provided for spliced transcripts. +=== +RNA +=== + +All Tombo commands work for RNA data as well, but a transcriptome reference sequence must be provided for spliced transcripts. + +The reasons for this decision and other tips for processing RNA data within the Tombo framework can be found in the `RNA section `_ of the detailed Tombo documentation.
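+For example, a minimal RNA workflow might look like the following sketch (``transcriptome.fasta`` stands in for a user-provided transcriptome reference; only the 5mC alternative model is currently available for RNA):
+
+::
+
+    tombo resquiggle path/to/rna/fast5s/ transcriptome.fasta --rna --processes 4
+
+    tombo detect_modifications alternative_model --fast5-basedirs path/to/rna/fast5s/ \
+        --statistics-file-basename rna_sample.alt_modified_base_detection \
+        --alternate-bases 5mC --processes 4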
===================== Further Documentation @@ -95,182 +101,6 @@ Run ``tombo -h`` to see all Tombo command groups and run ``tombo [command-group] Detailed documentation for all Tombo commands and algorithms can be found at https://nanoporetech.github.io/tombo/ -============== -Tombo Commands -============== - -Re-squiggle (Raw Data to Genome Alignment) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ``resquiggle`` algorithm is the central point for the Tombo tookit. For each nanopore read, this command takes basecalled sequence and the raw nanopore signal values. The basecalled sequence is mapped to a genomic or transcriptomic reference. The raw nanopore signal is assigned to the mapped genomic or transcriptomic sequence based on expected signal levels from an included canonical base model. This anchors each raw signal observation from a read to a genomic position. This information is then leveraged to gain information about the potential location of modified nucleotides either within a single read or across a group of reads from a sample of interest. - -:: - - tombo resquiggle path/to/fast5s/ reference.fasta --processes 4 - -.. - - - Only R9.4 and R9.5 data is supported at this time (including R9.*.1). - - DNA or RNA sample type is automatically detected from FAST5s (set explicitly with ``--dna`` or ``--rna``). - - FAST5 files need not contain ``Events`` data, but must contain ``Fastq`` slot containing basecalls. See ``preprocess annotate_raw_with_fastqs`` for pre-processing of raw FAST5s with basecalled reads. - - The reference sequence file can be a genome/transcriptome FASTA file or a minimap2 index file. - - The ``resquiggle`` command must be run before testing for modified bases. - -Detect Modified Bases -^^^^^^^^^^^^^^^^^^^^^ - -There are three methods provided with Tombo to identify modified bases. - -For more information on these methods see the `Tombo documentation here `_. - -:: - - # Identify deviations from the canoncial expected signal levels that specifically match the - # expected levels from an alternative base e.g.5mC or 6mA (recommended method) - tombo detect_modifications alternative_model --fast5-basedirs path/to/native/dna/fast5s/ \ - --alternate-bases 5mC 6mA --statistics-file-basename sample.alt_testing - - # Identify any deviations from the canonical base model - tombo detect_modifications de_novo --fast5-basedirs path/to/native/dna/fast5s/ \ - --statistics-file-basename sample.de_novo_testing --processes 4 - - # comparing to a control sample (e.g. PCR) - tombo detect_modifications sample_compare --fast5-basedirs path/to/native/dna/fast5s/ \ - --control-fast5-basedirs path/to/amplified/dna/fast5s/ \ - --statistics-file-basename sample.compare_testing - -.. - - Must run ``resquiggle`` on reads before testing for modified bases. - - All ``detect_modifications`` commands produce a binary Tombo statistics file. For use in text output or plotting region selection see ``text_output browser_files`` or ``plot most_significant`` Tombo commands. - - Specify the ``--per-read-statistics-basename`` option to save per-read statistics for plotting or further processing (acces via the Tombo API). 
- -Text Output -^^^^^^^^^^^ - -:: - - # output estimated fraction of reads modified at each genomic base and - # valid coverage (after failed reads, filters and testing threshold are applied) in wiggle format - tombo text_output browser_files --file-types dampened_fraction --statistics-filename sample.alt_testing.5mC.tombo.stats - - # output read coverage depth (after failed reads and filters are applied) in bedgraph format - tombo text_output browser_files --file-types coverage --fast5-basedirs path/to/native/dna/fast5s/ - -.. - - For more text output commands see the `Tombo text output documentation here `_. - -Raw Signal Plotting -^^^^^^^^^^^^^^^^^^^ - -:: - - # plot raw signal with standard model overlay at reions with maximal coverage - tombo plot max_coverage --fast5-basedirs path/to/native/rna/fast5s/ --plot-standard-model - - # plot raw signal along with signal from a control (PCR) sample at locations with the AWC motif - tombo plot motif_centered --fast5-basedirs path/to/native/rna/fast5s/ \ - --motif AWC --genome-fasta genome.fasta --control-fast5-basedirs path/to/amplified/dna/fast5s/ - - # plot raw signal at genome locations with the most significantly/consistently modified bases - tombo plot most_significant --fast5-basedirs path/to/native/rna/fast5s/ \ - --statistics-filename sample.alt_testing.5mC.tombo.stats --plot-alternate-model 5mC - - # plot per-read test statistics using the 6mA alternative model testing method - tombo plot per_read --per-read-statistics-filename sample.alt_testing.6mA.tombo.per_read_stats \ - --genome-locations chromosome:1000 chromosome:2000:- --genome-fasta genome.fasta - -.. - - For more plotting commands see the `Tombo plotting documentation here `_. - -Read Filtering -^^^^^^^^^^^^^^ - -:: - - # filter reads to a specific genomic location - tombo filter genome_locations --fast5-basedirs path/to/native/rna/fast5s/ \ - --include-regions chr1:0-10000000 - - # apply a more strigent raw signal matching threshold - tombo filter --fast5-basedirs path/to/native/rna/fast5s/ \ - --signal-matching-score 1.0 - -.. - - For more read filtering commands see the `Tombo filter documentation here `_. - - Hint: Save a set of filters for later use by copying the Tombo index file: ``cp path/to/native/rna/.fast5s.RawGenomeCorrected_000.tombo.index save.native.tombo.index``. To re-set to a set of saved filters after applying further filters simply replace the index file: ``cp save.native.tombo.index path/to/native/rna/.fast5s.RawGenomeCorrected_000.tombo.index``. - -==================== -Note on Tombo Models -==================== - -Tombo is currently provided with two canonical models (for DNA and RNA data) and three alternative models (DNA::5mC, DNA::6mA and RNA::5mC). - -These models are used by default in the re-squiggle and modified base detection commands. The correct canonical model is automatically selected for DNA or RNA based on the contents of each FAST5 file and processed accordingly. - -Additional models will be added in future releases. 
- -========================= -Installation Requirements -========================= - -python Requirements (handled by conda or pip): -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -- numpy -- scipy -- h5py -- cython -- mappy>=2.10 -- tqdm - -Optional packages (handled by conda, but not pip): -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -- Plotting Packages (R and rpy2 must be linked during installation) - - + R - + rpy2 - + ggplot2 - + gridExtra (required for ``plot_motif_with_stats`` and ``plot_kmer`` subcommands) - -- On-disk Random Fasta Access - - + pyfaidx - -Advanced Installation Instructions ----------------------------------- - -Minimal tombo installation without optional dependencies (enables re-squiggle, all modified base testing methods and text output) - -:: - - pip install ont-tombo - -Install current github version of tombo - -:: - - pip install git+https://github.com/nanoporetech/tombo.git - -Download and install github version of tombo - -:: - - git clone https://github.com/nanoporetech/tombo.git - cd tombo - pip install -e . - - # to update, run: - git pull - pip install -I --no-deps -e . - ======== Citation ======== @@ -283,15 +113,13 @@ http://biorxiv.org/content/early/2017/04/10/094672 Known Issues ============ -- When running the ``detect_modifications`` commands on large genomes, the computational memory usage can become very high. It is currently recommended to processes smaller regions using the ``tombo filter genome_locations`` command (with saved Tombo index hint above). This problem is being addressed and will be resolved in a later release. - - The Tombo conda environment (especially with python 2.7) may have installation issues. - + + Tombo works best in python 3.4+, so many problems can be solved by upgrading python. + If installed using conda: - Ensure the most recent version of conda is installed (``conda update -n root conda``). - - It is recommended to set conda channels as described for `bioconda `_. + - It is recommended to set conda channels as described for `bioconda `_. - Run ``conda update --all``. + In python 2.7 there is an issue with the conda scipy.stats package. Down-grading to version 0.17 fixes this issue. + In python 2.7 there is an issue with the conda h5py package. Down-grading to version <=2.7.0 fixes this issue. 
diff --git a/docs/_images/adaptive_forward_pass.png b/docs/_images/adaptive_forward_pass.png index 2ad11f0..7643c09 100644 Binary files a/docs/_images/adaptive_forward_pass.png and b/docs/_images/adaptive_forward_pass.png differ diff --git a/docs/_images/adaptive_half_z_scores.png b/docs/_images/adaptive_half_z_scores.png index cebb10a..d61d835 100644 Binary files a/docs/_images/adaptive_half_z_scores.png and b/docs/_images/adaptive_half_z_scores.png differ diff --git a/docs/_images/alt_model_comp.png b/docs/_images/alt_model_comp.png index bc6fb3b..74d62be 100644 Binary files a/docs/_images/alt_model_comp.png and b/docs/_images/alt_model_comp.png differ diff --git a/docs/_images/begin_forward_pass.png b/docs/_images/begin_forward_pass.png index 60d6d3e..a47ec45 100644 Binary files a/docs/_images/begin_forward_pass.png and b/docs/_images/begin_forward_pass.png differ diff --git a/docs/_images/begin_half_z_scores.png b/docs/_images/begin_half_z_scores.png index ff20111..b5d90ab 100644 Binary files a/docs/_images/begin_half_z_scores.png and b/docs/_images/begin_half_z_scores.png differ diff --git a/docs/_images/model_comp.png b/docs/_images/model_comp.png index c8b70cb..8656064 100644 Binary files a/docs/_images/model_comp.png and b/docs/_images/model_comp.png differ diff --git a/docs/_images/per_read_do_novo.png b/docs/_images/per_read_do_novo.png index e84ba94..2e933ee 100644 Binary files a/docs/_images/per_read_do_novo.png and b/docs/_images/per_read_do_novo.png differ diff --git a/docs/_images/roc.png b/docs/_images/roc.png index 5f413b3..6017c48 100644 Binary files a/docs/_images/roc.png and b/docs/_images/roc.png differ diff --git a/docs/_images/sample_comp.png b/docs/_images/sample_comp.png index 7e758c6..635d1f2 100644 Binary files a/docs/_images/sample_comp.png and b/docs/_images/sample_comp.png differ diff --git a/docs/_images/testing_method_comparison.png b/docs/_images/testing_method_comparison.png index 7e30557..ce63441 100644 Binary files a/docs/_images/testing_method_comparison.png and b/docs/_images/testing_method_comparison.png differ diff --git a/docs/conf.py b/docs/conf.py index 78a2d1e..fd4757f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -23,10 +23,29 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.intersphinx', - 'sphinx.ext.mathjax', 'sphinxarg.ext'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', + 'sphinx.ext.intersphinx', 'sphinx.ext.mathjax', 'sphinxarg.ext', + 'sphinx.ext.napoleon',] mathjax_path = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" +# don't include class inheritance in docs: https://stackoverflow.com/questions/46279030/how-can-i-prevent-sphinx-from-listing-object-as-a-base-class +from sphinx.ext.autodoc import ClassDocumenter, _ +add_line = ClassDocumenter.add_line +def add_line_no_bases(self, text, *args, **kwargs): +    if text.strip().startswith('Bases: '): +        return +    add_line(self, text, *args, **kwargs) + +add_directive_header = ClassDocumenter.add_directive_header +def add_directive_header_no_bases(self, *args, **kwargs): +    self.add_line = add_line_no_bases.__get__(self) +    result = add_directive_header(self, *args, **kwargs) +    del self.add_line +    return result + +ClassDocumenter.add_directive_header = add_directive_header_no_bases + + # Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates'] @@ -45,7 +64,8 @@ copyright = u'2017-18, Oxford Nanopore Technologies' # Generate API documentation: -if subprocess.call(['sphinx-apidoc', '-o', './', "../{}".format(__pkg_name__)]) != 0: +if subprocess.call(['sphinx-apidoc', '--module-first', '--no-toc', + '-f', '-o', './', "../{}".format(__pkg_name__)]) != 0: sys.stderr.write('Failed to generate API documentation!\n') # The version info for the project you're documenting, acts as replacement for diff --git a/docs/examples.rst b/docs/examples.rst index 651e1d6..c48f69b 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -10,22 +10,25 @@ Re-squiggle (Raw Signal Genomic Alignment) The re-squiggle algorithm aligns raw signal (electric current nanopore measurements) to genomic/transcriptomic sequence. +One of the major assumptions of the re-squiggle algorithm is that the provided reference sequence is correct. Thus for poorly assembled genomes or divergent samples, an assembly polishing step (possibly from the same data/sample) may improve results. + The ``resquiggle`` command will add information (the mapped genomic location and the raw signal to sequence assignment) to the read files provided (in FAST5 format), as well as producing an index file for more efficient file access in downstream commands. .. important:: - + The ``resquiggle`` command must be run before any further processing by Tombo commands. -**Note**: Tombo currenly includes default canonical models for both DNA or RNA data (including R9.4 and R9.5; 1D and 1D^2; R9.*.1 chemistries). Analysis of other nanopore data types is not supported at this time (e.g. R7 data). If DNA or RNA sample type is not explicitly specified (via ``--dna`` or ``--rna`` options) the sample type will be detected automatically from the raw read files. +**Note**: Tombo currently includes default canonical models for both DNA and RNA data (supporting R9.4 and R9.5; 1D and 1D^2; R9.*.1 chemistries). Analysis of other nanopore data types is not supported at this time (e.g. R7 data). If DNA or RNA sample type is not explicitly specified (via ``--dna`` or ``--rna`` options) the sample type will be detected automatically from the raw read files. For more details see the :doc:`re-squiggle documentation `. .. code-block:: bash - # annotate raw FAST5s with FASTQ files produced from the same reads if the raw files do not contain FASTQ information + # annotate raw FAST5s with FASTQ files produced from the same reads + # skip this step if raw read files already contain basecalls tombo annotate_raw_with_fastqs --fast5-basedir --fastq-filenames - tombo resquiggle --processes 4 + tombo resquiggle --processes 4 --num-most-common-errors 5 ----------------------- Modified Base Detection ----------------------- Tombo provides three methods for the investigation of modified bases (within the ``detect_modifications`` command group). Each method has different advantages and requirements. -All modified base detection methods poduce per-read, per-genomic position test statistics (which can be saved via the ``--per-read-statistics-basename`` option). A threshold is then applied to these statistics to produce a fraction of reads that appear modified at each genomic locaiton. +---- + +.. figure:: _images/testing_method_comparison.png + :align: center + + Tombo modified base testing methods. + +---- + +All modified base detection methods produce per-read, per-genomic position test statistics (which can be saved via the ``--per-read-statistics-basename`` option).
A threshold is then applied to these statistics to produce an estimate for the fraction of reads that appear modified at each genomic location. -1. Specific alternative base detection +1. **Specific alternative base detection (recommended)** - Run using ``tombo detect_modifications alternative_model`` command. - This method identifies signal that deviates from the canonical base expected signal level while matching a specific alternative base expected signal level. @@ -43,13 +55,13 @@ All modified base detection methods poduce per-read, per-genomic position test s - Alternative DNA models are currently available for 5-methylcytosine (5mC) and N6-methyladenosine (6mA) in all sequence contexts. - An alternative RNA model is available for 5mC. -2. *De novo* canonical model comparison +2. **De novo canonical model comparison** - Run using ``tombo detect_modifications de_novo`` command. - This method compares re-squiggled signal to the default canonical model. - While this method may produce significant false positive and negative results per-read, it produces the best results for many statistical measures per-genomic location (fraction of modified bases across a set of reads). -3. Canonical (control) sample comparison +3. **Canonical (control) sample comparison** - Run using ``tombo detect_modifications sample_compare`` command. - This method performs a hypothesis test against the distribution estimated from the control sample at each base. @@ -59,16 +71,6 @@ All modified base detection methods poduce per-read, per-genomic position test s Both the control sample comparison and the *de novo* methods may not identify the exact modified base location (as the shifted signal does not always center on a modified base) and give no information as to the identity of a modified base. ----- - -.. figure:: _images/testing_method_comparison.png - :align: center - :scale: 30% - - Tombo modified base testing methods. - ----- - The result of all ``detect_modifications`` calls will be a binary statistics file(s), which can be passed to other Tombo commands. For more details see the :doc:`modified base detection documentation `. @@ -104,12 +106,14 @@ Canonical Sample Comparison Method In order to execute the canonical sample comparison method, use the ``detect_modifications sample_compare`` command. -This will perform a hypothesis test against the signal level observed from the control sample (provided via ``--control-fast5-basedirs`` option) at each genomic position. This method currently performs the worst, but future updates to this method may increase the accuracy of this method. This method (like the ``de_novo`` method) does not always identify the exact modified base position. +This will perform a hypothesis test against the signal level observed from the control sample (provided via ``--control-fast5-basedirs`` option) at each genomic position. This method (like the ``de_novo`` method) does not always identify the exact modified base position. + +As of version 1.4, this method uses the canonical base model as a prior for control sample distribution estimation, drastically improving results, particularly for low coverage samples. To test only against the canonical sample, use the ``--sample-only-estimates`` flag. The prior weights for the estimated mean and standard deviation can be set using the ``--model-prior-weights`` option. ..
code-block:: bash tombo detect_modifications sample_compare --fast5-basedirs \ - --control-fast5-basedirs \ + --control-fast5-basedirs \ --statistics-file-basename sample_canonical_compare ----------- @@ -123,7 +127,7 @@ In order to output the results of re-squiggling and statistical testing in a gen .. code-block:: bash - tombo text_output genome_browser --fast5-basedirs \ + tombo text_output browser_files --fast5-basedirs \ --statistics-filename sample_alt_model.5mC.tombo.stats \ --browser-file-basename sample_alt_model --file-types dampened_fraction coverage @@ -148,7 +152,7 @@ Example `meme `_ command line modified base .. code-block:: bash ./meme -oc motif_output.meme -dna -mod zoops sample_alt_model.6mA.most_signif.fasta - + For more details see the :doc:`text output documentation `. ----------------- @@ -164,16 +168,40 @@ Each genome anchored plotting command allows for the selection of genomic positi .. code-block:: bash tombo plot max_coverage --fast5-basedirs --plot-standard-model - + tombo plot motif_centered --fast5-basedirs --motif AWC \ --genome-fasta genome.fasta --control-fast5-basedirs - + tombo plot per_read --per-read-statistics-filename \ --genome-locations chromosome:1000 chromosome:2000:- \ --genome-fasta genome.fasta For more details see the :doc:`plotting documentation `. +-------------- +Read Filtering +-------------- + +Read filtering commands can be useful to extract the most out of a set of reads for modified base detection. Read filtering commands affect only the Tombo index file, and so filters can be cleared or applied iteratively without re-running the re-squiggle command. Five filters are currently made available (``genome_locations``, ``raw_signal_matching``, ``q_score``, ``level_coverage`` and ``stuck``). + +.. code-block:: bash + + # filter reads to a specific genomic location + tombo filter genome_locations --fast5-basedirs path/to/native/rna/fast5s/ \ + --include-regions chr1:0-10000000 + + # apply a more stringent observed to expected signal score (default: 1.1 for DNA reads) + tombo filter raw_signal_matching --fast5-basedirs path/to/native/rna/fast5s/ \ + --signal-matching-score 1.0 + +.. hint:: + + Save a set of filters for later use by copying the Tombo index file: ``cp path/to/native/rna/.fast5s.RawGenomeCorrected_000.tombo.index save.native.tombo.index``. To re-set to a set of saved filters after applying further filters simply replace the index file: ``cp save.native.tombo.index path/to/native/rna/.fast5s.RawGenomeCorrected_000.tombo.index``. + +.. + + For more read filtering commands see the `Tombo filter documentation here `_. + .. tip:: For additional command details, see the specific commands documentation section. diff --git a/docs/filtering.rst b/docs/filtering.rst index c1cb350..5fd2592 100644 --- a/docs/filtering.rst +++ b/docs/filtering.rst @@ -4,47 +4,47 @@ Read Filtering Commands Read filtering commands can be useful to extract the most out of a set of reads for modified base detection. Read filtering commands affect only the Tombo index file, and so filters can be cleared or applied iteratively without re-running the re-squiggle command. Five filters are currently made available (``genome_locations``, ``raw_signal_matching``, ``q_score``, ``level_coverage`` and ``stuck``).
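+As a sketch of this iterative use, one filter can be applied and later cleared without re-running ``tombo resquiggle`` (the ``--q-score`` value here is illustrative, not a recommendation):
+
+.. code-block:: bash
+
+    # apply a mean basecall quality score filter
+    tombo filter q_score --fast5-basedirs path/to/fast5s/ --q-score 10
+
+    # later, remove all applied filters from the Tombo index
+    tombo filter clear_filters --fast5-basedirs path/to/fast5s/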
---------------------------- -``filter genome_locations`` ---------------------------- +--------------------------------- +``tombo filter genome_locations`` +--------------------------------- -The ``filter genome_locations`` command filters out reads falling outside of a specified set of ``--include-regions``. These regions can either be whole chromosomes/sequence records or sub-regions within sequence records. +The ``tombo filter genome_locations`` command filters out reads falling outside of a specified set of ``--include-regions``. These regions can either be whole chromosomes/sequence records or sub-regions within sequence records. ------------------------------- -``filter raw_signal_matching`` ------------------------------- +------------------------------------ +``tombo filter raw_signal_matching`` +------------------------------------ -The ``filter raw_signal_matching`` command filters out reads with poor matching between raw observed signal and expected signal levels from the canonical base model. Specify a new threshold to apply with the ``--signal-matching-score`` option. These scores are the mean half z-score (absolute value of z-score) taken over all bases of a read. A reasonable range for this threshold should be approxiamtely between 0.5 and 3. Reads with a larger fraction of modifications may require a larger value to process successfully. +The ``tombo filter raw_signal_matching`` command filters out reads with poor matching between raw observed signal and expected signal levels from the canonical base model. Specify a new threshold to apply with the ``--signal-matching-score`` option. These scores are the mean half z-score (absolute value of z-score) taken over all bases of a read. A reasonable range for this threshold should be approximately between 0.5 and 3. Reads with a larger fraction of modifications may require a larger value to process successfully. ------------------- -``filter q_score`` ------------------- +------------------------ +``tombo filter q_score`` +------------------------ -The ``filter q_score`` command filters out reads with poor mean basecalling quality scores. This value can be indicative of low quality reads. Set this value with the ``--q-score`` option. +The ``tombo filter q_score`` command filters out reads with poor mean basecalling quality scores. This value can be indicative of low quality reads. Set this value with the ``--q-score`` option. -------------------------- -``filter level_coverage`` -------------------------- +------------------------------- +``tombo filter level_coverage`` +------------------------------- -The ``filter level_coverage`` command aims to filter reads to achieve more even read depth across a genome/transcriptome. This may be useful in canonical and alternative model estimation. This filter may also help make test statistics more comparable across the genome. +The ``tombo filter level_coverage`` command aims to filter reads to achieve more even read depth across a genome/transcriptome. This may be useful in canonical and alternative model estimation. This filter may also help make test statistics more comparable across the genome. This filter is applied by randomly selecting reads weighted by the approximate coverage at the mapped location of each read. The number of reads removed from downstream processing is defined by the ``--percent-to-filter`` option. This filter is likely to be more useful for PCR'ed samples where duplicate locations are more likely to accumulate and cause large spikes in coverage.
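+For example (the percentage value here is illustrative only):
+
+.. code-block:: bash
+
+    tombo filter level_coverage --fast5-basedirs path/to/fast5s/ \
+        --percent-to-filter 20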
---------------- -``filter stuck`` ---------------- +---------------------- +``tombo filter stuck`` +---------------------- -The ``filter stuck`` command aims to remove reads where bases tend to get stuck in the pore for longer durations of time. These reads can be indicative of poor quality reads and thus negatively effect modified base detection. +The ``tombo filter stuck`` command aims to remove reads where bases tend to get stuck in the pore for longer durations of time. These reads can be indicative of poor quality reads and thus negatively affect modified base detection. This filter is based on the number of observations per genomic base along a read. The filter can be set on any number of percentiles of observations per base. Reasonable values depend strongly on the sample type (DNA or RNA). A reasonable filter for DNA reads would be to filter reads with 99th percentile > 200 obs/base or a maximum base with > 5k obs/base. This filter would be set with the ``--obs-per-base-filter 99:200 100:5000`` option. Larger values should be used for RNA reads. ------------------------- -``filter clear_filters`` ------------------------- +------------------------------ +``tombo filter clear_filters`` +------------------------------ -The ``filters clear_filters`` command removes any applied filters to this sample (including those applied during the ``resquiggle`` command; though reads that failed before signal to sequence assignment will not be included). New filters can then be applied to this set of reads. +The ``tombo filter clear_filters`` command removes any applied filters to this sample (including those applied during the ``resquiggle`` command; though reads that failed before signal to sequence assignment will not be included). New filters can then be applied to this set of reads. All Tombo sub-commands will respect the filtered reads when parsed for processing. diff --git a/docs/index.rst b/docs/index.rst index 14e5e20..77cc6b0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -35,7 +35,9 @@ See :doc:`examples` for common workflows. Quick Start =========== -Call 5mC and 6mA sites from raw nanopore read files. Then output genome browser `wiggle format file `_ for 5mA calls and plot raw signal around most significant 6mA sites. +Re-squiggle raw nanopore read files and call 5mC and 6mA sites. + +Then, for 5mC calls, output a genome browser `wiggle format file `_ and, for 6mA calls, plot raw signal around most significant locations. :: @@ -44,13 +46,13 @@ Call 5mC and 6mA sites from raw nanopore read files. Then output genome browser --fastq-filenames basecalls1.fastq basecalls2.fastq \ --sequencing-summary-filenames seq_summary1.txt seq_summary2.txt \ --processes 4 - + tombo resquiggle path/to/fast5s/ genome.fasta --processes 4 tombo detect_modifications alternative_model --fast5-basedirs path/to/fast5s/ \ --statistics-file-basename sample.alt_modified_base_detection \ --per-read-statistics-basename sample.alt_modified_base_detection \ --alternate-bases 5mC 6mA --processes 4 - + # produces "estimated fraction of modified reads" genome browser files # for 5mC testing tombo text_output browser_files --statistics-filename sample.alt_modified_base_detection.5mC.tombo.stats \ --file-types dampened_fraction --browser-file-basename sample.alt_modified_base_detection.5mC @@ -59,8 +61,8 @@ Call 5mC and 6mA sites from raw nanopore read files.
Then output genome browser tombo text_output browser_files --statistics-filename sample.alt_modified_base_detection.6mA.tombo.stats \ --fast5-basedirs path/to/fast5s/ --file-types dampened_fraction coverage\ --browser-file-basename sample.alt_modified_base_detection.6mA - - # plot raw signal at most significant locations + + # plot raw signal at most significant 6mA locations tombo plot most_significant --fast5-basedirs path/to/fast5s/ \ --statistics-filename sample.alt_modified_base_detection.6mA.tombo.stats \ --plot-standard-model --plot-alternate-model 6mA \ @@ -75,13 +77,13 @@ Detect any deviations from expected signal levels for canonical bases to investi --statistics-file-basename sample.de_novo_modified_base_detection \ --per-read-statistics-basename sample.de_novo_modified_base_detection \ --processes 4 - + # produces sample.de_novo_modified_base_detection.dampened_fraction.[plus|minus].wig files tombo text_output browser_files --statistics-filename sample.de_novo_modified_base_detection.tombo.stats \ --browser-file-basename sample.de_novo_modified_base_detection --file-types dampened_fraction .. note:: - + All of these commands work for RNA data as well, but a transcriptome reference sequence must be provided for spliced transcripts. Run ``tombo -h`` to see all Tombo command groups, run ``tombo [command-group] -h`` to see all commands within each group and run ``tombo [command-group] [command] -h`` for help with arguments to each Tombo command. @@ -110,18 +112,13 @@ Contents rna model_training -------------------------- -Full API reference (beta) ------------------------- +------------------- +Tombo API Reference +------------------- .. toctree:: :maxdepth: 2 tombo ------------------- -Documentation Index ------------------- - -* :ref:`genindex` -* :ref:`modindex` +:ref:`Tombo Module Documentation Index ` diff --git a/docs/model_training.rst b/docs/model_training.rst index 53fe51a..4ce1b4a 100644 --- a/docs/model_training.rst +++ b/docs/model_training.rst @@ -2,21 +2,21 @@ Model Training (Advanced Users Only) ************************************ -Model training is made available via several Tombo commands, but should be used with care as these methods can be very sensetive to the samples used. Commands relevant to model training are found within the ``build_model`` command group. The commands are ``estimate_reference`` for estimating a canonical bases model, ``estimate_alt_reference`` for estimation of a non-canonical alternative base model, and ``event_resquiggle`` for re-squiggling reads without a model (requires event-based basecaller results). +Model training is made available via several Tombo commands, but should be used with care as these methods can be very sensitive to the samples used. Commands relevant to model training are found within the ``tombo build_model`` command group. The commands are ``estimate_reference`` for estimating a canonical bases model, ``estimate_alt_reference`` for estimation of a non-canonical alternative base model, and ``event_resquiggle`` for re-squiggling reads without a model (requires event-based basecaller results). .. note:: - - Model training produces a binary Tombo model file similar to those included in the Tombo software (found in the code repository here ``tombo/tombo_models``). User-created strandard Tombo models can be used in re-squiggling, modified base detection and plotting commands using the advanced ``--tombo-model-filename`` option.
This option is generally for advanced users training their own models, so this option is not shown in the command line help documentation. Similarly user-created alternative models can be passed to plotting commands via the hidden ``--alternate-model-filename`` option and passed to any ``detect_modifications`` command via the advanced ``--alternate-model-filenames`` option. -====================== -``estimate_reference`` -====================== + Model training produces a binary Tombo model file similar to those included in the Tombo software (found in the code repository here ``tombo/tombo_models``). User-created standard Tombo models can be used in re-squiggling, modified base detection and plotting commands using the advanced ``--tombo-model-filename`` option. This option is generally for advanced users training their own models, so this option is not shown in the command line help documentation. Similarly user-created alternative models can be passed to plotting commands via the hidden ``--alternate-model-filename`` option and passed to any ``tombo detect_modifications`` command via the advanced ``--alternate-model-filenames`` option. -The ``build_model estimate_reference`` command is provided to estimate a Tombo model for canonical bases only. +======================================== +``tombo build_model estimate_reference`` +======================================== -To estimate a canonical model, first genomic base levels are parsed from reads as assigned by a re-squiggle command (either ``event_resquiggle`` or ``resquiggle`` processed reads are acceptable) and grouped by their genomic base assignment. By default, the median and standard deviation of the current level over reads covering each genomic position is computed. The ``--estimate-mean`` option will trigger this to be computed as a mean instead, though this can be sensetive to outlier read signal assignment and is thus not recommended. +The ``tombo build_model estimate_reference`` command is provided to estimate a Tombo model for canonical bases only. -All genomic current levels are then grouped based on the genomic k-mer sequence at that location. This k-mer is defined by the ``--upstream-bases`` and ``--downstream-bases`` options. Note that the modeled k-mer will be one longer than the sum of these two options as the k-mer includes the *dominant*, central position as well. The central position generally has the strongest correlation with the current signal level as can be seen with the ``plot_kmer`` command. +To estimate a canonical model, first genomic base levels are parsed from reads as assigned by a re-squiggle command (either ``tombo build_model event_resquiggle`` or ``tombo resquiggle`` processed reads are acceptable) and grouped by their genomic base assignment. By default, the median and standard deviation of the current level over reads covering each genomic position is computed. The ``--estimate-mean`` option will trigger this to be computed as a mean instead, though this can be sensitive to outlier read signal assignment and is thus not recommended. + +All genomic current levels are then grouped based on the genomic k-mer sequence at that location. This k-mer is defined by the ``--upstream-bases`` and ``--downstream-bases`` options. Note that the modeled k-mer will be one longer than the sum of these two options as the k-mer includes the *dominant*, central position as well. The central position generally has the strongest correlation with the current signal level as can be seen with the ``tombo plot kmer`` command.
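+As a sketch, a canonical model estimation command might look like the following (the output model filename option is assumed here to be ``--tombo-model-filename``; ``--upstream-bases 2 --downstream-bases 3`` would give a 6-mer model as described above):
+
+.. code-block:: bash
+
+    tombo build_model estimate_reference --fast5-basedirs path/to/canonical/fast5s/ \
+        --tombo-model-filename canonical.tombo.model \
+        --upstream-bases 2 --downstream-bases 3 --processes 4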
The reference signal level and spread for each k-mer are then estimated by taking the median of the signal level and mean of the standard deviation over all observations of each k-mer across the genome. By default, a single global standard deviation is taken as the median over all k-mers. The ``--kmer-specific-sd`` option is provided in order to estimate a separate standard deviation for each k-mer, but is not recommended as this can have deleterious effects on Tombo analyses. In particular, k-mer specific standard deviation estimates can produce poor re-squiggle results due to signal being "packed" into high SD k-mers. @@ -24,11 +24,11 @@ These values are stored in the output file in the binary HDF5 format and can be Several options are supplied in order to ensure more robust parameter estimates via read depth thresholds at various stages of model estimation (``--minimum-test-reads``, ``--coverage-threshold`` and ``--minimum-kmer-observations``). -The model estimation command is capable of using mutiple processes via the ``--multiprocess-region-size`` and ``--processes`` options with similar behavior as these options in the ``detect_modifications`` command. The multi-processing only applies to the genome position level computation and not the global model estimation stage; as such changes in multi-processing options will not change resulting models. +The model estimation command is capable of using multiple processes via the ``--multiprocess-region-size`` and ``--processes`` options with similar behavior as these options in the ``tombo detect_modifications`` command. The multi-processing only applies to the genome position level computation and not the global model estimation stage; as such changes in multi-processing options will not change resulting models. -========================== -``estimate_alt_reference`` -========================== +============================================ +``tombo build_model estimate_alt_reference`` +============================================ --------------------------- Alternative Reference Goals --------------------------- One of the main goals of the Tombo suite of tools is to make alternative model estimation more accessible. Key to this goal is the estimation of an alternative model from a relatively simple to produce biological sample. A significant additional goal is the estimation of a model capable of detecting an alternative base in all sequence contexts. -In order to address these goals, the sample required for alternative model estimation must contain the four canonical bases along with a **single, known, alternative base incorporated randomly instead of one canonical base** into a sample with a known genome (referred to as the "*alternative sample*" below). The rate of incorporation for the alternative base should ideally be between 15% and 35%, though a larger range may be acceptable. Key to this method is that the exact known location of alternative base incorporation is not needed, though the base must incorporate in place of only a single canonical base (referred to as the "*swap base*" below and specified with the ``--alternate-model-base`` option to ``build_model estimate_alt_reference``). +In order to address these goals, the sample required for alternative model estimation must contain the four canonical bases along with a **single, known, alternative base incorporated randomly instead of one canonical base** into a sample with a known genome (referred to as the "*alternative sample*" below).
The rate of incorporation for the alternative base should ideally be between 15% and 35%, though a larger range may be acceptable. Key to this method is that the exact known location of alternative base incorporation is not needed, though the base must incorporate in place of only a single canonical base (referred to as the "*swap base*" below and specified with the ``--alternate-model-base`` option to ``tombo build_model estimate_alt_reference``). -The creation of such a sample for the estimation of the included 5-methylcytosine (5mC) model was completed by introducing 25% (ratio to canonical dCTP) 5-methyl-dCTP into a standard PCR reaction in E. coil. Note that a standard PCR'ed (or otherwise produced canonical bases only) sample is also required for alternative model estimation (referred to as the "*standard sample*" below). For the included N6-methyladenosine (6mA) model, the sample was produced using an in vitro methylase thus exemplifying the flexibility of the alternative model estimation method to different sample preparation techniques. These samples were then re-squiggled and processed with the ``build_model estimate_alt_reference`` command to produce the included 5mC and 6mA models. +The creation of such a sample for the estimation of the included 5-methylcytosine (5mC) model was completed by introducing 25% (ratio to canonical dCTP) 5-methyl-dCTP into a standard PCR reaction in E. coli. Note that a standard PCR'ed (or otherwise produced canonical bases only) sample is also required for alternative model estimation (referred to as the "*standard sample*" below). For the included N6-methyladenosine (6mA) model, the sample was produced using an in vitro methylase, thus exemplifying the flexibility of the alternative model estimation method to different sample preparation techniques. These samples were then re-squiggled and processed with the ``tombo build_model estimate_alt_reference`` command to produce the included 5mC and 6mA models. --------------------------------------- Alternative Reference Estimation Method --------------------------------------- Base Level Extraction ^^^^^^^^^^^^^^^^^^^^^ @@ -49,7 +49,7 @@ Given the above described standard and alternative samples, the alternative model estimation procedure begins with the extraction of the current signal level from a number of reads from both samples. These signal levels are grouped by the genomic k-mer at the location assigned by the re-squiggle algorithm. Importantly, in contrast to standard reference estimation, the signal is not averaged or otherwise processed at the genomic position level. This is because each swap base genomic position contains some proportion of canonical and alternative bases. -Reads continue to be processed until every k-mer has at least ``--minimum-kmer-observations`` unique event observations. For PCR'ed samples in paricular, the ``filter level_coverage`` command can help speed up this processing step if the sample coverage is highly variable. In order to save on the memory footprint, event levels are no longer stored once 10,000 obervations have been made for a particular k-mer. +Reads continue to be processed until every k-mer has at least ``--minimum-kmer-observations`` unique event observations. For PCR'ed samples in particular, the ``tombo filter level_coverage`` command can help speed up this processing step if the sample coverage is highly variable. In order to save on the memory footprint, event levels are no longer stored once 10,000 observations have been made for a particular k-mer.
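+Putting this together, an alternative model estimation command might look like the following sketch (the control sample option name and the observation threshold are assumptions; the alternative model output options are described below):
+
+.. code-block:: bash
+
+    tombo build_model estimate_alt_reference \
+        --fast5-basedirs path/to/alternative/sample/fast5s/ \
+        --control-fast5-basedirs path/to/standard/sample/fast5s/ \
+        --alternate-model-filename 5mC.tombo.model \
+        --alternate-model-name 5mC --alternate-model-base C \
+        --minimum-kmer-observations 1000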
Signal Level Density Estimation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -72,8 +72,7 @@ Most of these k-mers are likely to shift the signal only slightly (though this m .. figure:: _images/alt_density_est.png :align: center - :scale: 30% - + Canonical and spike-in 5mC example 6-mer signal level distributions and estimated 5mC distribution ---- @@ -88,15 +87,15 @@ For k-mers not containing any swap bases, the standard model expected level is t Alternative Model Output ^^^^^^^^^^^^^^^^^^^^^^^^ -The alternative model is then saved to the file specified with the ``--alternate-model-filename`` option. Also specified is the ``--alternate-model-name`` option, which should be a short name describing the alternative base. When ``detect_modifications`` is run with this alternative model, the results are saved with this short name included in the output Tombo statsitics filename. +The alternative model is then saved to the file specified with the ``--alternate-model-filename`` option. Also specified is the ``--alternate-model-name`` option, which should be a short name describing the alternative base. When ``tombo detect_modifications`` is run with this alternative model, the results are saved with this short name included in the output Tombo statistics filename. -==================== -``event_resquiggle`` -==================== +====================================== +``tombo build_model event_resquiggle`` +====================================== -The ``event_resquiggle`` command performs the re-squiggle algorithm first developed as part of the nanoraw software package. This command requires an events table, but does not require an expected current level model. In the model training framework, this command can be useful as the original model training step, for example when no Tombo model is available for a particular chemistry. +The ``tombo build_model event_resquiggle`` command performs the re-squiggle algorithm first developed as part of the nanoraw software package. This command requires an events table, but does not require an expected current level model. In the model training framework, this command can be useful as the original model training step, for example when no Tombo model is available for a particular chemistry. -In comparison to the ``resquiggle`` command, this method maps a read to a genome, but then takes the initial signal to base assignment from the Events table. The read is anchored to the raw signal at locations where the read maps correctly to the genome. Where ever a read incorrectly maps to the genome the re-squiggle algorithm discovers new base boundaries based only on the significant changes in signal level (generally indicative of transition from one base to the next). +In comparison to the ``tombo resquiggle`` command, this method maps a read to a genome, but then takes the initial signal to base assignment from the Events table. The read is anchored to the raw signal at locations where the read maps correctly to the genome. Wherever a read incorrectly maps to the genome, the re-squiggle algorithm discovers new base boundaries based only on the significant changes in signal level (generally indicative of transition from one base to the next).
figure:: _images/old_correction_plot.png :align: center - :scale: 30% - + Event re-squiggle correction process from basecalls (top segments) to genomic sequence (bottom segments) ---- -================== -``estimate_scale`` -================== +==================================== +``tombo build_model estimate_scale`` +==================================== -The ``estimate_scale`` command is provided in order to estimate a global scaling parameter from a sub-set of reads in a run (as is performed by default at the beginning of the ``resquiggle`` command). The value returned may be useful as the value passed to the ``--fixed-scale`` option of the ``resquiggle`` command in certain situations, but is considered experimental at this time. +The ``tombo build_model estimate_scale`` command is provided in order to estimate a global scaling parameter from a subset of reads in a run (as is performed by default at the beginning of the ``tombo resquiggle`` command). The value returned may be useful as the value passed to the ``--fixed-scale`` option of the ``tombo resquiggle`` command in certain situations, but is considered experimental at this time. diff --git a/docs/modified_base_detection.rst b/docs/modified_base_detection.rst index 04c4d4e..8c5f0be 100644 --- a/docs/modified_base_detection.rst +++ b/docs/modified_base_detection.rst @@ -2,55 +2,38 @@ Modified Base Detection *********************** -Tombo enables three methods for detecting shifts in current signal level, indicative of non-canonical bases. These three methods allow researchers to investigate non-canonical bases given any sample type, while also enabling more accurate detection of specific modifications when applicable. +Tombo enables three methods for detecting shifts in current signal level, indicative of non-canonical bases. These three methods allow researchers to investigate non-canonical bases given any sample type, while enabling more accurate detection of specific modifications when applicable. ---- .. figure:: _images/testing_method_comparison.png :align: center - :scale: 30% - + Tombo modified base testing methods. ---- -All three methods are accessed by the ``detect_modifications`` Tombo command as described below. +All three methods are accessed by the ``tombo detect_modifications`` command group as described below. **TL;DR**: -* To identify 5-methylcytosine (5mC) and N6-methyladenosine (6mA), run ``detect_modifications alternative_model`` with the ``--alternate-bases 5mC 6mA`` option -* For more experimental de novo modified base detection simply run ``detect_modifications de_novo`` with just a set of reads -* For modified base detection via comparison to a control sample (e.g. PCR) run ``detect_modifications sample_compare`` with a control set of reads (``--control-fast5-basedirs``) -* The ``detect_modifications`` command will produce a binary file (not intended for use outside the Tombo framework) - - - To extract useful text files see the ``text_output`` commands - - To visualize raw signal around significant regions use the ``plot most_significant`` command - - To assess testing results around a known motif use the ``plot motif_with_stats``, ``plot roc``, and ``plot per_read_roc`` commands - -.. hint:: - - The ``resquiggle`` command must be run on a set of reads before processing with ``detect_modifications``.
- -------------------- -Statistical Testing -------------------- +* To identify 5-methylcytosine (5mC; DNA or RNA) and N6-methyladenosine (6mA; DNA only), run ``tombo detect_modifications alternative_model`` with the ``--alternate-bases 5mC 6mA`` option +* For more experimental *de novo* modified base detection simply run ``tombo detect_modifications de_novo`` with just a set of reads +* For modified base detection via comparison to a control sample (e.g. PCR or IVT) run ``tombo detect_modifications sample_compare`` with a control set of reads (``--control-fast5-basedirs``) +* The ``tombo detect_modifications`` command will produce a binary file (not intended for use outside the Tombo framework) -For all statistical testing methods, the result is a binary Tombo statistics file. This file contains statistics associated with each genomic base producing a valid result. This file is not intended for use outside of the Tombo framework. Several Tombo commands (e.g. ``text_output browser_files``, ``text_output signif_sequence_context`` and ``plot most_significant``) take the statistics file as an input, accommodating many user pipelines downstream of modified base detection. + - To extract useful text files see the ``tombo text_output`` commands + - To visualize raw signal around significant regions use the ``tombo plot most_significant`` command + - To assess testing results around a known motif use the ``tombo plot motif_with_stats``, ``tombo plot roc``, and ``tombo plot per_read_roc`` commands -Of particular interest, the statistics file contains the fraction of reads at each genomic position passing a set threshold or falling outside of a set interval if 2 values are provided to the ``--single-read-threshold`` option. The default value for this parameter is set for each testing method and for DNA and RNA data types using the default settings. Note that changing testing parameters may require a new ``--single-read-threshold`` for optimal results. For example, changing the ``--fishers-method-context`` option value in either the ``de_novo`` or ``compare_sample`` methods is likely to require a new threshold value. - -For ``--single-read-threshold`` values with an interval or for the ``alternative_model`` with values greater than 0, the number of reads falling outside of the threshold values is saved under the ``valid_cov`` column in the statistics file. These values can be output with the ``text_output browser_files --file-types valid_coverage`` command. - -For the de novo and alternative model testing approaches a default canonical model is used (included with Tombo). Users may also train their own canonical Tombo model (possibly for an older chemistry version) and test against this model using the advanced ``--tombo-model-filename`` option. See more in the :doc:`model_training` section. - -Another available output from the ``detect_modifications`` command is a per-read (and per-base) binary (HDF5) statistics file (via ``--per-read-statistics-basename`` option). This file is currently made available for research on per-read modified base detection including plotting via the ``plot per_read`` command and further computing via the ``detect_modifications aggregate_per_read_stats`` command. For advanced researchers, the per-read statistics data can be accessed (including random access to particular regions of the genome) using the ``tombo.tombo_stats.PerReadStats`` class from the Tombo python API. +.. 
hint:: + The ``tombo resquiggle`` command must be run on a set of reads before processing with ``tombo detect_modifications``. -Alternative Model Method -======================== +Specific Alternate Base Detection (Recommended) +=============================================== -In order to specifically detect 5mC and 6mA, use the ``detect_modifications alternative_model`` command. Users may also train their own alternative base Tombo models and test against these with the advanced ``--alternate-model-filenames`` option. See more details in the :doc:`model_training` section. +In order to specifically detect 5mC and/or 6mA, use the ``tombo detect_modifications alternative_model`` command. This command computes a statistic similar to a log likelihood ratio (LLR) but dynamically scaled to be more robust to outlier signal levels. This statistic is computed for each "swap base" within each read provided (e.g. each cytosine for 5mC detection or each adenine for 6mA detection). -The ``detect_modifications alternative_model`` command will compute a statistic similar to a log likelihood ratio (LLR) but dynamically scaled to be more robust to outlier signal assignment. This statistic is computed for each "swap base" within each read provided (e.g. computed at each cytosine for 5mC detection and each adenine for 6mA detection). This statistic is computed by scaling the LLR by the normal likelihood function with the same variance and mean halfway between the canonical and alternative expected signal levels. Three additional scaling factors are added to this function in order to give greater weight to sequence contexts with larger differences between the canonical and alternative expected signal levels, which inherently provide more power to distinguish the canonical and alternative base signal levels. These parameters are also set so that values are on relatively the same scale as a log likelihood ratio for setting ``--single-read-threshold`` values. Default values for the scale factors below are :math:`S_f = 4`, :math:`S_{f2} = 3` and :math:`S_p = 0.3`, which produce the functions shown in the figure below. Users can experiment with the effect of these parameters with the provided ``scripts/outlier_robust_llr.R`` script. @@ -68,13 +51,12 @@ In order to compute a standard log likelihood ratio, use the ``--standard-log-li .. figure:: _images/outlier_robust_llr.gif :align: center - :scale: 30% - + Tombo outlier-robust versus standard likelihood ratio statistic over varied differences between canonical and alternative expected signal levels. ---- -This statistic is computed and summed over all positions modeled. The default DNA model is a 6-mer, so the signal at the six surrounding genomic bases contribute to the resulting statistic at any one position. For example, for 5mC detection within in a TGGTA **C** GTCCG context, the signal will be tested against expected canonical and alternative 5mC levels at the following locations:: +This statistic is computed and summed over all positions where the base of interest is included in the modeled k-mer. The default DNA model is a 6-mer, so the signal at the six surrounding genomic bases contributes to the resulting statistic at any one position.
+For example, for 5mC detection within a TGGTA **C** GTCCG context, the signal will be tested against expected canonical and alternative 5mC levels at the following locations::
 
    TGGTA **C** GTCCG
    -----------------
@@ -92,17 +74,16 @@ New alternative base models will be added as they are trained and validated inte
 
     tombo detect_modifications alternative_model --fast5-basedirs \
         --alternate-bases 5mC 6mA --statistics-file-basename sample.alt_model
 
-    # with user trained alternative base model
-    tombo detect_modifications alternative_model --fast5-basedirs \
-        --alternate-model-filenames alternative_base.tombo.model \
-        --statistics-file-basename sample.user_alt_model
+.. hint::
+
+   Users may also train their own alternative base Tombo models and test against these with the advanced ``--alternate-model-filenames`` option. See more details in the :doc:`model_training` section.
 
 De novo Non-canonical Base Method
 =================================
 
-In order to perform *de novo* non-canonical base detection, use the ``detect_modifications de_novo`` command.
+In order to perform *de novo* non-canonical base detection, use the ``tombo detect_modifications de_novo`` command. This method is ideal for unknown modification motif detection when used in combination with the ``tombo text_output signif_sequence_context`` command and motif detection software (e.g. `MEME `_).
 
-For each read, this will perform a hypothesis test against the canonical model based on the genomic sequence at each position. Note that this method can be quite error prone and may result in a high false positive rate, but may be of use in a research and development setting. This method also has the lowest barrier to entry, requiring only a set of reads and a genome, allowing any nanopore researcher to start investigating potentially any type of modified base.
+For each read at each position, this method performs a hypothesis test against the canonical model based on the genomic sequence. Note that this method can be quite error prone and may result in a high false positive rate, especially on a per-read basis. This method also has the lowest barrier to entry, requiring only a set of reads and a reference sequence, allowing any nanopore researcher to start investigating potentially any type of modified base.
 
 .. code-block:: bash
 
@@ -112,13 +93,9 @@ For each read, this will perform a hypothesis test against the canonical model b
 
 Canonical Sample Comparison Method
 ==================================
 
-In order to perform *canonical sample comparison* modified base detection, use the ``detect_modifications sample_compare`` command with a second set of reads from the same biological sample containing only canonical bases (e.g. PCR for DNA or IVT for RNA) via the ``--control-fast5-basedirs``.
+In order to perform *canonical sample comparison* modified base detection, use the ``tombo detect_modifications sample_compare`` command with a second set of reads from the same biological sample containing only canonical bases (e.g. PCR for DNA or IVT for RNA) via the ``--control-fast5-basedirs`` option.
 
-For each sample read, this will perform a hypothesis test against a normal distribution estimated from the signal level observed from the control sample reads at each genome position. This method does not always identify the exact modification position or the identity of the modified base as with the *de novo* method.
-
-Note that no model is used in the application of this method. Instead the testing null distribution is estimated at each genomic location from the control set of reads.
-
-For both this method, as well as the *de novo* method, the ``--fishers-method-context`` option will combine test values, using `Fisher's Method `_, over a moving window extending a number of positions in either direction. Due to the nature of nanopore sequencing, the genomic context surrounding the read head effect that current at any position. Thus shifts in signal due to a modified base may occur at several positions to either side of the true modified location. Thus combining statistical test values across several genomic positions can help to center significant values on the truly modified position. The default value for this parameter is 1, but reasonable results can be obtained for values between 0 and 3.
+For each sample read, this will perform a hypothesis test against a distribution estimated from the signal levels observed from the control sample reads at each genome position. As of version 1.4, this method uses the canonical base model as a prior for this estimated distribution, improving results for low coverage regions (disable the canonical prior with the ``--sample-only-estimates`` option, or lower the prior's impact on estimates by lowering the default ``--model-prior-weights`` values).
 
 .. code-block:: bash
 
@@ -126,44 +103,71 @@ For both this method, as well as the *de novo* method, the ``--fishers-method-co
         --control-fast5-basedirs \
         --statistics-file-basename sample.compare_sample
 
------------------------------
-Aggregate Per-read Statistics
------------------------------
+.. note::
+
+   Due to the nature of nanopore sequencing, the genomic context surrounding the read head affects the current at any position. Thus shifts in signal due to a modified base may occur at several positions to either side of the true modified location. In order to account for this, the canonical sample comparison and *de novo* modified base detection methods accept the ``--fishers-method-context`` option, which combines test values, using `Fisher's Method `_, over a moving window across the genome. This can help to center significant values on modified base positions. The default value for this parameter is 1, but reasonable results can be obtained for values between 0 and 3.
+
+Aggregating Per-read Statistics
+===============================
 
-In order to facilitate research on the per-genomic base aggregation across reads, Tombo provides the ``detect_modifications aggregate_per_read_stats`` command. The primary utility for this command is to enable easier manipulation of the per-read threshold values. It is not possible to change other testing parameters from this command (e.g. ``--fishers-method-context`` or ``--tombo-model-filename``).
+All of the above methods compute per-read, per-genome location test statistics. In order to facilitate research at the genomic location level, these per-read statistics are combined at each genomic location by applying a global threshold identifying each read as supporting a canonical or alternative base. This results in a fraction of reads indicating a modified base at each genomic location. This global threshold may consist of a single threshold value or a pair of values (where test statistics between the values do not contribute to the estimated fraction of modified reads).
+
+All ``tombo detect_modifications`` methods enable output of per-read test statistics (``--per-read-statistics-basename``). Tombo also provides the ``tombo detect_modifications aggregate_per_read_stats`` command in order to apply different global threshold values to per-read statistics without re-computing these statistics. Note that it is not possible to change other testing parameters from this command (e.g. ``--fishers-method-context``).
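+
+A minimal sketch of this aggregation at a single genomic location, assuming per-read statistics where more negative values support the alternative base (the ``lower``/``upper`` pair stands in for ``--single-read-threshold`` values):
+
+.. code-block:: python
+
+    import numpy as np
+
+    def aggregate_per_read_stats(read_stats, lower, upper):
+        """Fraction of reads supporting a modified base at one position."""
+        read_stats = np.asarray(read_stats)
+        n_mod = (read_stats <= lower).sum()    # reads called as modified
+        n_canon = (read_stats >= upper).sum()  # reads called as canonical
+        valid_cov = n_mod + n_canon            # stats between the values are ignored
+        # pseudo-counts could be added here to dampen low coverage estimates
+        # (see the Dampened Fraction Estimates section below)
+        return n_mod / valid_cov if valid_cov > 0 else float('nan')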
+
+Dampened Fraction Estimates
+===========================
+
+At low coverage locations, estimates of the fraction of modified reads can be poor. Thus the ``--coverage-dampen-counts`` option is provided in order to dampen the estimated fraction of modified reads at low coverage locations. This allows easier use of the fraction statistic in downstream analysis.
+
+  - The fraction estimate includes pseudo-counts added to the un-modified and modified read counts (as specified by the ``--coverage-dampen-counts`` option)
+  - This is equivalent to using a beta prior when estimating the fraction of reads modified at each position
+  - Test the effect of different dampen counts using the ``scripts/test_beta_priors.R`` script (the default values are shown below)
+  - The raw fraction is still included in the statistics file (access via the python API)
+
+----
+
+.. figure:: _images/dampened_fraction.png
+   :align: center
+
+   Heatmap showing the resulting dampened fraction of modified reads given the default ``--coverage-dampen-counts`` values over a range of coverage and number of un-modified reads.
+
+----
 
-----------------
 Multi-processing
-----------------
+================
 
 Tombo statistical testing provides the option to perform testing spread across multiple processes. This also limits the memory requirement for modified base detection, as only signal levels within a multiprocess block are held in memory. For very high coverage samples, consider lowering the ``--multiprocess-region-size`` value to minimize computational memory usage.
 
 Multi-processing is performed over batches delineated by regular intervals across chromosomes covered by at least one read. The interval size is determined by the ``--multiprocess-region-size`` option and processed by a number of processors indicated by the ``--processes`` option. The produced per-base (and per-read) results are identical no matter the multi-processing options selected. These regions are also used as batches to store the per-read statistics file.
 
-----------------------------
 Tombo Statistics File Format
-----------------------------
+============================
+
+For all modified base detection methods, the result is a binary Tombo statistics file. This file contains statistics associated with each genomic base producing a valid result. This file is not intended for use outside of the Tombo framework. Several Tombo commands (e.g. ``tombo text_output browser_files``, ``tombo text_output signif_sequence_context`` and ``tombo plot most_significant``) take the binary statistics file as an input, accommodating many user pipelines downstream of modified base detection.
 
-While the Tombo statistics file is meant to be a binary file not processed by outside tools its contents are described here for completeness. The Tombo statistics file is `HDF5 format `_. There is one attribute at the root level, ``stat_type`` indicating which testing method was used (``model_compare``, ``de_novo`` or ``sample_compare``).
+While the Tombo statistics file is meant to be a binary file not processed by outside tools, its contents are described here for completeness. Access to this file is recommended through the ``tombo.tombo_stats.TomboStats`` object in the Tombo python API.
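+
+For a quick look at the raw contents, the block structure described below can also be inspected directly with ``h5py``. This is a sketch only; the group and dataset names follow the layout described in this section, and the filename is an example:
+
+.. code-block:: python
+
+    import h5py
+
+    with h5py.File('sample.alt_modified_base_detection.5mC.tombo.stats', 'r') as stats_fp:
+        print(stats_fp.attrs['stat_type'])  # model_compare, de_novo or sample_compare
+        for block_name, block in stats_fp['Statistic_Blocks'].items():
+            print(block_name, block.attrs['chrm'], block.attrs['start'],
+                  block.attrs['strand'])
+            # records contain frac, pos, cov, control_cov and valid_cov fields
+            print(block['block_stats'][:5])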
-The per-base statistics are stored in a dataset, ``stats``, containing one record for each genomic base. Each record contains the following attributes: ``frac``, ``pos``, ``chrm``, ``strand``, ``cov``, ``control_cov``, and ``valid_cov``.
+.. important::
+
+   All other optional arguments to the ``tombo.tombo_stats.TomboStats`` constructor should be left as ``None``; setting these values will delete the file and construct a blank statistics file.
 
-``pos``, ``chrm`` and ``strand`` define the zero-based genomic position for this record.
+The Tombo statistics file is in `HDF5 format `_. Attributes at the root level are 1) ``stat_type`` indicating which testing method was used (``model_compare``, ``de_novo`` or ``sample_compare``), 2) ``block_size`` indicating the number of genomic bases in each statistics block and 3) ``Cov_Threshold`` containing the coverage threshold applied to this file.
+
+Blocks of statistics are stored in the ``Statistic_Blocks`` group. Within this group, each block of statistics is found within a group named ``Group_NNN``. Each group contains attributes for the block ``start``, ``chrm`` and ``strand``. The ``block_stats`` dataset contains the per-location statistics records. Each record contains the following attributes: ``frac``, ``pos``, ``chrm``, ``strand``, ``cov``, ``control_cov``, and ``valid_cov``.
 
 ``frac`` contains the fraction of valid (not including per-read statistics within the interval specified by ``--single-read-threshold``) reads at this genomic position identified as the standard base.
 
 ``cov``, ``control_cov``, and ``valid_cov`` contain the read coverage at the genomic position for the sample and control reads. ``control_cov`` is only applicable for the control sample comparison testing method. ``valid_cov`` contains the number of reads contributing to the ``frac`` of tested reads as defined by ``--single-read-threshold``.
 
--------------------------------
 Per-read Statistics File Format
--------------------------------
+===============================
 
-Per-read statistics can be stored by setting the ``--per-read-statistics-basename`` option to any ``detect_modifications`` command. This output file can then be used in downstream Tombo sub-commands (e.g. the ``plot per_read`` and ``detect_modifications aggregate_per_read_stats`` commands).
+Per-read statistics can be stored by setting the ``--per-read-statistics-basename`` option to any ``tombo detect_modifications`` command. This output file can then be used in downstream Tombo sub-commands (e.g. the ``tombo plot per_read`` and ``tombo detect_modifications aggregate_per_read_stats`` commands).
 
 For advanced users, the Tombo per-read statistics file can be accessed via the Tombo python API using the ``tombo.tombo_stats.PerReadStats`` class. This class provides initialization, simply taking the per-read statistics filename. The ``PerReadStats`` class supports the ``get_region_stats`` function which takes a ``tombo.tombo_helper.intervalData`` object specifying an interval of interest. This will return a numpy array containing a record for each read (specified by the ``read_id`` field) and each tested genomic position (``pos`` field) along with the test statistic (``stat`` field) at that location.
 
 .. important::
-    
+
     All other optional arguments to the ``tombo.tombo_stats.PerReadStats`` constructor should be left as ``None``; setting these values will delete the file and construct a blank per-read statistics file.
 
 The per-read statistics file is in the HDF5 format. All blocks are stored within the ``Statistic_Blocks`` slot. The size of the blocks is stored in the ``block_size`` attribute (defined by the ``--multiprocess-region-size`` option) and the type of statistical test applied is stored in the ``stat_type`` attribute.
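+
+A short sketch of this per-read statistics API (the filename and the ``intervalData`` argument values shown here are hypothetical):
+
+.. code-block:: python
+
+    from tombo import tombo_helper, tombo_stats
+
+    # open an existing per-read statistics file; leave all other arguments as None
+    pr_stats = tombo_stats.PerReadStats('sample.5mC.tombo.per_read_stats')
+    # genomic interval of interest
+    reg = tombo_helper.intervalData(chrm='chr1', start=1000, end=1100, strand='+')
+    # numpy array with read_id, pos and stat fields for each tested read position
+    reg_per_read_stats = pr_stats.get_region_stats(reg)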
diff --git a/docs/plotting.rst b/docs/plotting.rst
index 79271ff..a071acf 100644
--- a/docs/plotting.rst
+++ b/docs/plotting.rst
@@ -13,11 +13,11 @@ Plot Region Selection
 
 Most Tombo plotting functions are genome-anchored. These commands create plots analogous to a genome browser, but including all raw signal within a region. The available commands differ in their mode of genome region selection. This allows users to plot regions of interest for many research contexts.
 
-* ``plot max_coverage`` - Select regions with maximal coverage
-* ``plot genome_location`` - Select specified genomic locations
-* ``plot motif_centered`` - Select regions with a specific motif (follows `NEB single letter codes `_)
-* ``plot max_difference`` - Select regions where two samples' average signal differs most
-* ``plot most_significant`` - Select most consistently/significantly mofidied locations
+* ``tombo plot max_coverage`` - Select regions with maximal coverage
+* ``tombo plot genome_location`` - Select specified genomic locations
+* ``tombo plot motif_centered`` - Select regions with a specific motif (follows `NEB single letter codes `_)
+* ``tombo plot max_difference`` - Select regions where two samples' average signal differs most
+* ``tombo plot most_significant`` - Select most consistently/significantly modified locations
 
 These plotting commands produce raw signal level plots such as the example below. Options are available for each of these plots to logically select genomic regions based on the given criterion.
 
@@ -29,8 +29,7 @@ These plotting commands produce raw signal level plots such at the example below
 
 .. figure:: _images/single_samp.png
    :align: center
-   :scale: 30%
-   
+
    Single sample raw signal plot
 
 ----
 
@@ -46,20 +45,17 @@ Control these plots with these options: ``--control-fast5-basedirs``, ``--plot-s
 
 .. figure:: _images/sample_comp.png
    :align: center
-   :scale: 30%
-   
+
    Control sample comparison plot
 
 .. figure:: _images/model_comp.png
    :align: center
-   :scale: 30%
-   
+
    Canonical model plot
 
 .. figure:: _images/alt_model_comp.png
    :align: center
-   :scale: 30%
-   
+
    Alternate model plot
 
 ----
 
@@ -73,49 +69,46 @@ When high coverage regions are plotted, the raw signal plots can become less int
 
 .. figure:: _images/boxplot.png
    :align: center
-   :scale: 30%
-   
+
    Boxplot over-plotting option
 
 .. figure:: _images/quantile.png
    :align: center
-   :scale: 30%
-   
+
    Quantile over-plotting option
 
 .. figure:: _images/density.png
    :align: center
-   :scale: 30%
-   
+
    Density over-plotting option
 
 ----
 
+---------------------------
 Per-read Statistic Plotting
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
+---------------------------
 
-All testing in the Tombo framework is applied first on a per-read basis; to visualize these per-read results, per-read statistic plots are available. Per-read statistics are an optional output from any ``detect_modifications`` command via the ``--per-read-statistics-filename`` option, and the output file specified by this option is required in order to the plot per-read statistics command. Create these plots with the ``plot per_read`` command.
+All testing in the Tombo framework is applied first on a per-read basis; to visualize these per-read results, per-read statistic plots are available. Per-read statistics are an optional output from any ``tombo detect_modifications`` command via the ``--per-read-statistics-basename`` option, and the output file produced by this option is required in order to run the per-read statistic plotting command. Create these plots with the ``tombo plot per_read`` command.
 
 ----
 
 .. figure:: _images/pre_read_5mC.png
    :align: center
-   :scale: 30%
-   
+
    Alternative 5mC model testing
 
 .. figure:: _images/per_read_do_novo.png
    :align: center
-   :scale: 30%
-   
+
    De novo, standard model, per-read testing
 
 ----
 
+---------------------------------
 Motif-centered Statistic Plotting
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+---------------------------------
 
-In several biological contexts base modifications occur at specific motifs. In order to visualize the distribution of Tombo statistical test results centered on a motif of biolgical interest (or a discovered motif) the ``plot motif_with_stats`` command is provided.
+In several biological contexts base modifications occur at specific motifs. In order to visualize the distribution of Tombo statistical test results centered on a motif of biological interest (or a discovered motif), the ``tombo plot motif_with_stats`` command is provided.
 
 This command identifies a number (defined by ``--num-statistics``) of genomic regions centered on this motif with the highest significance testing values. Importantly, the identified highest testing values need not be found within the actual motif, but simply within a region containing the motif defined by ``--num-context``. In this way, non-interesting motifs (motifs which don't direct modifications) will not contain more significant statistics centered on a specific position within the provided motif. A number (defined by ``--num-regions``) of example regions with the highest test statistics centered on the motif of interest are added at the top portion of this plot.
 
@@ -123,8 +116,7 @@ This command identifies a number (defined by ``--num-statistics``) of genomic re
 
 .. figure:: _images/stat_dist.png
    :align: center
-   :scale: 30%
-   
+
    Example statistics distribution around `biologically relevant CCWGG motif in E. coli `_
 
 ----
 
@@ -136,14 +128,13 @@ Other Plotting Commands
 
 K-mer Level Distributions
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
-In order to investigate the k-mer signal current levels of a particular set of reads, the ``plot kmer`` command is provided. This plot extracts the observed signal levels from a set of reads and groups the signal by the local genomic sequence context (k-mer) and plots the resulting distributions of signal levels.
+In order to investigate the k-mer signal current levels of a particular set of reads, the ``tombo plot kmer`` command is provided. This plot extracts the observed signal levels from a set of reads, groups the signal by the local genomic sequence context (k-mer) and plots the resulting distributions of signal levels.
 
 ----
 
 .. figure:: _images/kmer_levels.png
    :align: center
-   :scale: 30%
-   
+
    Example k-mer current level distribution plot
 
 ----
 
@@ -151,32 +142,30 @@ In order to investigate the k-mer signal current levels of a particular set of r
 
 ROC Curves
 ^^^^^^^^^^
 
-In order to validate the performance of modified base detection results at a known sequence motif, the ``plot roc`` command is provided. This command takes a Tombo statistics file, corresponding motif descriptions and the genome FASTA file. The "area under the curve" (AUC) for each motif is printed and the precision-recall curve is also plotted for each motif on the second page of the resulting PDF. Note that only genomic positions with the canonical base of interest are included in the results from this command (since the alternative model only makes calls at these positions).
+In order to validate the performance of modified base detection results at a known sequence motif, the ``tombo plot roc`` command is provided. This command takes a Tombo statistics file, corresponding motif descriptions and the genome FASTA file. The "area under the curve" (AUC) for each motif is printed and the precision-recall curve is also plotted for each motif on the second page of the resulting PDF. Note that only genomic positions with the canonical base of interest are included in the results from this command (since the alternative model only makes calls at these positions).
 
 Below is an example command and resulting plot for identifying the known dam and dcm methylase contexts in E. coli using all three provided testing methods.
 
 .. code-block:: bash
 
     tombo plot roc --statistics-filenames \
-        alt_testing.native_e_coli.5mC.tombo.stats \
-        alt_testing.native_e_coli.6mA.tombo.stats \
-        de_novo_testing.native_e_coli.tombo.stats \
-        sample_comp_testing.tombo.stats \
+        native.sample_comp.tombo.stats \
+        native.de_novo.tombo.stats \
+        native.alt.5mC.tombo.stats \
+        native.alt.6mA.tombo.stats \
         --motif-descriptions \
-        CCWGG:2:"dcm 5mC Alt Model" \
-        GATC:2:"dam 6mA Alt Model" \
-        CCWGG:2:"dcm 5mC De novo"::GATC:2:"dam 6mA De novo" \
-        CCWGG:2:"dcm 5mC Sample Comp"::GATC:2:"dam 6mA Sample Comp" \
-        --genome-fasta ~/e_coli.fasta \
-        --pdf-filename native_e_coli.roc.pdf --minimum-test-reads 10
+        CCWGG:2:"dcm 5mC Sample Compare"::GATC:2:"dam 6mA Sample Compare" \
+        CCWGG:2:"dcm 5mC De Novo"::GATC:2:"dam 6mA De Novo" \
+        CCWGG:2:"dcm 5mC Alt. Model" \
+        GATC:2:"dam 6mA Alt. Model" \
+        --genome-fasta e_coli.fasta
 
 ----
 
 .. figure:: _images/roc.png
    :align: center
-   :scale: 30%
-   
-   Example ROC curve plot (re-coloring not avaailable directly from Tombo plot roc command)
+
+   Example ROC curve plot
 
 ----
 
@@ -196,9 +185,5 @@ It is also possible to compute and plot validation results on a per-read basis f
 
 .. figure:: _images/per_read_stat_dist.png
    :align: center
-   :scale: 30%
-   
-   Example per-read statistic distribution
-
-----
+
+   Example per-read statistic distribution
diff --git a/docs/resquiggle.rst b/docs/resquiggle.rst
index a69c962..dc00916 100644
--- a/docs/resquiggle.rst
+++ b/docs/resquiggle.rst
@@ -2,29 +2,29 @@ Re-squiggle Algorithm
 *********************
 
-The electric current signal level data produced from a nanopore read is referred to as a squiggle. Base calling this squiggle information generally contains some errors compared to a refernce sequence. The re-squiggle algorithm defines a new assignment from squiggle to genomic sequence, hence a re-squiggle.
+The electric current signal level data produced from a nanopore read is referred to as a squiggle. Base calling this squiggle information generally contains some errors compared to a reference sequence. The re-squiggle algorithm defines a new assignment from squiggle to genomic sequence, hence a re-squiggle.
 
 The re-squiggle algorithm is the basis for the Tombo framework. The re-squiggle algorithm takes as input a read file (in FAST5 format) containing raw signal and associated base calls. The base calls are mapped to a genome or transcriptome reference and then the raw signal is assigned to the genomic sequence based on an expected current level model.
 
 **TL;DR**:
 
-* Re-squiggle must be run before modified base detection and other Tombo command.
-* Minimally the command takes a directory containing FAST5 files and a genome/transcriptome reference.
-
+* The ``tombo resquiggle`` command must be run on a set of reads before modified base detection or other Tombo commands.
+* A directory containing FAST5 read files and a genome/transcriptome reference must be provided.
+
+  - The reference sequence may be previously known or discovered from this sample.
-* FAST5 files must contain basecalls (as produced by albacore in fast5 mode or added with ``annotate_raw_with_fastqs``).
-
-  - FAST5 files need NOT contain the "Events" table (required by ``nanoraw`` the Tombo predecessor).
+* Importantly, the reference sequence is assumed to be correct, so polishing to create a personalized genome may improve performance, particularly for samples divergent from the reference or poorly assembled genomes.
+* Raw read FAST5 files must contain basecalls.
+
+  - Add basecalls from a set of FASTQs to raw read files with the ``tombo preprocess annotate_raw_with_fastqs`` command.
+  - Read files need not contain ``Events`` data (as output with ``fast5`` mode from albacore).
 * Tombo currently supports both DNA and RNA data (only R9.4 and R9.5; 1D and 1D2 data; R9.*.1 chemistries). Other data may produce sub-optimal results (e.g. R7 data).
 * DNA and RNA reads will be detected automatically and processed accordingly (set explicitly with ``--dna`` or ``--rna``).
-
-  - Tombo does not perform spliced mapping. Thus a transcriptime reference must be passed to the re-squiggle command for RNA samples. For futher details on Tombo RNA processing see the :doc:`rna` section.
-* Run ``resquiggle`` over multiple cores with the ``--processes`` option.
 
+  - Tombo does not perform spliced mapping. Thus a transcriptome reference must be passed to the re-squiggle command for RNA samples. For further details on Tombo RNA processing see the :doc:`rna` section.
 
-  - The ``--threads-per-process`` is also provided, but it is generally recommended that this option remains set to the default of 1, though it may improve results on some computing environments.
+* Run ``tombo resquiggle`` over multiple cores with the ``--processes`` option.
 
 -----------------
 Algorithm Details
@@ -43,7 +43,7 @@ Genome Mapping
 
 The genome mapping is performed via the python API to ``minimap2`` (`mappy python package `_).
 
-Read base called sequence location within the FAST5 file is defined by the ``--basecall-group`` and ``--basecall-subgroups`` command line options. The default values of these parameters point to the default location for base calls from albacore or ``annotate_raw_with_fastqs``.
+Read base called sequence location within the FAST5 file is defined by the ``--basecall-group`` and ``--basecall-subgroups`` command line options. The default values of these parameters point to the default location for base calls from albacore or ``tombo preprocess annotate_raw_with_fastqs``.
 
 The genomic sequence for successfully mapped reads is then passed on to the :ref:`seqeunce_to_signal` stage.
 
@@ -56,20 +56,28 @@ Before the first iteration of the event detection and signal to sequence assignm
 
 As of Tombo version 1.3, after the first iteration, new shift and scale parameters are computed by matching the expected signal levels with those observed from the first iteration of signal to sequence assignment. The `Theil-Sen estimator `_ for the relationship between expected and observed signal levels is computed and used as a correction factor for the previous scale parameter. A shift correction factor is also computed taking the median of intercepts over each base in the read.
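+
+A sketch of one round of this correction, assuming paired arrays of expected and observed event levels from a read (the exact composition with the previous normalization parameters is illustrative here):
+
+.. code-block:: python
+
+    import numpy as np
+    from scipy.stats import theilslopes
+
+    def correct_scaling(shift, scale, expected_levels, observed_levels):
+        # Theil-Sen slope of observed against expected levels corrects scale
+        slope = theilslopes(observed_levels, expected_levels)[0]
+        # median per-base intercept provides the shift correction factor
+        intercept = np.median(observed_levels - (slope * expected_levels))
+        return shift + (scale * intercept), scale * slope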
 
-If either the shift or scale correction factors exceed a preset threshold, an additional round of event detection and sequence to signal to signal assignment is performed. This continues until the corrections factors are small enough or a maximum number of iterations are performed. Command line parameters to control this procedure can be found using the ``tombo resquiggle --print-advanced-arguments`` command.
+If either the shift or scale correction factors exceed a preset threshold, an additional round of sequence to signal assignment is performed. This continues until the correction factors are small enough or a maximum number of iterations is reached. Command line parameters to control this procedure can be found using the ``tombo resquiggle --print-advanced-arguments`` command.
 
 This method should be more robust to samples with higher modified base content than mean based sequence-dependent correction methods (e.g. M.O.M.). This per-read sequence-dependent normalization has provided much better results than previous Tombo scaling methods and is thus strongly recommended. Previous scaling methods are still made available for research purposes (see ``tombo resquiggle --print-advanced-arguments``).
 
+.. note::
+
+   As of version 1.4, RNA samples are normalized over events after stall masking in order to provide more accurate normalization factors. For the RNA normalization computation, the end of the read (the beginning in sequencing time) is trimmed, so that the DNA adapter does not affect normalization parameter estimation.
+
 Event Detection
 ---------------
 
-The Tombo algorithm does not require the "Events" table (raw signal assignment to base calls). Instead, Tombo discovers events from the raw signal. This segmented signal makes downstream processing steps more efficient and stable. The Tombo event detection algorithm is different from the event detection performed in previous versions of albacore, but produces similar results.
+The Tombo resquiggle algorithm does not require the "Events" table (raw signal assignment to base calls). Instead, Tombo discovers events from the raw signal. This segmented signal makes downstream processing steps more efficient and stable. The Tombo event detection algorithm is different from the event detection performed in previous versions of albacore, but produces similar results.
 
 Events are determined by identifying large shifts in current level, by taking the running difference between neighboring windows of raw signal (explicitly set this parameter with the ``--segmentation-parameters`` option). The largest jumps (or most significant via a t-test for RNA) are chosen as the breakpoints between events. The mean of normalized raw signal is then computed for each event.
 
-The ``--segmentation-parameters`` values have been optimized for DNA and RNA data types, so DNA and RNA read types should not be mixed in processing.
+The ``--segmentation-parameters`` values are optimized for DNA and RNA data types, so DNA and RNA read types should not be mixed in processing.
+
+.. note::
+
+   For RNA samples, stalled bases are detected using a moving window mean approach and event boundaries located within a stalled base are removed from downstream processing.
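+
+An illustrative sketch of this neighboring-window segmentation (the ``window`` and ``n_events`` values are hypothetical; Tombo sets these via ``--segmentation-parameters``):
+
+.. code-block:: python
+
+    import numpy as np
+
+    def detect_events(norm_signal, window=3, n_events=100):
+        # running mean over each window of raw signal
+        csum = np.concatenate([[0.0], np.cumsum(norm_signal)])
+        win_means = (csum[window:] - csum[:-window]) / window
+        # absolute difference between the means of neighboring windows
+        diffs = np.abs(win_means[window:] - win_means[:-window])
+        # the largest jumps become the breakpoints between events
+        breaks = np.sort(np.argpartition(diffs, -n_events)[-n_events:]) + window
+        # mean normalized signal within each event
+        return [seg.mean() for seg in np.split(norm_signal, breaks) if seg.size]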
 
 .. _seqeunce_to_signal:
 
@@ -80,7 +88,7 @@ Given the mapped genomic sequence and normalized, segmented, raw signal, the seq
 
 This matching is found by a dynamic programming/dynamic time warping algorithm to match event signal levels with expected signal levels given genomic sequence.
 
-To compute this matching, a static banded matrix is constructed by computing the z-score for event level (x-axis) against genomic positions (y-axis). The negative absolute value z-score is shifted to an expected value of zero to fill the banded matrix (see figure **a** below). A forward pass computes the maximal cummulative score up to each matched event to genome position (see figure **b** below).
+To compute this matching, first a static banded matrix is constructed by computing the z-score for each event level (x-axis) against each genomic position (y-axis). The negative absolute value z-score is shifted to an expected value of zero to fill the banded matrix (see figure below). A forward pass computes the maximal cumulative score up to each matched event to genome position (see figure below).
 
 At each iteration (moving from bottom left to top right) the maximal score is taken over three possibilities 1) staying in the same genomic position, and accumulating the shifted z-score 2) matching an event with a genomic position (with score bonus) 3) skipping this genomic position (with a score penalty). The score match and skip penalties are defined by the ``--signal-align-parameters`` option. The default values have been optimized for DNA and RNA data types. From this forward pass, the maximal score along the last genomic position is taken and traced back to obtain a matching of sequence and signal.
 
@@ -88,38 +96,36 @@ At each iteration (moving from bottom left to top right) the maximal score is ta
 
 .. figure:: _images/begin_half_z_scores.png
    :align: center
-   :scale: 110%
-   
+
    Read start shifted half-normal scores
 
+----
+
 .. figure:: _images/begin_forward_pass.png
    :align: center
-   :scale: 110%
-   
+
    Read start forward pass scores and traceback path
 
 ----
 
-The algorithm first uses a large bandwidth (5000 events over the first 250 genomic bps) to identify the start of the genomic sequence within the events. This is necessary as some portion at the beginning of a read is not base called and some additional sequence may have been trimmed from the alignment.
+The algorithm first uses a large bandwidth to identify the start of the genomic sequence within the events. This is necessary as some portion at the beginning of a read is not base called and some additional sequence may have been trimmed from the alignment.
 
-If a read is short enough (less than 5250 events or less than 250 bps of called sequence), then the whole sequence to signal matching will be performed with a single run with an appropriate static bandwidth.
+If a read is short enough, then the whole sequence to signal matching will be performed with an appropriate static bandwidth.
 
-For longer reads, the above computed start matching position is taken and then the same dynamic programming solution is applied except a smaller adaptive band is now used (see figure below). The bandwidth is definied by the ``--signal-align-parameters`` option and again has been optimized for DNA and RNA data types. At each genomic position, the band position is defined to center on the maximal score of the forward pass from the previous base. This aims to ensure that the traceback path will remain within the adaptive window. There are edge cases where the valid matching leaves the adaptive band. These reads are filtered out and included in the failed read group ``Read event to sequence alignment extends beyond --bandwidth``.
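+
+A sketch of this forward pass over a full (unbanded) score matrix for clarity; the ``match_bonus`` and ``skip_pen`` values are hypothetical stand-ins for the ``--signal-align-parameters``, and Tombo restricts this computation to a band:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def forward_pass(event_means, exp_levels, exp_sds, match_bonus=2.0, skip_pen=5.0):
+        # negative absolute z-scores, shifted toward an expected value of zero
+        z = -np.abs((event_means[None, :] - exp_levels[:, None]) / exp_sds[:, None])
+        z -= z.mean()
+        n_pos, n_events = z.shape
+        scores = np.full((n_pos, n_events), -np.inf)
+        scores[0] = np.cumsum(z[0])
+        for i in range(1, n_pos):
+            for j in range(1, n_events):
+                scores[i, j] = z[i, j] + max(
+                    scores[i, j - 1],                    # stay in this genomic position
+                    scores[i - 1, j - 1] + match_bonus,  # match event to this position
+                    scores[i - 1, j] - skip_pen)         # skip this genomic position
+        # traceback starts from the maximum along the last genomic position
+        return scores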
+For longer reads, the above computed genome sequence start within the raw signal is taken and then the same dynamic programming solution is applied except a smaller adaptive band is now used (see figure below). The bandwidth is defined by the ``--signal-align-parameters`` option and again has been optimized for DNA and RNA data types. At each genomic position, the band position is defined to center on the maximal score of the forward pass from the previous base. This aims to ensure that the traceback path will remain within the adaptive window. There are edge cases where the valid matching leaves the adaptive band. These reads are filtered out and included in the failed read group ``Read event to sequence alignment extends beyond bandwidth``.
 
-Most reads can be processed with a smaller bandwidth, but if a read fails to be successfully re-squiggled a second, larger, "save" bandwidth is used to attempt to rescue a read and complete a successful sequence to signal assignment. For samples with many low quality reads, this can cause larger run times, but should speed up the vast majority of runs by ~40%.
+Most reads can be processed with a smaller bandwidth, but if a read fails to be successfully re-squiggled a second, larger, "save" bandwidth is used to attempt to rescue a read and complete a successful sequence to signal assignment. For samples with many low quality reads, this can cause larger run times, but should speed up the vast majority of runs.
 
 ----
 
 .. figure:: _images/adaptive_half_z_scores.png
    :align: center
-   :scale: 80%
-   
+
    Full read adaptive banded shifted half-normal scores
 
 .. figure:: _images/adaptive_forward_pass.png
    :align: center
-   :scale: 80%
-   
+
    Full read adaptive banded forward pass scores
 
 ----
 
@@ -140,7 +146,7 @@ Common Failed Read Descriptions
 
 ``Fastq slot not present in --basecall-group``
 ``Raw data is not found in Raw/Reads/Read_[read#]``
 
-* These error indicates that a necessary peice of information for Tombo to run was not found in the FAST5 file.
+* These errors indicate that a necessary piece of information for Tombo to run was not found in the FAST5 file. See ``tombo preprocess annotate_raw_with_fastqs`` for annotation of read files with basecalls in FASTQ format.
 
 ``Alignment not produced``
 
@@ -159,6 +165,10 @@ Common Failed Read Descriptions
 
 * These errors indicate that the dynamic programming algorithm produced a poorly scored matching of genomic sequence to raw signal (as defined by the ``--signal-matching-score``). Some potential sources for these errors include incorrect primary genomic mapping, incorrect genome sequence (compared to the biological sample), poor quality raw signal or an incompatible flowcell/library with included canonical models (only R9.5/4 flowcells currently supported; 2D reads are not supported; DNA and RNA are supported).
 
+``Unexpected error``
+
+* This indicates that an error not expected by the Tombo re-squiggle algorithm has occurred. This will result in a file containing the full error message for that read. Please report these bugs `here `_.
+
 
 ------------------
 Tombo FAST5 Format
 ------------------
@@ -187,17 +197,21 @@ By default, Tombo will create a hidden file containing the essential genome mapp
 
 Additional Command Line Options
 -------------------------------
 
+``--num-most-common-errors``
+
+* Dynamically updates the most common reasons for a read to be unsuccessfully processed. (This may cause issues for smaller viewing screens or certain environments, so it is off by default.)
+
 ``--failed-reads-filename``
 
 * This option outputs the filenames for each read that failed via each failure mode. This can be useful for tracking down bothersome errors.
 
-``--obs-per-base-filter``
+``--signal-matching-score``, ``--q-score``, ``--obs-per-base-filter``
 
-* This option applies a filter to "stuck" reads (too many observations per genomic base). This filter is applied only to the Tombo index file and can be cleared later. See the :doc:`filtering` section for more details.
+* Filter reads based on the specified criterion and threshold. These filters are applied only to the Tombo index file and can be cleared later. The ``--q-score`` and ``--obs-per-base-filter`` options are off by default, and the ``--signal-matching-score`` default threshold is listed in the command help output (this value has been optimized for DNA and RNA data types). See the :doc:`filtering` section for more details.
 
 ``--ignore-read-locks``
 
-* Multiple independent ``resquiggle`` commands on the same set of reads should NOT be run simultaneously. This can cause hard to track errors and read file corruption. To protect against this, Tombo adds a lock (only acknowledged by Tombo) to each directory being processed. If a previous ``resquiggle`` command fails in a very unexpected fashion these locks can be left in place. In this case the ``--ignore-read-locks`` option is provided. This is the only intended use for this option.
+* Multiple independent ``tombo resquiggle`` commands on the same set of reads should NOT be run simultaneously. This can cause hard to track errors and read file corruption. To protect against this, Tombo adds a lock (only acknowledged by Tombo) to each directory being processed. If a previous ``tombo resquiggle`` command fails in a very unexpected fashion, these locks can be left in place. In this case the ``--ignore-read-locks`` option is provided. This is the only intended use for this option.
 
 ---------------------
 Pre-process Raw Reads
@@ -219,10 +233,10 @@ Non-standard Data Locations
 
 In the Tombo framework, it is possible to access and store basecalls and genome-anchored re-squiggle results in custom locations within FAST5 files.
 
-For example, basecalls can be found in the ``Basecall_1D_001`` slot in a set of reads that have been basecalled more than one time. In this case the basecalls can be accessed in Tombo by specifying the ``--basecall-group`` option to the ``resquiggle`` command.
+For example, basecalls can be found in the ``Basecall_1D_001`` slot in a set of reads that have been basecalled more than one time. In this case the basecalls can be accessed in Tombo by specifying the ``--basecall-group`` option to the ``tombo resquiggle`` command.
 
-It can also be adventageous to store re-squiggle results in a non-standard locations. For example, if one would like to test multiple sets of re-squiggle parameters or reference versions without having to overwrite old results and re-run the ``resquiggle`` command, the ``--corrected-group`` option can be specified. This will store the re-squiggle results in a new slot within the FAST5 file as well as creating a new Tombo index file.
+It can also be advantageous to store re-squiggle results in non-standard locations. For example, if one would like to test multiple sets of re-squiggle parameters or reference versions without having to overwrite old results and re-run the ``tombo resquiggle`` command, the ``--corrected-group`` option can be specified. This will store the re-squiggle results in a new slot within the FAST5 file as well as creating a new Tombo index file.
 
 .. important::
-    
-    If the ``--corrected-group`` is specified in the ``resquiggle`` command, this same value must be passed to all other Tombo sub-commands in order to access these results. This inlcudes all filtering, plotting, significance testing, and text output commands.
+
+    If the ``--corrected-group`` is specified in the ``tombo resquiggle`` command, this same value must be passed to all other Tombo sub-commands in order to access these results. This includes all filtering, plotting, significance testing, and text output commands.
diff --git a/docs/rna.rst b/docs/rna.rst
index bdc525c..ea8240c 100644
--- a/docs/rna.rst
+++ b/docs/rna.rst
@@ -6,7 +6,7 @@ RNA Processing
 
 Tombo cannot currently process spliced alignments. Thus processing RNA data requires that a transcriptome (NOT genome) reference be provided for organisms with spliced transcription products.
 
-Processing RNA data within the Tombo framework requires some extra care. The major item to consider when performing RNA processing is that a transcriptome reference must be supplied as spliced mapping is not supported. The lack of spliced mapping support within the Tombo framework is a conscious decision for identification of modified RNA bases. This is because the transcriptome is the natural setting for the detection of modified RNA bases. When modified RNA bases are projected onto the genome reference any potential transcript isoform-specfic modification information is lost or the signal diluted. Leaving open the potential for isoform-specific modified base detection is one reason for the choice to force mapping modified bases to a transcriptome. Regions at the edge of alternative exons also have divergent expected signal levels and thus genome statistics computed at these positions would be very difficult to process. Processing would also be very sensetive to shifts in the mapped splice boundaries which can be variable with nanopore reads.
+Processing RNA data within the Tombo framework requires some extra care. The major item to consider when performing RNA processing is that a transcriptome reference must be supplied, as spliced mapping is not supported. The lack of spliced mapping support within the Tombo framework is a conscious decision for identification of modified RNA bases. This is because the transcriptome is the natural setting for the detection of modified RNA bases. When modified RNA bases are projected onto the genome reference, any potential transcript isoform-specific modification information is lost or the signal diluted. Leaving open the potential for isoform-specific modified base detection is one reason for the choice to force mapping modified bases to a transcriptome. Regions at the edge of alternative exons also have divergent expected signal levels, and thus genome statistics computed at these positions would be very difficult to process into a logical output. Processing would also be very sensitive to shifts in the mapped splice boundaries, which can be variable with nanopore reads.
 
 Tools to investigate isoform-specific modified bases are a future goal within the Tombo framework. This does pose some informatic challenges for downstream processing of Tombo RNA data. A recommended Tombo RNA processing pipeline will be posted here soon to help make integrative modified RNA processing more streamlined with other genome bioinformatic tools.
 
@@ -21,3 +21,5 @@ As Tombo RNA processing presents unique informatic challenges, a recommended pro
 
 This pipeline is for users looking to process a sample from a genome sequence reference and a gene annotation file (GTF or GFF). For users successfully processing data from a transcriptome reference this processing workflow will not be applicable. This pipeline aims to address the majority of use cases for RNA modified base detection, namely porting Tombo results to a genome browser compatible format.
 
 Please check back soon for the recommended Tombo RNA processing pipeline!
+
+This pipeline will likely be built on the `R/bioconductor ensembldb package `_. This package allows the creation of custom databases and `mapping between genome and transcriptome coordinates `_. The functions from this software are recommended in order to project Tombo RNA results into a genome coordinate space. A full tutorial/example script for a full Tombo pipeline based on this package will be provided soon.
diff --git a/docs/text_output.rst b/docs/text_output.rst
index e7c1b3e..c16656f 100644
--- a/docs/text_output.rst
+++ b/docs/text_output.rst
@@ -7,51 +7,36 @@ Two text outputs are available from Tombo:
 
 1. Genome Browser Files - Genome browser compatible per-genomic-base statistics
 2. Fasta - Genomic sequence output surrounding identified modified base sites
 
-``text_output browser_files``
------------------------------
+``tombo text_output browser_files``
+-----------------------------------
 
-The ``text_output browser_files`` command takes in a set of reads (``--fast5-basedirs``) and/or a statistics file generated from a ``detect_modifications`` command (``--statistics-filename``). A control set of reads can also be provided (``--control-fast5-basedirs``). Output files will be produced for each requested statistic (both plus and minus strands) in either `variableStep wiggle format `_ or `bedgraph format `_ for ``--file-type coverage``.
+The ``tombo text_output browser_files`` command takes in a set of reads (``--fast5-basedirs``) and/or a statistics file generated from a ``tombo detect_modifications`` command (``--statistics-filename``). A control set of reads can also be provided (``--control-fast5-basedirs``). Output files will be produced for each requested statistic (both plus and minus strands) in `variableStep wiggle format `_ (or `bedgraph format `_ for ``--file-type coverage``).
 
 Several statistics are available for output:
 
 * ``coverage`` - The coverage level for mapped and validly re-squiggled reads
-* ``valid_coverage`` - The coverage level for reads that are mapped, validly re-squiggled and outside the interval specified by ``--single-read-threshold``
+* ``valid_coverage`` - The coverage level for reads that are mapped, validly re-squiggled and outside the interval specified by the ``--single-read-threshold`` stored in a ``--statistics-filename``.
 * ``dampened_fraction`` - The estimated fraction of significantly modified reads
-
-  - This estimate includes pseudo-counts added to the un-modified and modified read counts (as specified by the ``--coverage-dampen-counts`` option)
-  - This is equivalent to using a beta prior when estimating the fraction of reads modified at each position
-  - Test the effect of different dampen counts using the ``scripts/test_beta_priors.R`` (the default values are shown below)
-
 * ``fraction`` - The raw fraction of significantly modified reads
 * ``signal`` - The mean signal level across all reads mapped to this location
-* ``signal_sd`` - The mean signal standard deviation across all reads mapped to this location (not available unless ``--include-event-stdev`` was provided in resquiggle call)
+* ``signal_sd`` - The mean signal standard deviation across all reads mapped to this location (not available unless ``--include-event-stdev`` was provided in the ``tombo resquiggle`` command)
 * ``dwell`` - The mean number of raw observations observed assigned to this location
 * ``difference`` - The difference in normalized signal level between a sample and control set of reads
 
-----
-
-.. figure:: _images/dampened_fraction.png
-   :align: center
-   :scale: 30%
-   
-   Heatmap showing the resulting dampened farction of modified reads given the default ``--coverage-dampen-counts`` values over range of coverage and number of un-modified reads.
-
-----
-
 .. note::
-    
+
     ``signal``, ``signal_sd``, ``dwell`` and ``difference`` require each read's event level data to be extracted from the raw read files and thus may be quite slow. ``coverage``, ``valid_coverage``, ``fraction``, and ``dampened_fraction`` can be extracted simply from the Tombo statistics files, which is much faster.
 
 The ``signal``, ``signal_sd``, ``dwell`` and ``difference`` outputs all require the ``--fast5-basedirs`` option; the ``valid_coverage``, ``fraction``, and ``dampened_fraction`` outputs require the ``--statistics-filename`` option; and the ``coverage`` output requires one or the other.
 
 Files will be output to individual wiggle files (two per statistic for plus and minus genomic strand) in the following format ``[wiggle-basename].[wiggle-type].[sample|control]?.[plus|minus].wig``
 
-``text_output signif_sequence_context``
----------------------------------------
+``tombo text_output signif_sequence_context``
+---------------------------------------------
 
-The ``text_output signif_sequence_context`` command writes the genome sequence surrounding unique genomic positions with the largest estimated fraction of modified bases. This can be useful for several tasks related to modified base detection including motif discovery.
+The ``tombo text_output signif_sequence_context`` command writes the genome sequence surrounding unique genomic positions with the largest estimated fraction of modified bases. This can be useful for several tasks related to modified base detection including motif discovery.
 
-To run ``text_output signif_sequence_context``, a ``--statistics-filename`` is required to extract the most significant locations and either a ``--fast5-basedirs`` or ``--genome-fasta`` is required to extract the genomic sequence. Several options are availble for selecting the sequence to be output:
+To run ``tombo text_output signif_sequence_context``, a ``--statistics-filename`` is required to extract the most significant locations and either a ``--fast5-basedirs`` or ``--genome-fasta`` is required to extract the genomic sequence. Several options are available for selecting the sequence to be output:
 
 * ``--num-regions`` - Defines the number of unique locations to be output
 * ``--num-bases`` - Defines the number of bases to be output surrounding the significant locations
 
@@ -60,4 +45,8 @@ The output of this command could be used to determine sequence contexts consiste
 
 .. code-block:: bash
 
-    ./meme -oc motif_output.meme -dna -mod zoops tombo_results.significant_regions.fasta
+    tombo detect_modifications de_novo --fast5-basedirs \
+        --statistics-file-basename sample.de_novo
+    tombo text_output signif_sequence_context --statistics-filename sample.de_novo.tombo.stats \
+        --genome-fasta reference.fasta --num-regions 1000 --num-bases 50
+    ./meme -oc tombo.de_novo_motif_detection.meme -dna -mod zoops tombo_results.significant_regions.fasta
diff --git a/scripts/debug_bandwidth.R b/scripts/debug_bandwidth.R
new file mode 100644
index 0000000..4efc631
--- /dev/null
+++ b/scripts/debug_bandwidth.R
@@ -0,0 +1,39 @@
+library(dplyr)
+library(ggplot2)
+
+## set _DEBUG_BANDWIDTH = True (or _DEBUG_START_BANDWIDTH) in resquiggle.py
+## tombo resquiggle bandwidth_test_reads/ genome.fasta --processes 8 > band_boundary_tuning.txt
+
+
+dat <- read.table('band_boundary_tuning.txt', header=TRUE)
+
+std_bw <- names(sort(table(dat$bandwidth), decreasing=TRUE))[1]
+print(std_bw)
+
+pdf('band_boundary_tuning.pdf', width=11)
+ggplot(dat %>% filter(bandwidth == std_bw)) +
+    geom_density(aes(x=min_bw_edge_buffer), fill='black') + theme_bw()
+foo <- dev.off()
+
+
+## print bandwidth values for each percentile of reads included in that bandwidth
+print(((as.numeric(std_bw) / 2) -
+       quantile(dat$min_bw_edge_buffer,
+                c(0.01, 0.05, 0.1, 0.2, 0.25, 0.5))) * 2)
+
+## inverse for selected bw
+sel_bw <- 200
+1 - ecdf(dat$min_bw_edge_buffer)((as.numeric(std_bw) / 2) - (sel_bw / 2))
+
+
+
+
+## for start bandwidth debugging
+## print bandwidth values for each percentile of reads included in that bandwidth
+print(
+    quantile(dat$min_bw_edge_buffer,
+             1 - c(0.01, 0.05, 0.1, 0.2, 0.25, 0.5)))
+
+## inverse for selected bw
+sel_bw <- 750
+ecdf(dat$min_bw_edge_buffer)(sel_bw)
diff --git a/scripts/debug_eventless.R b/scripts/debug_eventless.R
deleted file mode 100644
index 5f503cb..0000000
--- a/scripts/debug_eventless.R
+++ /dev/null
@@ -1,28 +0,0 @@
-library(ggplot2)
-
-dat <- read.table('debug_event_align.txt', header=TRUE)
-tbdat <- read.table('debug_event_align.traceback.txt', header=TRUE)
-
-pdf('debug_event_align.pdf', height=4.5, width=6)
-for(reg in unique(dat$Region)){
-    regDat <- dat[dat$Region == reg,]
-    mp <- ifelse(grepl('fwd_end', reg),
-                 mean(regDat$Score[grepl('fwd_end', regDat$Region)]), 0)
-    if (grepl('fwd', reg)){
-        tbReg <- tbdat[tbdat$Region == reg,]
-        print(ggplot(regDat) + geom_tile(aes(x=EventPos, y=SeqPos, fill=Score)) +
-              scale_fill_gradient2(high='#67001f', mid='#ffffbf', low='#1a1a1a',
-                                   midpoint=mp) +
-              geom_line(aes(x=EventPos, y=SeqPos),
-                        data=tbReg, color='steelblue') +
-              theme_minimal() + xlab('Segmented Signal') +
-              ylab('Genomic Sequence'))
-    } else {
-        print(ggplot(regDat) + geom_tile(aes(x=EventPos, y=SeqPos, fill=Score)) +
-              scale_fill_gradient2(high='#67001f', mid='#ffffbf', low='#1a1a1a',
-                                   midpoint=mp) +
-              theme_minimal() + xlab('Segmented Signal') +
-              ylab('Genomic Sequence'))
-    }
-}
-foo <- dev.off()
diff --git a/scripts/debug_full_fit.R b/scripts/debug_full_fit.R
deleted file mode 100644
index 30f4a35..0000000
--- a/scripts/debug_full_fit.R
+++ /dev/null
@@ -1,24 +0,0 @@
-library(ggplot2)
-library(cowplot)
-
-bandwidth <- 500
-
-dat <- read.table('debug_event_align.full_fit.txt', header=TRUE)
-failedDat <- read.table('debug_event_align.full_failed.txt', header=TRUE, sep="\t")
-
-pdf('debug_event_align.full.pdf', width=15, height=5)
-for(reg in unique(dat$Region)){
-    regDat <- dat[dat$Region == reg,]
-    p1 <- ggplot(regDat) + geom_line(aes(x=EventPos, y=EventScore), size=0.1) +
-        theme_minimal() + ylim(c(-15,2)) +
-        geom_hline(aes(yintercept=yint), data=data.frame(yint=c(0,0.5)),
-                   color='red')
-    p2 <- ggplot(regDat) +
-        geom_hline(aes(yintercept=bandwidth/2), color='blue') +
-        geom_line(aes(x=EventPos, y=BandPos), size=0.1) +
-        theme_minimal() + ylim(c(0,bandwidth))
-    title <- ggdraw() + draw_label(as.character(
-        failedDat[failedDat$Region == reg,'DidFail']))
-    print(plot_grid(title, p1, p2, align='v', ncol=1, rel_heights=c(1,6,6)))
-}
-foo <- dev.off()
diff --git a/scripts/debug_model_resquiggle.R b/scripts/debug_model_resquiggle.R
deleted file mode 100644
index a0a77a9..0000000
--- a/scripts/debug_model_resquiggle.R
+++ /dev/null
@@ -1,88 +0,0 @@
-library(dplyr)
-library(ggplot2)
-library(cowplot)
-
-dat <- read.table('debug_signal_space.window_z_scores.txt', header=TRUE)
-tbDat <- read.table('debug_signal_space.window_traceback.txt', header=TRUE)
-maxPathDat <- read.table('debug_signal_space.window_max_path.txt', header=TRUE)
-sigMaxPathDat <- read.table(
-    'debug_signal_space.window_signal_max_path.txt', header=TRUE)
-origPathDat <- read.table('debug_signal_space.window_orig_path.txt', header=TRUE)
-switchDat <- read.table('debug_signal_space.window_switch_points.txt', header=TRUE)
-sig <- read.table('debug_signal_space.signal.txt', header=TRUE)
-
-diagDat <- read.table('debug_signal_space.window_last_diag.txt', header=TRUE)
-diagDat$LastDiagCount[diagDat$LastDiagCount > 3] <- 3
-diagDat$LastDiagCount <- factor(diagDat$LastDiagCount)
-
-pdf('debug_signal_space.window.pdf', width=11)
-for(reg_i in unique(dat$Region)){
-    for(iter_i in 0:2){
-        print(iter_i)
-        if(sum(dat$Region == reg_i && dat$Iteration == iter_i) == 0){ next }
-        print('.')
-        regDat <- dat %>% filter(Region==reg_i, Iteration==iter_i)
-        regPath <- tbDat %>% filter(Region==reg_i, Iteration==iter_i)
-        regDiag <- diagDat %>% filter(Region==reg_i, Iteration==iter_i)
-        zMean <- mean(regDat$ZScore)
-        regPath$pathVal <- regPath$pathVal - (zMean * (regPath$SignalPos + 1))
-        regPath$pathVal <- regPath$pathVal * max(regPath$SignalPos + 1) /
-            ((regPath$SignalPos + 1) * max(regPath$pathVal))
-
-        pCols <- c('Original Path'='#ffffbf',
-                   'Signal-based\nTraceback Path'="#f16913",
-                   'Max Probability\nTraceback Path'="#cb181d")
-        zscrP <- ggplot(regDat) +
-            geom_tile(aes(x=SignalPos, y=BasePos, fill=ZScore)) +
-            scale_fill_gradient2(
-                low="#9970ab", mid='#f7f7f7', high="#00441b",
-                midpoint=mean(range(regDat$ZScore, na.rm=TRUE)),
-                name='Lower Tail\nZ-Score') + ylab('Base') +
-            geom_line(aes(x=SignalPos, y=BasePos), color=pCols[1], size=0.3,
-                      data=origPathDat %>% filter(Region==reg_i, Iteration==iter_i)) +
-            geom_line(aes(x=SignalPos, y=BasePos), color=pCols[2], size=0.5,
-                      data=sigMaxPathDat %>% filter(Region==reg_i, Iteration==iter_i)) +
-            geom_line(aes(x=SignalPos, y=BasePos), color=pCols[3], size=0.5,
-                      data=maxPathDat %>% filter(Region==reg_i, Iteration==iter_i)) +
-            geom_point(aes(x=SignalPos, y=BasePos + 1), size=0.1,
-                       data=switchDat %>% filter(Region==reg_i, Iteration==iter_i)) +
-            theme_minimal() + theme(axis.title.x=element_blank(),
-                                    axis.text.x=element_blank())
-
-        diagP <- ggplot(regDiag) +
-            geom_tile(aes(x=SignalPos, y=BasePos, fill=LastDiagCount)) +
y=BasePos, fill=LastDiagCount)) + - geom_line(aes(x=SignalPos, y=BasePos), color=pCols[1], size=0.3, - data=origPathDat %>% filter(Region==reg_i, Iteration==iter_i)) + - geom_line(aes(x=SignalPos, y=BasePos), color=pCols[2], size=0.5, - data=sigMaxPathDat %>% filter(Region==reg_i, Iteration==iter_i)) + - geom_line(aes(x=SignalPos, y=BasePos), color=pCols[3], size=0.5, - data=maxPathDat %>% filter(Region==reg_i, Iteration==iter_i)) + ylab('Base') + - geom_point(aes(x=SignalPos, y=BasePos + 1), size=0.1, - data=switchDat %>% filter(Region==reg_i, Iteration==iter_i)) + - theme_minimal() + theme(axis.title.x=element_blank(), - axis.text.x=element_blank()) - - pathP <- ggplot(regPath) + - geom_tile(aes(x=SignalPos, y=BasePos, fill=pathVal)) + - geom_line(aes(x=SignalPos, y=BasePos, color=names(pCols)[1]), size=0.3, - data=origPathDat %>% filter(Region==reg_i, Iteration==iter_i)) + - geom_line(aes(x=SignalPos, y=BasePos, color=names(pCols)[2]), size=0.5, - data=sigMaxPathDat %>% filter(Region==reg_i, Iteration==iter_i)) + - geom_line(aes(x=SignalPos, y=BasePos, color=names(pCols)[3]), size=0.5, - data=maxPathDat %>% filter(Region==reg_i, Iteration==iter_i)) + - scale_fill_gradient2( - low="#9970ab", mid='#f7f7f7', high="#00441b", - midpoint=mean(range(regPath$pathVal, na.rm=TRUE)), - name='Normalized\nCumulative\nZ-Score') + - scale_color_manual(values=pCols, name='') + - ylab('Base') + theme_minimal() + theme(axis.title.x=element_blank(), - axis.text.x=element_blank()) - sigP <- ggplot(sig %>% filter(Region==reg_i, Iteration==iter_i)) + - geom_path(aes(x=SignalPos + 0.5, y=Signal, color='')) + - scale_color_manual(values=c("#cb181d"), name='') + - xlab('Position') + theme_minimal() - print(plot_grid(zscrP, diagP, pathP, sigP, - labels=c(paste0(reg_i, " ", iter_i),"","",""), - rel_heights=c(3,3,3,1.5), ncol=1, align='v')) -}} -foo <- dev.off() diff --git a/scripts/debug_params.R b/scripts/debug_params.R index 3473a17..2ccdde6 100644 --- a/scripts/debug_params.R +++ b/scripts/debug_params.R @@ -1,5 +1,6 @@ library(dplyr) library(ggplot2) +library(ggridges) library(ggbeeswarm) ## set _DEBUG_PARAMS = True in resquiggle.py @@ -9,7 +10,7 @@ library(ggbeeswarm) ## tombo resquiggle param_test_reads/ genome.fasta --segmentation-parameters 5 $testParam 5 --signal-align-parameters 4.2 4.2 1200 1.75 5.0 --processes 4 ##done > param_values.txt -stat <- 'min_obs_per_base' +stat <- 'skip_pen' dat <- read.table('param_values.txt') colnames(dat) <- c('running_window', 'min_obs_per_base', 'mean_obs_per_event', @@ -23,14 +24,24 @@ dat$skip_pen <- factor(dat$skip_pen) dat$bandwidth <- factor(dat$bandwidth) dat <- dat %>% group_by(mean_obs_per_event, min_obs_per_base, running_window, - match_evalue, skip_pen, bandwidth, read_name) %>% summarize(mean_score=min(mean_score)) + match_evalue, skip_pen, bandwidth, read_name) %>% + summarize(mean_score=min(mean_score)) rdat <- dat %>% group_by(read_name) %>% summarize(nreads=n()) maxNReads <- rdat$read_name[which(rdat$nreads == max(rdat$nreads))] fdat <- dat %>% filter(read_name %in% maxNReads) -minMed <- dat %>% group_by_at(stat) %>% summarize(med=median(mean_score)) %>% summarize(min(med)) -minMedF <- fdat %>% group_by_at(stat) %>% summarize(med=median(mean_score)) %>% summarize(min(med)) +dat %>% group_by_at(stat) %>% + summarize(med=median(mean_score), mean=mean(mean_score)) %>% + print.data.frame(digits=6) +fdat %>% group_by_at(stat) %>% + summarize(med=median(mean_score), mean=mean(mean_score)) %>% + print.data.frame(digits=6) + +minMed <- dat %>% group_by_at(stat) 
%>% summarize(med=median(mean_score)) %>%
+    summarize(foo=min(med)) %>% .$foo
+minMedF <- fdat %>% group_by_at(stat) %>% summarize(med=median(mean_score)) %>%
+    summarize(foo=min(med)) %>% .$foo
 
 pdf(paste0('param_values.', stat, '.pdf'), width=10)
 ggplot(dat, aes_string(x=stat, y='mean_score', color=stat)) +
diff --git a/scripts/plot_single_read.py b/scripts/plot_single_read.py
new file mode 100644
index 0000000..e327fd4
--- /dev/null
+++ b/scripts/plot_single_read.py
@@ -0,0 +1,39 @@
+from tombo import tombo_helper as th
+from tombo._plot_commands import plot_single_read, test_r_imports
+
+test_r_imports()
+
+# plot a single read
+read_fn = 'path/to/read.fast5'
+plot_single_read(fast5_fn=read_fn)
+
+
+# plot first 10k raw obs from several reads
+num_reads = 10
+reads_index = th.TomboReads(['path/to/fast5/basedir/',])
+for i, r_data in enumerate(reads_index.iter_reads()):
+    plot_single_read(
+        fast5_fn=r_data.fn, num_obs=10000,
+        png_fn='single_read_raw_signal.num_' + str(i) + '.png')
+    if i + 1 >= num_reads: break  # stop once num_reads reads are plotted
+
+
+# plot best reads (lower signal matching scores indicate better model fits)
+best_reads = sorted(
+    (r_data.sig_match_score, r_data)
+    for r_data in reads_index.iter_reads())[:num_reads]
+for i, (r_score, r_data) in enumerate(best_reads):
+    plot_single_read(
+        fast5_fn=r_data.fn, num_obs=10000,
+        png_fn='best_reads.num_' + str(i) + '.png')
+
+
+# plot worst scoring reads (including previously filtered reads)
+reads_index = th.TomboReads(['path/to/fast5/basedir/',], remove_filtered=False)
+worst_reads = sorted(
+    (r_data.sig_match_score, r_data)
+    for r_data in reads_index.iter_reads())[::-1][:num_reads]
+for i, (r_score, r_data) in enumerate(worst_reads):
+    plot_single_read(
+        fast5_fn=r_data.fn, num_obs=10000,
+        png_fn='worst_reads.num_' + str(i) + '.png')
diff --git a/setup.py b/setup.py
index ef18626..b97101c 100644
--- a/setup.py
+++ b/setup.py
@@ -40,12 +40,12 @@ def readme():
     extras_require.append('rpy2')
 
 ext_modules = [
-    Extension(str("tombo.c_dynamic_programming"),
-              [str("tombo/c_dynamic_programming.pyx")],
+    Extension(str("tombo._c_dynamic_programming"),
+              [str("tombo/_c_dynamic_programming.pyx")],
               include_dirs=include_dirs,
               language="c++"),
-    Extension(str("tombo.c_helper"),
-              [str("tombo/c_helper.pyx")],
+    Extension(str("tombo._c_helper"),
+              [str("tombo/_c_helper.pyx")],
               include_dirs=include_dirs,
               language="c++")
 ]
@@ -57,7 +57,7 @@ def readme():
     name = "ont-tombo",
     version = __version__,
     packages = ["tombo"],
-    install_requires = ['h5py <= 2.7.0', 'numpy', 'scipy', 'cython',
+    install_requires = ['h5py', 'numpy', 'scipy', 'cython',
                         'setuptools >= 18.0', 'mappy >= 2.10', 'future',
                         'tqdm'],
     extras_require={'full':extras_require},
diff --git a/tombo/R_scripts/debugDP.R b/tombo/R_scripts/debugDP.R
new file mode 100644
index 0000000..b5a8ffc
--- /dev/null
+++ b/tombo/R_scripts/debugDP.R
@@ -0,0 +1,27 @@
+minDPScore <- -250
+
+plotDP <- function(dpDat, tbDat){
+    for(reg in unique(dpDat$Region)){
+        regDat <- dpDat[dpDat$Region == reg,]
+        regDat$Score[regDat$Score < minDPScore] <- minDPScore
+        mp <- mean(range(regDat$Score))
+        if (grepl('fwd', reg)){
+            tbReg <- tbDat[tbDat$Region == reg,]
+            print(ggplot(regDat) +
+                  geom_tile(aes(x=EventPos, y=SeqPos, fill=Score)) +
+                  scale_fill_gradient2(high='#67001f', mid='#ffffbf',
+                                       low='#1a1a1a', midpoint=mp) +
+                  geom_line(aes(x=EventPos, y=SeqPos),
+                            data=tbReg, color='steelblue') +
+                  theme_minimal() + xlab('Segmented Signal') +
+                  ylab('Genomic Sequence') + ggtitle(regDat$Region[1]))
+        } else {
+            print(ggplot(regDat) +
+                  geom_tile(aes(x=EventPos, y=SeqPos, fill=Score)) +
+
scale_fill_gradient2(high='#67001f', mid='#ffffbf',
+                                       low='#1a1a1a', midpoint=mp) +
+                  theme_minimal() + xlab('Segmented Signal') +
+                  ylab('Genomic Sequence') + ggtitle(regDat$Region[1]))
+        }
+    }
+}
diff --git a/tombo/R_scripts/debugFit.R b/tombo/R_scripts/debugFit.R
new file mode 100644
index 0000000..867904e
--- /dev/null
+++ b/tombo/R_scripts/debugFit.R
@@ -0,0 +1,22 @@
+plotFit <- function(fitDat, readBw){
+    for(reg in unique(fitDat$Region)){
+        regDat <- fitDat[fitDat$Region == reg,]
+        p1 <- ggplot(regDat) +
+            geom_line(aes(x=EventPos, y=EventScore), size=0.1) +
+            ggtitle(regDat$Region[1]) + theme_minimal() +
+            #geom_hline(aes(yintercept=yint), data=data.frame(yint=c(-4)),
+            #           color='red') +
+            #ylim(c(-15,5)) +
+            theme(axis.title.x=element_blank())
+        p2 <- ggplot(regDat) +
+            geom_hline(aes(yintercept=readBw / 2), color='blue') +
+            geom_line(aes(x=EventPos, y=BandPos), size=0.5) +
+            theme_minimal() + ylim(c(0, readBw)) +
+            theme(axis.title.x=element_blank())
+        p3 <- ggplot(regDat) +
+            geom_line(aes(x=EventPos, y=ModelMean), size=0.1, color='blue') +
+            geom_line(aes(x=EventPos, y=EventMean), size=0.1) +
+            theme_minimal()
+        print(plot_grid(p1, p2, p3, align='v', ncol=1, rel_heights=c(7,6,7)))
+    }
+}
diff --git a/tombo/R_scripts/plotMotifStats.R b/tombo/R_scripts/plotMotifStats.R
index d9caa8f..08188c8 100644
--- a/tombo/R_scripts/plotMotifStats.R
+++ b/tombo/R_scripts/plotMotifStats.R
@@ -83,10 +83,10 @@ plotMotifStats <- function(PlotDat, BaseDat, StatsDat,
         }
         p <- p + geom_path(aes(x=Position, y=Signal, color=Group, group=Read),
                            alpha=0.5, size=0.1, show.legend=FALSE)
-        p <- p + geom_text(aes(x=Position+0.5, y=-ylim,
-                               label=Base, color=Base),
-                           data=rBaseDat,
-                           hjust=0.5, vjust=0, size=3, show.legend=FALSE) +
+        p <- p + geom_text(
+            aes(x=Position+0.5, y=-ylim, label=Base, color=Base),
+            data=rBaseDat,
+            vjust=0, hjust=0.5, size=3, show.legend=FALSE) +
             scale_color_manual(
                 values=c(
                     'A'='#00CC00', 'C'='#0000CC', 'G'='#FFB300',
@@ -132,15 +132,16 @@ plotMotifStats <- function(PlotDat, BaseDat, StatsDat,
                           panel.grid.minor.y=element_blank()) +
                     ylab('Est.
Fraction Modified'))) maxWidth <- do.call(grid::unit.pmax, - sapply(ps, function(x) x$widths[2:3])) + sapply(ps, function(x) x$widths[1:4])) ps <- lapply(ps, function(p){ - p$widths[2:3] <- maxWidth + p$widths[1:4] <- maxWidth return(p)}) # close dev null sink foo <- dev.off() do.call( grid.arrange, c(ps, list(ncol=1, heights=c(rep(1, length(regions)), 3)))) + ##library(cowplot) ##print(do.call( ## plot_grid, ## c(ps, list(ncol=1, align='v', diff --git a/tombo/R_scripts/plotPerReadStats.R b/tombo/R_scripts/plotPerReadStats.R index 6b41aea..efca2e7 100644 --- a/tombo/R_scripts/plotPerReadStats.R +++ b/tombo/R_scripts/plotPerReadStats.R @@ -1,5 +1,7 @@ -ngpValMax <- 20 -lhRatioMax <- 6 +# set thresholds for plotting tile +pointMaxReads <- 30 +pointMaxBases <- 200 +textLim <- 150 plotPerReadStats <- function(StatData, OrdData, baseDat, boxCenter, arePvals){ all_reg_ids <- unique(StatData$Region) @@ -7,39 +9,58 @@ plotPerReadStats <- function(StatData, OrdData, baseDat, boxCenter, arePvals){ for(reg_i in all_reg_ids){ regDat <- StatData[StatData$Region == reg_i,] regOrd <- OrdData[OrdData$Region == reg_i,'Read'] - if(arePvals){ - regDat$Stats[regDat$Stats > ngpValMax] <- ngpValMax - } else { - regDat$Stats[regDat$Stats > lhRatioMax] <- lhRatioMax - regDat$Stats[regDat$Stats < -lhRatioMax] <- -lhRatioMax - } regDat$Read <- factor(regDat$Read, ordered=TRUE, levels=regOrd) - boxDat <- data.frame(xS=mean(range(regDat$Position))-1.5, - xE=mean(range(regDat$Position))+0.5, + boxDat <- data.frame(xS=mean(range(regDat$Position))-1, + xE=mean(range(regDat$Position))+1, yS=0.5, yE=length(unique(regDat$Read))+0.5) regDat <- regDat[!is.na(regDat$Stats),] reg_base_dat <- baseDat[baseDat$Region==reg_i,] p <- ggplot(regDat) + ## add stat values if(arePvals){ - p <- p + geom_point(aes(x=Position, y=Read, fill=Stats), - stroke=0, color='#969696', size=5, shape=21) + - scale_fill_gradient(low="#fff7ec", high='#7f0000', - name='-Log10\nP-Value') + if(length(unique(regDat$Read)) > pointMaxReads || + length(unique(regDat$Position)) > pointMaxBases){ + p <- p + geom_tile(aes(x=Position, y=Read, fill=Stats)) + } else { + p <- p + geom_point( + aes(x=Position, y=Read, fill=Stats), + stroke=0, color='#969696', size=5, shape=21) + } + p <- p + + scale_fill_gradient2( + low="#ffffff", mid="#ffffff", high='#cb181d', + midpoint=0.1, name='-Log10\nP-Value') } else { - p <- p + geom_point(aes(x=Position, y=Read, fill=Stats), - stroke=0, color='#969696', size=5, shape=21) + + if(length(unique(regDat$Read)) > pointMaxReads || + length(unique(regDat$Position)) > pointMaxBases){ + p <- p + geom_tile(aes(x=Position, y=Read, fill=Stats)) + } else { + p <- p + geom_point(aes(x=Position, y=Read, fill=Stats), + stroke=0, color='#969696', size=5, shape=21) + } + lhRatioMax <- max(abs(regDat$Stats)) + breaks <- seq(-lhRatioMax, lhRatioMax, length.out=5) + p <- p + scale_fill_gradient2( low="#b2182b", mid='#ffffff', high='#4d4d4d', midpoint=0, - name='Log\nLikelihood\nRatio\n', breaks=c(-6,-3,0,3,6), - labels=c('Alternative\nBase', '-3','0','3', 'Standard\nBase')) + name='Log\nLikelihood\nRatio\n', breaks=breaks, + labels=c('Alternative\nBase', breaks[2], '0', + breaks[4], 'Standard\nBase')) } + ## add either text or tile-like base data if(nrow(reg_base_dat) > 0){ - p <- p + geom_text( - aes(x=Position, y=0.5, label=Base, color=Base), - data=reg_base_dat, hjust=0.5, size=3, show.legend=FALSE, - vjust=1.2, angle=0) + if(nrow(reg_base_dat) < textLim){ + p <- p + geom_text( + aes(x=Position, y=0.5, label=Base, color=Base), + 
data=reg_base_dat, hjust=0.5, size=3, + show.legend=FALSE, vjust=1.2, angle=0) + } else { + p <- p + geom_point( + aes(x=Position, y=0, color=Base), + data=reg_base_dat, show.legend=FALSE, shape=15) + } } - if(boxCenter){ + if(boxCenter){ p <- p + geom_rect(aes(xmin=xS, xmax=xE, ymin=yS, ymax=yE), data=boxDat, fill=NA, color='black', size=0.2) } diff --git a/tombo/R_scripts/plotSingleRead.R b/tombo/R_scripts/plotSingleRead.R new file mode 100644 index 0000000..39c742f --- /dev/null +++ b/tombo/R_scripts/plotSingleRead.R @@ -0,0 +1,37 @@ +plotSingleRead <- function(sigDat, vlineDat, hDat, hrDat, stDat){ + sigDat$Position <- sigDat$Position / 1000 + vlineDat$Position <- vlineDat$Position / 1000 + p <- ggplot(sigDat) + + geom_vline(aes(xintercept=Position), + data=vlineDat, color='red', size=5) + + geom_path(aes(x=Position, y=Signal), size=0.3) + + theme_bw() + + theme(axis.text=element_text(size=24), + axis.title=element_text(size=28)) + + xlab('Position (1000 raw obs.)') + ylim(-5,5) + if(! is.null(hDat)){ + hDat$Position <- hDat$Position / 1000 + p <- p + geom_point(aes(x=Position, y=Signal), + data=hDat, color='red', size=2) + } + if(! is.null(hrDat)){ + hrDat$Position <- hrDat$Position / 1000 + hrDat$PositionEnd <- hrDat$PositionEnd / 1000 + p <- p + + geom_rect(aes(xmin=Position, xmax=PositionEnd, ymin=-Inf, ymax=Inf), + data=hrDat, fill='red', alpha=0.3, color=NA) + } + if(is.null(stDat)){ + print(p) + } else { + stDat$Position <- stDat$Position / 1000 + p2 <- ggplot(stDat) + geom_line(aes(x=Position, y=Value)) + theme_bw() + if(! is.null(hrDat)){ + p2 <- p2 + + geom_rect(aes(xmin=Position, xmax=PositionEnd, + ymin=-Inf, ymax=Inf), + data=hrDat, fill='red', alpha=0.3, color=NA) + } + print(plot_grid(p, p2, ncol=1, align='v')) + } +} diff --git a/tombo/__init__.py b/tombo/__init__.py index a98c64d..54aa32a 100644 --- a/tombo/__init__.py +++ b/tombo/__init__.py @@ -1 +1,85 @@ +""" +**************** +Tombo Python API +**************** + +The Tombo python API is intended to give users access to some of the lower level processing functions used throughout Tombo analysis pipelines. + +Primarily this includes inspection of per-read modified base detection results and observed (re-squiggle assigned) per-read signal levels. + +.. note:: + + Effort will be made to maintain this API interface introduced at Tombo version 1.3.1, but major structural changes to the Tombo framework may require changes to some API interface components. Such changes will be noted in github release notes where applicable. 
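+
+    Most errors raised from within Tombo processing functions are instances
+    of ``tombo_helper.TomboError``; when applying the API calls below across
+    many reads it can be useful to catch this error type for individual reads
+    that fail processing.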
+ +------------------- +Python API Examples +------------------- + +Import Tombo sub-modules:: + + from tombo import tombo_helper, tombo_stats, resquiggle + +Extract normalized raw signal assigned to each genomic base from each read in a region:: + + # specify region of interest + reg_data = tombo_helper.intervalData( + chrm='chr20', start=10000, end=10100, strand='+') + + # parse Tombo index from previously re-squiggled set of reads + reads_index = tombo_helper.TomboReads(['test_data/native_reads',]) + # extract reads that overlap this interval and then extract base signal + # levels from 10 randomly selected reads + reg_base_levels = reg_data.add_reads( + reads_index).get_base_levels(num_reads=10) + +Extracting per-read testing results:: + + sample_per_read_stats = tombo_stats.PerReadStats( + 'test_stats.alt_model.5mC.tombo.per_read_stats') + # reg_per_read_stats contains a numpy array containing per-read stats + # over all reads covering the region of interest + reg_per_read_stats = sample_per_read_stats.get_region_per_read_stats( + reg_data) + +Run standard resquiggle algorithm (may cause errors on some reads):: + + import h5py, mappy + + # set read values + fast5_fn, reference_fn = 'path/to/read.fast5', 'genome.fasta' + fast5_data = h5py.File(fast5_fn, 'r') + seq_samp_type = tombo_helper.get_seq_sample_type(fast5_data) + + # prep aligner, signal model and parameters + aligner = mappy.Aligner(reference_fn, preset=str('map-ont'), best_n=1) + std_ref = tombo_stats.TomboModel(seq_samp_type=seq_samp_type) + rsqgl_params = tombo_stats.load_resquiggle_parameters(seq_samp_type) + + # extract data from FAST5 + map_results = resquiggle.map_read(fast5_data, aligner, std_ref) + all_raw_signal = tombo_helper.get_raw_read_slot(fast5_data)['Signal'][:] + if seq_samp_type.rev_sig: + all_raw_signal = all_raw_signal[::-1] + map_results = map_results._replace(raw_signal=all_raw_signal) + + # run full re-squiggle + rsqgl_results = resquiggle.resquiggle_read( + map_results, std_ref, rsqgl_params, all_raw_signal=all_raw_signal) + + # or run individual steps + num_events = tombo_stats.compute_num_events( + all_raw_signal.shape[0], len(map_results.genome_seq), + rsqgl_params.mean_obs_per_event) + valid_cpts, norm_signal, scale_values = resquiggle.segment_signal( + map_results, num_events, rsqgl_params) + event_means = tombo_stats.compute_base_means(norm_signal, valid_cpts) + dp_results = resquiggle.find_adaptive_base_assignment( + valid_cpts, event_means, rsqgl_params, std_ref, map_results.genome_seq) + norm_signal = norm_signal[ + dp_results.read_start_rel_to_raw: + dp_results.read_start_rel_to_raw + dp_results.segs[-1]] + segs = resquiggle.resolve_skipped_bases_with_raw( + dp_results, norm_signal, rsqgl_params) +""" + from __future__ import unicode_literals, absolute_import diff --git a/tombo/__main__.py b/tombo/__main__.py index 603faf7..bbae7f2 100644 --- a/tombo/__main__.py +++ b/tombo/__main__.py @@ -8,8 +8,7 @@ import argparse class SubcommandHelpFormatter(argparse.RawDescriptionHelpFormatter): def _format_action(self, action): - parts = super( - argparse.RawDescriptionHelpFormatter, self)._format_action(action) + parts = super(SubcommandHelpFormatter, self)._format_action(action) if action.nargs == argparse.PARSER: parts = "\n".join(parts.split("\n")[1:]) return parts @@ -168,6 +167,7 @@ def main(args=None): save_args = args args = parser.parse_args(args) except: + # catch for re-squiggle advanced args printing import re if any(re.match(rsqgl_help[0][0], val) for val in args) and any( 
re.match(_option_parsers.printadv_opt[0], val) @@ -179,7 +179,7 @@ def main(args=None): if args.service_command is None: parser.print_help() - sys.stderr.write('\ntombo error: Must provide a tombo command group.\n') + sys.stderr.write('\nTombo error: Must provide a tombo command group.\n') sys.exit(2) # if no second level parser is provided print that command groups help @@ -192,8 +192,8 @@ def main(args=None): resquiggle._resquiggle_main(args) elif args.action_command == 'annotate_raw_with_fastqs': - from . import tombo_helper - tombo_helper._annotate_reads_with_fastq_main(args) + from . import _preprocess + _preprocess.annotate_reads_with_fastq_main(args) elif args.service_command == 'detect_modifications': from . import tombo_stats @@ -219,8 +219,8 @@ def main(args=None): 'Invalid Tombo build_model command.') elif args.service_command == 'filter': - from . import tombo_helper - tombo_helper._filter_main(args) + from . import _filter_reads + _filter_reads.filter_main(args) elif args.service_command == 'text_output': from . import _text_output_commands @@ -230,16 +230,16 @@ def main(args=None): _text_output_commands._write_signif_diff_main(args) else: from . import tombo_helper - tombo_helper._error_message_and_exitI( + tombo_helper._error_message_and_exit( 'Invalid Tombo text_output command.') elif args.service_command == 'plot': - from . import plot_commands - plot_commands._plot_main(args) + from . import _plot_commands + _plot_commands.plot_main(args) else: from . import tombo_helper - tombo_helper._error_message_and_exitI('Invalid Tombo command.') + tombo_helper._error_message_and_exit('Invalid Tombo command.') return diff --git a/tombo/c_dynamic_programming.pyx b/tombo/_c_dynamic_programming.pyx similarity index 85% rename from tombo/c_dynamic_programming.pyx rename to tombo/_c_dynamic_programming.pyx index 2348147..b4bd750 100644 --- a/tombo/c_dynamic_programming.pyx +++ b/tombo/_c_dynamic_programming.pyx @@ -1,3 +1,6 @@ +# _c_dynamic_programming.pyx +# cython: profile=True + cimport cython import numpy as np @@ -178,8 +181,60 @@ def c_base_traceback( curr_b_data[sig_pos-curr_start-1]): return sig_pos +@cython.wraparound(False) +@cython.boundscheck(False) +cdef DTYPE_INT_t c_argmax(np.ndarray[DTYPE_t] vals): + cdef DTYPE_t val + cdef DTYPE_t max_val = vals[0] + cdef DTYPE_INT_t pos + cdef DTYPE_INT_t max_pos = 0 + + for pos in range(1, vals.shape[0]): + val = vals[pos] + if val > max_val: + max_val = val + max_pos = pos + return max_pos # Eventless re-squiggle dynamic programming algorithm +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void c_process_band( + np.ndarray[DTYPE_t, ndim=2] fwd_pass, + np.ndarray[DTYPE_INT_t, ndim=2] fwd_pass_tb, + np.ndarray[DTYPE_t] shifted_z_scores, + DTYPE_t stay_pen, DTYPE_t skip_pen, + DTYPE_INT_t bandwidth, DTYPE_INT_t band_starts_diff, + DTYPE_INT_t seq_pos): + + cdef DTYPE_INT_t band_pos, max_from, prev_b_pos + cdef DTYPE_t max_score, diag_score, skip_score, pos_z_score + + for band_pos in range(1, bandwidth): + pos_z_score = shifted_z_scores[band_pos] + prev_b_pos = band_pos + band_starts_diff + + # first set to stay state + max_score = fwd_pass[seq_pos+1, band_pos-1] - stay_pen + pos_z_score + max_from = 0 + # then check diagonal score + if prev_b_pos - 1 < bandwidth: + diag_score = fwd_pass[seq_pos, prev_b_pos-1] + pos_z_score + if diag_score > max_score: + max_score = diag_score + max_from = 2 + # finally check skip score (note nested check to save some ops) + if prev_b_pos < bandwidth: + skip_score = fwd_pass[seq_pos, 
prev_b_pos] - skip_pen + if skip_score > max_score: + max_score = skip_score + max_from = 1 + + fwd_pass[seq_pos + 1, band_pos] = max_score + fwd_pass_tb[seq_pos + 1, band_pos] = max_from + + return + @cython.wraparound(False) @cython.boundscheck(False) def c_banded_forward_pass( @@ -198,7 +253,7 @@ def c_banded_forward_pass( for idx in range(bandwidth): fwd_pass[0, idx] = 0.0 - cdef DTYPE_INT_t max_from, band_pos, seq_pos, prev_b_pos + cdef DTYPE_INT_t max_from, band_pos, seq_pos, prev_b_pos, band_starts_diff cdef DTYPE_t max_score, pos_z_score, skip_score, diag_score for seq_pos in range(n_bases): @@ -214,30 +269,12 @@ def c_banded_forward_pass( shifted_z_scores[seq_pos, 0]) fwd_pass_tb[seq_pos + 1, 0] = 2 - for band_pos in range(1, bandwidth): - pos_z_score = shifted_z_scores[seq_pos, band_pos] - prev_b_pos = (band_pos + event_starts[seq_pos] - - event_starts[seq_pos-1] - if seq_pos > 0 else band_pos) - - # first set to stay state - max_score = fwd_pass[seq_pos+1, band_pos-1] - stay_pen + pos_z_score - max_from = 0 - # then check diagonal score - if prev_b_pos - 1 < bandwidth: - diag_score = fwd_pass[seq_pos, prev_b_pos-1] + pos_z_score - if diag_score > max_score: - max_score = diag_score - max_from = 2 - # finally check skip score (note nested check to save some ops) - if prev_b_pos < bandwidth: - skip_score = fwd_pass[seq_pos, prev_b_pos] - skip_pen - if skip_score > max_score: - max_score = skip_score - max_from = 1 - - fwd_pass[seq_pos + 1, band_pos] = max_score - fwd_pass_tb[seq_pos + 1, band_pos] = max_from + band_starts_diff = ( + event_starts[seq_pos] - event_starts[seq_pos-1] + if seq_pos > 0 else 0) + c_process_band( + fwd_pass, fwd_pass_tb, shifted_z_scores[seq_pos,:], stay_pen, + skip_pen, bandwidth, band_starts_diff, seq_pos) return fwd_pass, fwd_pass_tb @@ -265,28 +302,13 @@ def c_banded_traceback( band_pos -= 1 if (band_boundary_thresh >= 0 and min(band_pos, bandwidth - band_pos - 1) < band_boundary_thresh): - raise NotImplementedError, ( - 'Read event to sequence alignment extends beyond --bandwidth') + raise NotImplementedError( + 'Read event to sequence alignment extends beyond bandwidth') curr_event_pos = event_starts[curr_seq_pos-1] + band_pos seq_poss[curr_seq_pos-1] = curr_event_pos + 1 return seq_poss -@cython.wraparound(False) -@cython.boundscheck(False) -def c_argmax(np.ndarray[DTYPE_t] vals): - cdef DTYPE_t val - cdef DTYPE_t max_val = vals[0] - cdef DTYPE_INT_t pos - cdef DTYPE_INT_t max_pos = 0 - - for pos in range(1, vals.shape[0]): - val = vals[pos] - if val > max_val: - max_val = val - max_pos = pos - return max_pos - @cython.wraparound(False) @cython.boundscheck(False) def c_adaptive_banded_forward_pass( @@ -305,10 +327,12 @@ def c_adaptive_banded_forward_pass( cdef DTYPE_INT_t half_bandwidth = bandwidth / 2 cdef DTYPE_INT_t n_events = event_means.shape[0] - cdef DTYPE_INT_t event_pos, seq_pos, prev_band_start, curr_band_start, \ - band_pos, prev_b_pos, max_from - cdef DTYPE_t pos_z_score, ref_mean, ref_sd, max_score, skip_score, \ - diag_score + # comment out when profiling + #cdef DTYPE_INT_t band_pos, max_from, prev_b_pos + #cdef DTYPE_t max_score, diag_score, skip_score, pos_z_score + + cdef DTYPE_INT_t event_pos, seq_pos, prev_band_start, curr_band_start + cdef DTYPE_t pos_z_score, ref_mean, ref_sd cdef np.ndarray[DTYPE_t] shifted_z_scores = np.empty(bandwidth, dtype=DTYPE) cdef np.ndarray[DTYPE_t, ndim=2] all_shifted_z_scores @@ -327,7 +351,7 @@ def c_adaptive_banded_forward_pass( # the read is forced to skip to the end and will likely # not end in 
a favorable alignment
             if seq_pos < n_bases - 2:
-                raise NotImplementedError, (
+                raise NotImplementedError(
                     'Adaptive signal to sequence alignment extended ' +
                     'beyond raw signal')
             curr_band_start = n_events - 1
@@ -342,6 +366,8 @@ def c_adaptive_banded_forward_pass(
                 pos_z_score = (event_means[event_pos] - ref_mean) / ref_sd
                 if pos_z_score < 0:
                     pos_z_score = -pos_z_score
+                if do_winsorize_z:
+                    pos_z_score = min(pos_z_score, max_half_z_score)
                 shifted_z_scores[
                     event_pos - curr_band_start] = z_shift - pos_z_score
             else:
@@ -352,6 +378,8 @@ def c_adaptive_banded_forward_pass(
                 pos_z_score = (event_means[event_pos] - ref_mean) / ref_sd
                 if pos_z_score < 0:
                     pos_z_score = -pos_z_score
+                if do_winsorize_z:
+                    pos_z_score = min(pos_z_score, max_half_z_score)
                 shifted_z_scores[
                     event_pos - curr_band_start] = z_shift - pos_z_score
             for event_pos in range(n_events - curr_band_start, bandwidth):
@@ -374,28 +402,9 @@ def c_adaptive_banded_forward_pass(
         # profiling shows that >60% of the time is spent here. Not
         # functionalized now due to function call overheads
-        for band_pos in range(1, bandwidth):
-            pos_z_score = shifted_z_scores[band_pos]
-            prev_b_pos = band_pos + curr_band_start - prev_band_start
-
-            # first set to stay state
-            max_score = fwd_pass[seq_pos+1, band_pos-1] - stay_pen + pos_z_score
-            max_from = 0
-            # then check diagonal score
-            if prev_b_pos - 1 < bandwidth:
-                diag_score = fwd_pass[seq_pos, prev_b_pos-1] + pos_z_score
-                if diag_score > max_score:
-                    max_score = diag_score
-                    max_from = 2
-            # finally check skip score (note nested check to save some ops)
-            if prev_b_pos < bandwidth:
-                skip_score = fwd_pass[seq_pos, prev_b_pos] - skip_pen
-                if skip_score > max_score:
-                    max_score = skip_score
-                    max_from = 1
-
-            fwd_pass[seq_pos + 1, band_pos] = max_score
-            fwd_pass_tb[seq_pos + 1, band_pos] = max_from
+        c_process_band(
+            fwd_pass, fwd_pass_tb, shifted_z_scores, stay_pen, skip_pen,
+            bandwidth, curr_band_start - prev_band_start, seq_pos)
 
     if return_z_scores:
         return all_shifted_z_scores
diff --git a/tombo/c_helper.pyx b/tombo/_c_helper.pyx
similarity index 72%
rename from tombo/c_helper.pyx
rename to tombo/_c_helper.pyx
index 3909a32..0b1c5a3 100644
--- a/tombo/c_helper.pyx
+++ b/tombo/_c_helper.pyx
@@ -1,3 +1,5 @@
+cimport cython
+
 import numpy as np
 cimport numpy as np
 
@@ -7,11 +9,16 @@ ctypedef np.float64_t DTYPE_t
 DTYPE_INT = np.int64
 ctypedef np.int64_t DTYPE_INT_t
 
+DTYPE_INT16 = np.int16
+ctypedef np.int16_t DTYPE_INT16_t
+
 from libc.math cimport log, exp
 cdef extern from "math.h":
     double sqrt(double m)
 
+from itertools import combinations
+
 def c_mean_std(np.ndarray[DTYPE_t] values):
     """
     More efficient method to get both mean and standard deviation
@@ -106,7 +113,7 @@ def c_valid_cpts_w_cap(
                 cand_pos - min_base_obs + 1, cand_pos + min_base_obs))
         cand_idx += 1
         if cand_idx >= num_cands:
-            raise NotImplementedError, 'Fewer changepoints found than requested'
+            raise NotImplementedError('Fewer changepoints found than requested')
 
     return cpts
 
@@ -187,14 +194,87 @@ def c_valid_cpts_w_cap_t_test(
                 cand_pos - min_base_obs + 1, cand_pos + min_base_obs))
         cand_idx += 1
         if cand_idx >= num_cands:
-            raise NotImplementedError, 'Fewer changepoints found than requested.'
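+            # candidate changepoints exhausted before the requested number of
+            # valid (non-overlapping) changepoints could be selected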
+            raise NotImplementedError('Fewer changepoints found than requested')
 
     return cpts
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef DTYPE_INT_t _c_searchsorted(
+        np.ndarray[DTYPE_INT16_t] sorted_arr, DTYPE_INT16_t value):
+    cdef DTYPE_INT_t low, mid, high
+    low = 0
+    high = sorted_arr.shape[0] - 1
+    while (low <= high):
+        mid = (low + high) / 2
+        if (sorted_arr[mid] >= value):
+            high = mid - 1
+        else:
+            low = mid + 1
+    return low
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def c_compute_running_pctl_diffs(
+        np.ndarray[DTYPE_INT16_t] arr, DTYPE_INT_t window_size,
+        DTYPE_t lower_pctl, DTYPE_t upper_pctl):
+    cdef DTYPE_INT_t lower_pctl_index = np.int32(
+        (window_size - 1) * lower_pctl / 100.0)
+    cdef DTYPE_INT_t upper_pctl_index = np.int32(
+        (window_size - 1) * upper_pctl / 100.0)
+
+    # store values in rolling circle fashion that are within the current
+    # window in order to get the value to be removed. the curr_index
+    # indicates the current start position within the rolling array
+    cdef DTYPE_INT_t curr_index = 0
+    cdef np.ndarray[DTYPE_INT16_t] rolling_arr = arr[:window_size].copy()
+    # sorted array to be maintained at each iteration
+    cdef np.ndarray[DTYPE_INT16_t] sorted_arr = rolling_arr.copy()
+    sorted_arr.sort()
+
+    cdef np.ndarray[DTYPE_INT16_t] running_pctl_diffs = np.empty(
+        arr.shape[0] - window_size + 1, dtype=DTYPE_INT16)
+    # no fill value is needed; every position is assigned below
+    running_pctl_diffs[0] = (sorted_arr[upper_pctl_index] -
+                             sorted_arr[lower_pctl_index])
+
+    cdef DTYPE_INT16_t pop_val, push_val
+    cdef DTYPE_INT_t i, pop_index, push_index, arr_i, rolling_i
+    for arr_i in range(window_size, arr.shape[0]):
+        push_val = arr[arr_i]
+        # get value to be removed from running array
+        rolling_i = curr_index % window_size
+        pop_val = rolling_arr[rolling_i]
+        # if pop and push values are equal skip to pctl diff comp
+        if pop_val != push_val:
+            # find indices for push and pop in sorted array
+            pop_index = _c_searchsorted(sorted_arr, pop_val)
+            push_index = _c_searchsorted(sorted_arr, push_val)
+            # shift appropriate values within the sorted array and
+            # add push value
+            if pop_index == push_index:
+                sorted_arr[push_index] = push_val
+            elif pop_index > push_index:
+                for i in reversed(range(push_index, pop_index)):
+                    sorted_arr[i + 1] = sorted_arr[i]
+                sorted_arr[push_index] = push_val
+            else:
+                for i in range(pop_index, push_index - 1):
+                    sorted_arr[i] = sorted_arr[i + 1]
+                sorted_arr[push_index - 1] = push_val
+            # replace pop value with push value in rolling array
+            rolling_arr[rolling_i] = push_val
+
+        curr_index += 1
+        running_pctl_diffs[curr_index] = (sorted_arr[upper_pctl_index] -
+                                          sorted_arr[lower_pctl_index])
+
+    return running_pctl_diffs
+
 def c_calc_llh_ratio(
         np.ndarray[DTYPE_t] reg_means,
-        np.ndarray[DTYPE_t] reg_ref_means, np.ndarray[DTYPE_t] reg_ref_vars,
-        np.ndarray[DTYPE_t] reg_alt_means, np.ndarray[DTYPE_t] reg_alt_vars):
+        np.ndarray[DTYPE_t] reg_ref_means, np.ndarray[DTYPE_t] reg_alt_means,
+        np.ndarray[DTYPE_t] reg_ref_vars, np.ndarray[DTYPE_t] reg_alt_vars):
     cdef DTYPE_t ref_z_sum, ref_log_var_sum, alt_z_sum, alt_log_var_sum
     ref_z_sum, ref_log_var_sum, alt_z_sum, alt_log_var_sum = 0.0, 0.0, 0.0, 0.0
     cdef DTYPE_t ref_diff, alt_diff, log_lh_ratio
@@ -272,3 +352,22 @@ def c_calc_scaled_llh_ratio_const_var(
         density_height_factor)
 
     return running_scaled_lhr
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def c_compute_slopes(
+        np.ndarray[DTYPE_t] r_event_means, np.ndarray[DTYPE_t] r_model_means,
+        DTYPE_t max_slope=1000.0):
+    cdef DTYPE_INT_t
s_i, i, j, n_events + n_events = r_event_means.shape[0] + assert r_model_means.shape[0] == n_events + cdef np.ndarray[DTYPE_t] slopes = np.empty( + (n_events * (n_events - 1) / 2), dtype=DTYPE) + for s_i, (i, j) in enumerate(combinations(range(n_events), 2)): + if r_event_means[i] == r_event_means[j]: + slopes[s_i] = max_slope + else: + slopes[s_i] = ( + r_model_means[i] - r_model_means[j]) / ( + r_event_means[i] - r_event_means[j]) + return slopes diff --git a/tombo/_default_parameters.py b/tombo/_default_parameters.py index 5892a84..a570261 100644 --- a/tombo/_default_parameters.py +++ b/tombo/_default_parameters.py @@ -5,14 +5,17 @@ ############################### # default model names +RNA_SAMP_TYPE = 'RNA' +DNA_SAMP_TYPE = 'DNA' + STANDARD_MODELS = { - 'DNA':'tombo.DNA.model', - 'RNA':'tombo.RNA.180mV.model', + DNA_SAMP_TYPE:'tombo.DNA.model', + RNA_SAMP_TYPE:'tombo.RNA.180mV.model', } ALTERNATE_MODELS = { - 'DNA_5mC':'tombo.DNA.5mC.model', - 'DNA_6mA':'tombo.DNA.6mA.model', - 'RNA_5mC':'tombo.RNA.5mC.model', + DNA_SAMP_TYPE + '_5mC':'tombo.DNA.5mC.model', + DNA_SAMP_TYPE + '_6mA':'tombo.DNA.6mA.model', + RNA_SAMP_TYPE + '_5mC':'tombo.RNA.5mC.model', } @@ -25,8 +28,8 @@ # 2) minimum observations per genomic base # 3) mean number of observations per event during segmentation SEG_PARAMS_TABLE = { - 'RNA':(12, 6, 12), - 'DNA':(5, 3, 5), + RNA_SAMP_TYPE:(12, 5, 15), + DNA_SAMP_TYPE:(5, 3, 5), } # table containing default signal to sequence assignment parameters @@ -36,50 +39,78 @@ # 3) adaptive bandwidth # 4) save adaptive bandwidth (if first bw fails) # 5) z-score winsorizing threshold +# 6) band boundary threshold +# 7) start bandwidth +# 8) start save bandwidth +# 9) start num bases ALGN_PARAMS_TABLE = { - 'RNA':(4, 8, 400, 1200, 5.0), - 'DNA':(4.2, 4.2, 250, 1200, 5.0), + RNA_SAMP_TYPE:(6, 4, 500, 1500, 20.0, 50, 1000, 3000, 250), + DNA_SAMP_TYPE:(4.2, 4.2, 300, 1500, 20.0, 40, 750, 2500, 250), } # default thresholds for filtering out reads that don't match well to # expected signal levels SIG_MATCH_THRESH = { - 'RNA':1.3, - 'DNA':1.1, + RNA_SAMP_TYPE:2, + DNA_SAMP_TYPE:1.1, } +# outlier signal winsorizing threshold +OUTLIER_THRESH = 5.0 + # factor of extra raw signal above minimum to add around skipped bases for # raw signal segment detection EXTRA_SIG_FACTOR = 1.1 -MASK_FILL_Z_SCORE = -10 MASK_BASES = 50 - -START_BANDWIDTH = 5000 -START_SEQ_WINDOW = 250 -BAND_BOUNDARY_THRESH = 5 +MASK_FILL_Z_SCORE = -15 DEL_FIX_WINDOW = 2 -MAX_DEL_FIX_WINDOW = 8 +MAX_DEL_FIX_WINDOW = 10 MAX_RAW_CPTS = 200 MIN_EVENT_TO_SEQ_RATIO = 1.1 +# special RNA scaling parameters from events to avoid adapter +USE_RNA_EVENT_SCALE = True +RNA_SCALE_NUM_EVENTS = 10000 +RNA_SCALE_MAX_FRAC_EVENTS = 0.75 + + +# collapse stalls for more robust dynamic programming results +COLLAPSE_RNA_STALLS = True +COLLAPSE_DNA_STALLS = False + +# stall identification parameters th.stallParams +# percentile stall method params +PCTL_STALL_PARAMS = dict(( + ('window_size', 400), ('threshold', 100), + ('edge_buffer', 50), ('min_consecutive_obs', 200), + ('lower_pctl', 5), ('upper_pctl', 95))) +MEAN_STALL_PARAMS = dict(( + ('window_size', 7 * 50), ('threshold', 40), + ('edge_buffer', 100), ('min_consecutive_obs', 200), + ('n_windows', 7), ('mini_window_size', 50))) +STALL_PARAMS = MEAN_STALL_PARAMS + +# mapping start clipped bases parameters th.startClipParams +START_CLIP_PARAMS = (1000, 200) + ############################ ##### Testing Defaults ##### ############################ LLR_THRESH = { - 'DNA':(-1.5, 2.5), - 'RNA':(-2.5, 2.5), + 
DNA_SAMP_TYPE:(-1.5, 2.5), + RNA_SAMP_TYPE:(-2.5, 2.5), } SAMP_COMP_THRESH = { - 'DNA':(0.15, 0.5), - 'RNA':(0.05, 0.4), + DNA_SAMP_TYPE:(0.15, 0.5), + RNA_SAMP_TYPE:(0.05, 0.4), } DE_NOVO_THRESH = { - 'DNA':(0.15, 0.5), - 'RNA':(0.05, 0.4), + DNA_SAMP_TYPE:(0.15, 0.5), + RNA_SAMP_TYPE:(0.05, 0.4), } # outlier corrected likelihood ratio parameters @@ -98,6 +129,12 @@ OCLLHR_HEIGHT = 1.0 OCLLHR_POWER = 0.2 +FM_OFFSET_DEFAULT = 1 + +# default constants for posterior estimation of control sample reference means +MEAN_PRIOR_CONST = 5 +SD_PRIOR_CONST = 40 + ##################################### ##### Model Estimation Defaults ##### @@ -131,7 +168,7 @@ # sequence-based scaling thresholds for iterative re-squiggle SHIFT_CHANGE_THRESH = 0.1 SCALE_CHANGE_THRESH = 0.1 -MAX_SCALING_ITERS=3 +MAX_SCALING_ITERS = 3 # number of reads to adjust model NUM_READS_TO_ADJUST_MODEL = 5000 @@ -151,3 +188,15 @@ # default values for dampened fraction computations COV_DAMP_COUNTS = [2, 0.5] + +# store N arrays during stat computation before re-computing the +# most significant array +MOST_SIGNIF_NUM_BATCHES_DEFAULT = 10 + +# trim values for plotting per-read stats +PLOT_PVAL_MAX, PLOT_LLR_MAX = 4, 4 + + +if __name__ == '__main__': + sys.stderr.write('This is a module. See commands with `tombo -h`') + sys.exit(1) diff --git a/tombo/_event_resquiggle.py b/tombo/_event_resquiggle.py index 1344192..91c1244 100644 --- a/tombo/_event_resquiggle.py +++ b/tombo/_event_resquiggle.py @@ -32,8 +32,8 @@ from . import tombo_stats as ts from . import tombo_helper as th -from ._default_parameters import SEG_PARAMS_TABLE -from .c_helper import c_valid_cpts, c_valid_cpts_w_cap +from ._default_parameters import SEG_PARAMS_TABLE, DNA_SAMP_TYPE, RNA_SAMP_TYPE + VERBOSE = False @@ -68,6 +68,7 @@ CIGAR_PAT = re.compile('(\d+)([MIDNSHP=X])') GAP_PAT = re.compile('-+') + ################################################# ########## Raw Signal Re-squiggle Code ########## ################################################# @@ -150,9 +151,8 @@ def extend_group(indel_group): group_end < len(align_segs) - 1) # ensure no infinite loop for large segmentation parameters if num_cpts == prev_num_cpts: - raise NotImplementedError( - 'Entire read does not contain enough ' + - 'signal to re-squiggle') + raise th.TomboError( + 'Entire read does not contain enough signal to re-squiggle') prev_num_cpts = num_cpts group_start = max(0, group_start - 1) group_end = min(len(align_segs) - 1, group_end + 1) @@ -174,14 +174,14 @@ def get_cpts(group_start, group_end, num_cpts): while maintaining min_obs_per_base between changepoints. 
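+        Returns None if enough valid changepoints cannot be identified
+        within this signal region.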
""" if num_cpts_limit is not None and num_cpts > num_cpts_limit: - raise NotImplementedError('Reached maximum number of ' + - 'changepoints for a single indel') + raise th.TomboError('Reached maximum number of changepoints ' + + 'for a single indel') try: - cpts = c_valid_cpts_w_cap( + cpts = th.valid_cpts_w_cap( raw_signal[align_segs[group_start]:align_segs[group_end]], min_obs_per_base, running_stat_width, num_cpts) # not implemented error returned when fewer cpts found than requested - except NotImplementedError: + except th.TomboError: return None cpts.sort() return cpts @@ -216,7 +216,7 @@ def extend_for_cpts( curr_group = [all_indels[0],] for indel in all_indels[1:]: if timeout is not None and time() - timeout_start > timeout: - raise NotImplementedError('Read took too long to re-segment.') + raise th.TomboError('Read took too long to re-segment.') # check if indel hits current group if max(g_indel.end for g_indel in curr_group) >= indel.start: curr_group.append(indel) @@ -255,7 +255,7 @@ def find_read_start( if starts_rel_to_read[-1] > num_obs else starts_rel_to_read if begin_read_starts.shape[0] <= 0: return norm_signal, starts_rel_to_read - signal_cpts = c_valid_cpts_w_cap( + signal_cpts = th.valid_cpts_w_cap( norm_signal[:num_obs], min_obs_per_base, running_stat_width, begin_read_starts.shape[0]) @@ -296,7 +296,7 @@ def resquiggle_read( fast5_fn, read_start_rel_to_raw, starts_rel_to_read, norm_type, outlier_thresh, alignVals, fix_read_start, timeout, num_cpts_limit, genome_loc, read_info, - basecall_group, corrected_group, compute_sd, pore_model, obs_filter, + basecall_group, corr_grp, compute_sd, pore_model, obs_filter, seg_params, in_place=True, skip_index=False): # errors should not happen here since these slotes were checked # in alignment function, but old zombie processes might cause @@ -306,7 +306,7 @@ def resquiggle_read( channel_info = th.get_channel_info(fast5_data) # extract raw data for this read - all_raw_signal = th.get_raw_read_slot(fast5_data)['Signal'].value + all_raw_signal = th.get_raw_read_slot(fast5_data)['Signal'][:] rna = th.is_read_rna(fast5_data) if rna: all_raw_signal = all_raw_signal[::-1] @@ -314,7 +314,7 @@ def resquiggle_read( if norm_type == 'pA': event_data = fast5_data[ '/Analyses/' + basecall_group + '/' + - read_info.Subgroup + '/Events'].value + read_info.Subgroup + '/Events'][:] r_event_means = event_data['mean'] r_event_kmers = list(map(lambda x: x.decode(), event_data['model_state'])) @@ -324,23 +324,23 @@ def resquiggle_read( pore_model.inv_var[kmer] for kmer in r_event_kmers]) fast5_data.close() except: - raise NotImplementedError( + raise th.TomboError( 'Error opening file for re-squiggle. This should have ' + 'been caught during the alignment phase. 
Check that there ' +
            'are no other tombo processes or processes accessing ' +
            'these HDF5 files running simultaneously.')
 
     if seg_params is None:
-        bio_samp_type = 'RNA' if rna else 'DNA'
-        (running_stat_width, min_obs_per_base,
-         _) = SEG_PARAMS_TABLE[bio_samp_type]
+        seg_params = SEG_PARAMS_TABLE[RNA_SAMP_TYPE] if rna else \
+                     SEG_PARAMS_TABLE[DNA_SAMP_TYPE]
+        (running_stat_width, min_obs_per_base, _) = seg_params
     else:
         running_stat_width, min_obs_per_base = seg_params
 
     # normalize signal (potentially using model fitting if provided)
     norm_signal, scale_values = ts.normalize_raw_signal(
         all_raw_signal, read_start_rel_to_raw, starts_rel_to_read[-1],
-        norm_type, channel_info, outlier_thresh, event_means=r_event_means,
+        norm_type, outlier_thresh, channel_info, event_means=r_event_means,
         model_means=r_model_means, model_inv_vars=r_model_inv_vars)
     if fix_read_start:
         norm_signal, read_start_rel_to_raw = find_read_start(
@@ -365,27 +365,30 @@ def resquiggle_read(
             new_segs.append(starts_rel_to_read[prev_stop:])
             new_segs = np.concatenate(new_segs).astype(np.int64)
             if np.diff(new_segs).min() < 1:
-                raise NotImplementedError(
+                raise th.TomboError(
                     'New segments include zero length events.')
             if new_segs[0] < 0:
-                raise NotImplementedError(
+                raise th.TomboError(
                     'New segments start with negative index.')
             if new_segs[-1] > norm_signal.shape[0]:
-                raise NotImplementedError(
+                raise th.TomboError(
                     'New segments end past raw signal values.')
 
     # get aligned sequence (gaps removed) from alignVals
     align_seq = ''.join(map(itemgetter(1), alignVals)).replace('-', '')
     if new_segs.shape[0] != len(align_seq) + 1:
-        raise ValueError('Aligned sequence does not match number ' +
-                         'of segments produced.')
+        raise th.TomboError('Aligned sequence does not match number ' +
+                            'of segments produced.')
 
     if in_place:
+        rsqgl_res = th.resquiggleResults(
+            align_info=read_info, genome_loc=genome_loc, genome_seq=align_seq,
+            mean_q_score=None, raw_signal=norm_signal,
+            read_start_rel_to_raw=read_start_rel_to_raw, segs=new_segs,
+            scale_values=scale_values)
         # create new hdf5 file to hold new read signal
         th.write_new_fast5_group(
-            fast5_fn, genome_loc, read_start_rel_to_raw, new_segs, align_seq,
-            norm_signal, scale_values, corrected_group, read_info.Subgroup,
-            norm_type, outlier_thresh, compute_sd, alignVals, read_info,
+            fast5_fn, corr_grp, rsqgl_res, norm_type, compute_sd, alignVals,
             starts_rel_to_read, rna)
     else:
         # create new hdf5 file to hold corrected read events
@@ -397,14 +400,17 @@ def resquiggle_read(
         base_lens = np.diff(new_segs)
         is_filtered = any(np.percentile(base_lens, pctl) > thresh
                           for pctl, thresh in obs_filter)
-        return th.prep_index_data(
-            fast5_fn, genome_loc, read_start_rel_to_raw, new_segs,
-            corrected_group, read_info.Subgroup, rna, is_filtered)
+
+        mapped_end = genome_loc.Start + len(new_segs) - 1
+        return (genome_loc.Chrom, genome_loc.Strand, th.readData(
+            genome_loc.Start, mapped_end, is_filtered,
+            read_start_rel_to_raw, genome_loc.Strand, fast5_fn,
+            corr_grp + '/' + read_info.Subgroup, rna))
 
     return
 
 def resquiggle_worker(
-        basecalls_q, failed_reads_q, index_q, basecall_group, corrected_group,
+        basecalls_q, failed_reads_q, index_q, basecall_group, corr_grp,
        norm_type, outlier_thresh, timeout, num_cpts_limit, compute_sd,
        pore_model, obs_filter, seg_params):
    num_processed = 0
@@ -434,7 +440,7 @@ def resquiggle_worker(
                    fast5_fn, read_start_rel_to_raw, starts_rel_to_read,
                    norm_type, outlier_thresh, alignVals, fix_read_start,
                    timeout, num_cpts_limit, genome_loc, read_info,
-
basecall_group, corr_grp, compute_sd, pore_model, obs_filter, seg_params, skip_index=skip_index) if not skip_index: proc_index_data.append(index_data) @@ -443,7 +449,7 @@ def resquiggle_worker( #raise try: th.write_error_status( - fast5_fn, corrected_group, read_info.Subgroup, unicode(e)) + fast5_fn, corr_grp, read_info.Subgroup, unicode(e)) except: pass failed_reads_q.put(( @@ -509,6 +515,10 @@ def fix_all_clipped_bases(batch_align_data, batch_reads_data): read_info = th.alignInfo( read_id, bc_subgroup, start_clipped_bases, end_clipped_bases, num_ins, num_del, num_match, num_mismatch) + # print genomic sequence for exact sequence comparison to resquiggle + #print('@' + read_id.decode() + '\n' + + # ''.join([str(b) for b in list(zip(*alignVals))[1] + # if b != '-']) + '\n+\n!!!') clip_fix_align_data.append((fast5_fn, ( alignVals, genome_loc, starts_rel_to_read, @@ -543,20 +553,20 @@ def clip_m5_alignment(alignVals, start, strand, chrm): alignVals = alignVals[:-1*end_clipped_align_bases] if strand == '+' and start_clipped_genome_bases > 0: - genome_loc = th.genomeLoc( + genome_loc = th.genomeLocation( start + start_clipped_genome_bases, '+', chrm) elif strand == '-' and end_clipped_genome_bases > 0: - genome_loc = th.genomeLoc( + genome_loc = th.genomeLocation( start + end_clipped_genome_bases, '-', chrm) else: - genome_loc = th.genomeLoc(start, strand, chrm) + genome_loc = th.genomeLocation(start, strand, chrm) return alignVals, start_clipped_read_bases, \ end_clipped_read_bases, genome_loc def parse_m5_record(r_m5_record): if r_m5_record['tStrand'] != '+': - raise NotImplementedError( + raise th.TomboError( 'Mapping indicates negative strand reference mapping.') if r_m5_record['qStrand'] == "+": @@ -609,7 +619,7 @@ def parse_cigar(strand): (int(reg_len), reg_type) for reg_len, reg_type in CIGAR_PAT.findall(r_sam_record['cigar'])] if len(cigar) < 1: - raise NotImplementedError('Invalid cigar string produced.') + raise th.TomboError('Invalid cigar string produced.') if strand == '-': cigar = cigar[::-1] @@ -697,7 +707,7 @@ def get_align_vals(tSeq, qSeq, cigar, strand): qSeq, start_clipped_bases, end_clipped_bases, cigar, strand) alignVals = get_align_vals(tSeq, qSeq, cigar, strand) - return (alignVals, th.genomeLoc( + return (alignVals, th.genomeLocation( int(r_sam_record['pos']) - 1, strand, r_sam_record['rName']), start_clipped_bases, end_clipped_bases) @@ -732,7 +742,7 @@ def parse_sam_output(align_output, batch_reads_data, genome_index): r_sam_record, genome_index) except Exception as e: # uncomment to identify mysterious errors - raise + #raise batch_align_failed_reads.append((unicode(e), read_fn_sg)) return batch_align_failed_reads, batch_align_data @@ -783,7 +793,7 @@ def align_to_genome(batch_reads_data, genome_fn, mapper_data, genome_index, genome_fn, read_fp.name, num_align_ps, mapper_data.index) stdout_sink = out_fp else: - raise NotImplementedError('Mapper not supported.') + raise th.TomboError('Mapper not supported.') try: exitStatus = call([mapper_data.exe,] + mapper_options, @@ -809,7 +819,7 @@ def align_to_genome(batch_reads_data, genome_fn, mapper_data, genome_index, batch_parse_failed_reads, batch_align_data = parse_m5_output( align_output, batch_reads_data) else: - raise NotImplementedError('Mapper output type not supported.') + raise th.TomboError('Mapper output type not supported.') clip_fix_align_data = fix_all_clipped_bases( batch_align_data, batch_reads_data) @@ -826,7 +836,7 @@ def fix_stay_states( event_change_state = move_states[0] while not event_change_state: if 
start_clip >= len(move_states) - 2: - raise NotImplementedError( + raise th.TomboError( 'Read is composed entirely of stay model ' + 'states and cannot be processed') start_clip += 1 @@ -862,7 +872,7 @@ def get_read_data(fast5_fn, basecall_group, basecall_subgroup): try: fast5_data = h5py.File(fast5_fn, 'r') except: - raise NotImplementedError( + raise th.TomboError( 'Error opening file for alignment. This should have ' + 'been caught during the HDF5 prep phase. Check that there ' + 'are no other tombo processes or processes accessing ' + @@ -871,14 +881,14 @@ def get_read_data(fast5_fn, basecall_group, basecall_subgroup): try: # get albacore version, or if not specified set to 0.0 albacore_version = LooseVersion(fast5_data[ - '/Analyses/' + basecall_group].attrs['version'] + '/Analyses/' + basecall_group].attrs.get('version') if 'version' in fast5_data['/Analyses/' + basecall_group].attrs else "0.0") called_dat = fast5_data[ '/Analyses/' + basecall_group + '/' + basecall_subgroup + - '/Events'].value + '/Events'][:] except: - raise NotImplementedError( + raise th.TomboError( 'No events or corrupted events in file. Likely a ' + 'segmentation error or mis-specified basecall-' + 'subgroups (--2d?).') @@ -892,7 +902,7 @@ def get_read_data(fast5_fn, basecall_group, basecall_subgroup): channel_info = th.get_channel_info(fast5_data) fast5_data.close() except: - raise NotImplementedError( + raise th.TomboError( 'Error getting channel information and closing fast5 file.') read_id = raw_attrs['read_id'] @@ -977,10 +987,10 @@ def get_read_data(fast5_fn, basecall_group, basecall_subgroup): if any(len(vals) <= 1 for vals in ( starts_rel_to_read, basecalls, called_dat['model_state'])): - raise NotImplementedError( + raise th.TomboError( 'One or no segments or signal present in read.') if min(np.diff(starts_rel_to_read)) < 1: - raise NotImplementedError( + raise th.TomboError( 'Zero length event present in input data.') # remove stay states from the base caller @@ -1027,17 +1037,17 @@ def align_and_parse( def align_reads( fast5_batch, genome_fn, mapper_data, genome_index, - basecall_group, basecall_subgroups, corrected_group, + basecall_group, basecall_subgroups, corr_grp, basecalls_q, overwrite, num_align_ps, in_place=True): batch_prep_failed_reads = [] fast5s_to_process = [] for fast5_fn in fast5_batch: prep_result = th.prep_fast5( - fast5_fn, corrected_group, overwrite, in_place, basecall_group) + fast5_fn, corr_grp, overwrite, in_place, basecall_group) if prep_result is None: fast5s_to_process.append(fast5_fn) else: - batch_prep_failed_reads.append(prep_result) + batch_prep_failed_reads.append(prep_result[:2]) batch_align_failed_reads, batch_align_data = align_and_parse( fast5s_to_process, genome_fn, mapper_data, genome_index, @@ -1053,7 +1063,7 @@ def align_reads( def alignment_worker( fast5_q, basecalls_q, failed_reads_q, genome_fn, mapper_data, basecall_group, basecall_subgroups, - corrected_group, overwrite, num_align_ps): + corr_grp, overwrite, num_align_ps): # this is only needed for sam output format (not m5) genome_index = th.Fasta(genome_fn) while not fast5_q.empty(): @@ -1065,7 +1075,7 @@ def alignment_worker( batch_failed_reads = align_reads( fast5_batch, genome_fn, mapper_data, genome_index, basecall_group, basecall_subgroups, - corrected_group, basecalls_q, overwrite, num_align_ps) + corr_grp, basecalls_q, overwrite, num_align_ps) for failed_read in batch_failed_reads: try: sg_fn = failed_read[1].split(FASTA_NAME_JOINER) @@ -1074,7 +1084,7 @@ def alignment_worker( else: subgroup, 
fast5_fn = None, sg_fn th.write_error_status( - fast5_fn, corrected_group, subgroup, failed_read[0]) + fast5_fn, corr_grp, subgroup, failed_read[0]) except: pass failed_reads_q.put(failed_read) @@ -1092,11 +1102,11 @@ def alignment_worker(*args): def resquiggle_all_reads( fast5_fns, genome_fn, mapper_data, - basecall_group, basecall_subgroups, corrected_group, norm_type, + basecall_group, basecall_subgroups, corr_grp, norm_type, outlier_thresh, timeout, num_cpts_limit, overwrite, align_batch_size, num_align_ps, align_threads_per_proc, num_resquiggle_ps, compute_sd, pore_model, skip_index, obs_filter, - seg_params): + seg_params, fast5s_dir): manager = mp.Manager() fast5_q = manager.Queue() # set maximum number of parsed basecalls to sit in the middle queue @@ -1119,7 +1129,7 @@ def resquiggle_all_reads( align_args = ( fast5_q, basecalls_q, failed_reads_q, genome_fn, mapper_data, basecall_group, basecall_subgroups, - corrected_group, overwrite, align_threads_per_proc) + corr_grp, overwrite, align_threads_per_proc) align_ps = [] for p_id in range(num_align_ps): p = mp.Process(target=alignment_worker, args=align_args) @@ -1127,7 +1137,7 @@ def resquiggle_all_reads( align_ps.append(p) rsqgl_args = (basecalls_q, failed_reads_q, index_q, basecall_group, - corrected_group, norm_type, outlier_thresh, timeout, + corr_grp, norm_type, outlier_thresh, timeout, num_cpts_limit, compute_sd, pore_model, obs_filter, seg_params) resquiggle_ps = [] @@ -1136,13 +1146,14 @@ def resquiggle_all_reads( p.start() resquiggle_ps.append(p) - if VERBOSE: th._status_message( + if VERBOSE: th.status_message( 'Correcting ' + unicode(num_reads) + ' files with ' + unicode(len(basecall_subgroups)) + ' subgroup(s)/read(s) ' + 'each (Will print a dot for each ' + unicode(PROGRESS_INTERVAL) + ' reads completed).') failed_reads = defaultdict(list) - all_index_data = [] + if index_q is not None: + reads_index = th.TomboReads([fast5s_dir,], corr_grp, for_writing=True) while any(p.is_alive() for p in align_ps): try: errorType, fn = failed_reads_q.get(block=False) @@ -1163,7 +1174,8 @@ def resquiggle_all_reads( except queue.Empty: try: proc_index_data = index_q.get(block=False) - all_index_data.extend(proc_index_data) + for index_r_data in proc_index_data: + reads_index.add_read_data(*index_r_data) except queue.Empty: sleep(1) continue @@ -1174,19 +1186,21 @@ def resquiggle_all_reads( failed_reads[errorType].append(fn) while not index_q.empty(): proc_index_data = index_q.get(block=False) - all_index_data.extend(proc_index_data) + for index_r_data in proc_index_data: + reads_index.add_read_data(*index_r_data) # print newline after read progress dots if VERBOSE: sys.stderr.write('\n') + reads_index.write_index_file() - return dict(failed_reads), all_index_data + return dict(failed_reads) def check_for_albacore(files, basecall_group, num_reads=50): has_albacore = False for fast5_fn in np.random.choice(files, num_reads): try: fast5_data = h5py.File(fast5_fn, 'r') - if fast5_data['/Analyses/' + basecall_group].attrs['name'] == \ + if fast5_data['/Analyses/' + basecall_group].attrs.get('name') == \ ALBACORE_TEXT: has_albacore = True break @@ -1194,7 +1208,7 @@ def check_for_albacore(files, basecall_group, num_reads=50): continue if not has_albacore: - th._warning_message( + th.warning_message( 'The provided FAST5 files do not ' + 'appear to contain albacore basecalling events. 
' + 'tombo is only tested on albacore formatted results ' + @@ -1211,7 +1225,7 @@ def _event_resquiggle_main(args): if all(map_exe is None for map_exe in ( args.minimap2_executable, args.bwa_mem_executable, args.graphmap_executable)): - th._error_message_and_exit( + th.error_message_and_exit( 'Must provide either a minimap2, graphmap or ' + 'bwa-mem executable.') if args.minimap2_executable is not None: @@ -1222,31 +1236,29 @@ def _event_resquiggle_main(args): else: mapper_data = mapperData(args.graphmap_executable, 'graphmap') - if VERBOSE: th._status_message('Getting file list.') + if VERBOSE: th.status_message('Getting file list.') try: - if not os.path.isdir(args.fast5_basedir): - th._error_message_and_exit( + if not os.path.isdir(args.fast5s_basedir): + th.error_message_and_exit( 'Provided --fast5-basedir is not a directory.') fast5_basedir = ( - args.fast5_basedir if args.fast5_basedir.endswith('/') else - args.fast5_basedir + '/') + args.fast5s_basedir if args.fast5s_basedir.endswith('/') else + args.fast5s_basedir + '/') files = th.get_files_list(fast5_basedir) - if not args.skip_index: - index_fn = th.get_index_fn(fast5_basedir, args.corrected_group) - if os.path.exists(index_fn): os.remove(index_fn) except OSError: - th._error_message_and_exit( + th.error_message_and_exit( 'Reads base directory, a sub-directory ' + 'or an old (hidden) index file does not appear to be ' + 'accessible. Check directory permissions.') if len(files) < 1: - th._error_message_and_exit( + th.error_message_and_exit( 'No files identified in the specified directory or ' + 'within immediate subdirectories.') check_for_albacore(files, args.basecall_group) outlier_thresh = args.outlier_threshold if ( + args.outlier_threshold is not None and args.outlier_threshold > 0) else None # resolve processor and thread arguments @@ -1267,30 +1279,29 @@ def _event_resquiggle_main(args): pore_model = None if args.normalization_type == 'pA': pore_model = ts.TomboModel( - args.pore_model_filename, is_text_model=True) + args.pore_model_filename, is_text_model=True, minimal_startup=False) obs_filter = th.parse_obs_filter(args.obs_per_base_filter) \ if 'obs_per_base_filter' in args else None - failed_reads, all_index_data = resquiggle_all_reads( + failed_reads = resquiggle_all_reads( files, args.reference_fasta, mapper_data, args.basecall_group, args.basecall_subgroups, args.corrected_group, args.normalization_type, outlier_thresh, args.timeout, args.cpts_limit, args.overwrite, args.alignment_batch_size, args.align_processes, align_threads_per_proc, num_resquiggle_ps, compute_sd, - pore_model, args.skip_index, obs_filter, args.segmentation_parameters) - if not args.skip_index: - th.write_index_file(all_index_data, index_fn, fast5_basedir) + pore_model, args.skip_index, obs_filter, args.segmentation_parameters, + fast5_basedir) fail_summary = [(err, len(fns)) for err, fns in failed_reads.items()] if len(fail_summary) > 0: total_num_failed = sum(map(itemgetter(1), fail_summary)) - th._status_message('Failed reads summary (' + unicode(total_num_failed) + + th.status_message('Failed reads summary (' + unicode(total_num_failed) + ' total failed):\n' + '\n'.join( "\t" + err + " :\t" + unicode(n_fns) for err, n_fns in sorted(fail_summary))) else: - th._status_message('All reads successfully re-squiggled!') + th.status_message('All reads successfully re-squiggled!') if args.failed_reads_filename is not None: with io.open(args.failed_reads_filename, 'wt') as fp: fp.write('\n'.join(( @@ -1299,11 +1310,6 @@ def 
_event_resquiggle_main(args):
     return
 
-def args_and_main():
-    import _option_parsers
-    event_resquiggle_main(
-        _option_parsers.get_resquiggle_parser().parse_args())
-    return
-
 if __name__ == '__main__':
-    args_and_main()
+    sys.stderr.write('This is a module. See commands with `tombo -h`')
+    sys.exit(1)
diff --git a/tombo/_filter_reads.py b/tombo/_filter_reads.py
new file mode 100644
index 0000000..46708e7
--- /dev/null
+++ b/tombo/_filter_reads.py
@@ -0,0 +1,367 @@
+from __future__ import division, unicode_literals, absolute_import
+
+from builtins import int, range, dict, map, zip
+
+import sys
+
+# Future warning from cython in h5py
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+import h5py
+
+import numpy as np
+
+from operator import itemgetter
+
+if sys.version_info[0] > 2:
+    unicode = str
+
+from . import tombo_helper as th
+
+
+VERBOSE = False
+
+
+##############################
+###### Filter Functions ######
+##############################
+
+def clear_filters(fast5s_dir, corr_grp):
+    """Clear filters applied to this directory's index files
+    """
+    reads_index = th.TomboReads([fast5s_dir,], corr_grp, remove_filtered=False)
+
+    th.status_message('Clearing all filters.')
+    reads_index.replace_index(dict(
+        (chrm_strand, [rd._replace(filtered=False) for rd in cs_reads_index])
+        for chrm_strand, cs_reads_index in reads_index))
+
+    reads_index.write_index_file()
+    th.status_message('All filters successfully cleared!')
+
+    return
+
+def print_filter_mess(
+        num_filt_reads, prev_unfilt_reads, total_reads, fast5s_dir, filter_text):
+    if prev_unfilt_reads == 0:
+        th.error_message_and_exit(
+            'No unfiltered reads present in current Tombo index.')
+
+    th.status_message(
+        'Filtered {:d} reads ({:.1%} of previously unfiltered and '.format(
+            num_filt_reads, float(num_filt_reads) / prev_unfilt_reads) +
+        '{:.1%} of all valid reads)'.format(
+            float(num_filt_reads) / total_reads) +
+        ' due to ' + filter_text + ' filter from ' + fast5s_dir + '.')
+    return
+
+def filter_reads_for_stuck(fast5s_dir, corr_grp, obs_filter):
+    """Filter reads based on some observation per base threshold criteria
+    """
+    def read_is_stuck(fast5_fn, s_grp):
+        try:
+            with h5py.File(fast5_fn, 'r') as fast5_data:
+                base_lens = th.get_single_slot_read_centric(
+                    fast5_data, 'length', s_grp)
+                if base_lens is None: return True
+                return any(np.percentile(base_lens, pctl) > thresh
+                           for pctl, thresh in obs_filter)
+        except:
+            return True
+
+
+    reads_index = th.TomboReads([fast5s_dir,], corr_grp, remove_filtered=False)
+
+    th.status_message('Filtering stuck reads.')
+    filt_reads_index = {}
+    prev_unfilt_reads, num_filt_reads, total_reads = 0, 0, 0
+    for chrm_strand, cs_reads in reads_index:
+        cs_filt_reads = []
+        for rd in cs_reads:
+            total_reads += 1
+            if not rd.filtered:
+                prev_unfilt_reads += 1
+                if read_is_stuck(rd.fn, rd.corr_group):
+                    num_filt_reads += 1
+                    rd = rd._replace(filtered=True)
+            cs_filt_reads.append(rd)
+        filt_reads_index[chrm_strand] = cs_filt_reads
+
+    print_filter_mess(num_filt_reads, prev_unfilt_reads, total_reads,
+                      fast5s_dir, 'observations per base')
+
+    reads_index.replace_index(filt_reads_index)
+    reads_index.write_index_file()
+
+    return
+
+def filter_reads_for_coverage(fast5s_dir, corr_grp, frac_to_filter):
+    """Filter reads at higher coverage regions
+    """
+    reads_index = th.TomboReads([fast5s_dir,], corr_grp, remove_filtered=False)
+
+    th.status_message('Filtering reads to obtain more uniform coverage.')
+    filt_reads_index = dict((cs, []) for cs in reads_index.get_all_cs())
+    unfilt_reads = []
+    unfilt_reads_cov = []
+    total_reads = 0
+    for chrm_strand, cs_reads in reads_index:
+        # TODO: perform coverage computation iteratively for larger fractions
+        # of reads requested to avoid canceling out high coverage locations
+
+        # compute coverage
+        max_end = max(rd.end for rd in cs_reads)
+        cs_coverage = np.zeros(max_end, dtype=np.int64)
+        for rd in cs_reads:
+            total_reads += 1
+            if rd.filtered:
+                # add previously filtered reads straight back to new index
+                filt_reads_index[chrm_strand].append(rd)
+            cs_coverage[rd.start:rd.end] += 1
+        # assign coverage value to each read
+        for rd in (rd for rd in cs_reads if not rd.filtered):
+            # add approximate coverage from middle of read
+            # faster than mean over the whole read
+            unfilt_reads_cov.append(cs_coverage[
+                rd.start + ((rd.end - rd.start) // 2)])
+            unfilt_reads.append((chrm_strand, rd))
+
+    prev_unfilt_reads = len(unfilt_reads)
+    if prev_unfilt_reads == 0:
+        th.error_message_and_exit(
+            'No unfiltered reads present in current Tombo index.')
+    num_filt_reads = int(frac_to_filter * prev_unfilt_reads)
+    print_filter_mess(num_filt_reads, prev_unfilt_reads, total_reads,
+                      fast5s_dir, 'even coverage')
+
+    # create probabilities array with coverage values normalized to sum to 1
+    unfilt_reads_cov = np.array(unfilt_reads_cov, dtype=np.float)
+    unfilt_reads_p = unfilt_reads_cov / unfilt_reads_cov.sum()
+    # randomly choose reads to filter
+    filt_indices = set(np.random.choice(
+        prev_unfilt_reads, size=num_filt_reads, replace=False, p=unfilt_reads_p))
+    for i, (chrm_strand, rd) in enumerate(unfilt_reads):
+        if i in filt_indices:
+            rd = rd._replace(filtered=True)
+        filt_reads_index[chrm_strand].append(rd)
+
+    reads_index.replace_index(filt_reads_index)
+    reads_index.write_index_file()
+
+    return
+
+def filter_reads_for_qscore(fast5s_dir, bc_grp, corr_grp, q_score_thresh):
+    """Filter reads based on mean q-score
+    """
+    def read_fails_q_score(fast5_fn, s_grp):
+        try:
+            with h5py.File(fast5_fn, 'r') as fast5_data:
+                r_q_scores = fast5_data['/Analyses/' + bc_grp + '/' + s_grp +
+                                        '/Fastq'][:].decode().split('\n')[3]
+                return th.get_mean_q_score(r_q_scores) < q_score_thresh
+        except:
+            return True
+
+
+    reads_index = th.TomboReads([fast5s_dir,], corr_grp, remove_filtered=False)
+
+    th.status_message('Filtering reads below a mean q-score cutoff.')
+    filt_reads_index = dict((cs, []) for cs in reads_index.get_all_cs())
+    num_filt_reads, prev_unfilt_reads, total_reads = 0, 0, 0
+    for chrm_strand, cs_reads in reads_index:
+        for rd in cs_reads:
+            total_reads += 1
+            if rd.filtered:
+                filt_reads_index[chrm_strand].append(rd)
+                continue
+            prev_unfilt_reads += 1
+            if rd.mean_q_score is None:
+                filter_read = read_fails_q_score(
+                    rd.fn, rd.corr_group.split('/')[-1])
+            else:
+                filter_read = rd.mean_q_score < q_score_thresh
+            if filter_read:
+                num_filt_reads += 1
+                rd = rd._replace(filtered=True)
+            filt_reads_index[chrm_strand].append(rd)
+
+    print_filter_mess(num_filt_reads, prev_unfilt_reads, total_reads,
+                      fast5s_dir, 'q-score')
+
+    reads_index.replace_index(filt_reads_index)
+    reads_index.write_index_file()
+
+    return
+
+def filter_reads_for_signal_matching(fast5s_dir, corr_grp, sig_match_thresh):
+    """Filter reads based on observed to expected signal matching score
+    """
+    def read_fails_matching_score(fast5_fn, corr_group):
+        try:
+            with h5py.File(fast5_fn, 'r') as fast5_data:
+                return fast5_data['/Analyses/' + corr_group].attrs.get(
+                    'signal_match_score') > sig_match_thresh
+        except:
+            return True
+
+
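The q-score filter above prefers the mean q-score stored in the Tombo index, falling back to re-reading the ``Fastq`` slot when the index predates that field. A minimal sketch of the underlying computation, assuming ``th.get_mean_q_score`` simply averages Phred-scaled quality values (the helper name below is illustrative, not part of Tombo)::

    def mean_q_score(qual_str, phred_offset=33):
        # average Phred-scaled base quality over the read's FASTQ quality line
        return sum(ord(q) - phred_offset for q in qual_str) / float(len(qual_str))

    # reads whose mean q-score falls below --q-score are marked filtered
    assert mean_q_score('IIII') == 40.0  # 'I' encodes Phred 40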
reads_index = th.TomboReads([fast5s_dir,], corr_grp, remove_filtered=False) + + th.status_message('Filtering reads above a signal matching score threshold.') + filt_reads_index = dict((cs, []) for cs in reads_index.get_all_cs()) + num_filt_reads, prev_unfilt_reads, total_reads = 0, 0, 0 + for chrm_strand, cs_reads in reads_index: + for rd in cs_reads: + total_reads += 1 + if rd.filtered: + filt_reads_index[chrm_strand].append(rd) + continue + prev_unfilt_reads += 1 + if rd.sig_match_score is None: + filter_read = read_fails_matching_score(rd.fn, rd.corr_group) + else: + filter_read = rd.sig_match_score > sig_match_thresh + if filter_read: + num_filt_reads += 1 + rd = rd._replace(filtered=True) + filt_reads_index[chrm_strand].append(rd) + + print_filter_mess(num_filt_reads, prev_unfilt_reads, total_reads, + fast5s_dir, 'signal matching') + + reads_index.replace_index(filt_reads_index) + reads_index.write_index_file() + + return + +def filter_reads_for_genome_pos( + fast5s_dir, corr_grp, include_regs, include_partial=False): + """Filter reads to include or exclude genomic regions + """ + def read_included(start, end, chrm_include_regs): + if chrm_include_regs is None: + return True + if include_partial: + # include all reads partially overlapping regions + return any(not (start > i_end or end < i_start) + for i_start, i_end in chrm_include_regs) + + # test if read is completely contained within the interval + return any((start >= i_start and end <= i_end) + for i_start, i_end in chrm_include_regs) + + + reads_index = th.TomboReads([fast5s_dir,], corr_grp, remove_filtered=False) + + th.status_message('Filtering reads outside of the specified ' + + 'genomic location.') + filt_reads_index = dict((cs, []) for cs in reads_index.get_all_cs()) + num_filt_reads, prev_unfilt_reads, total_reads = 0, 0, 0 + for (chrm, strand), cs_reads in reads_index: + do_filter_cs_reads = chrm not in include_regs + for rd in cs_reads: + total_reads += 1 + if rd.filtered: + filt_reads_index[(chrm, strand)].append(rd) + continue + prev_unfilt_reads += 1 + if do_filter_cs_reads or not read_included( + rd.start, rd.end, include_regs[chrm]): + num_filt_reads += 1 + rd = rd._replace(filtered=True) + filt_reads_index[(chrm, strand)].append(rd) + + print_filter_mess(num_filt_reads, prev_unfilt_reads, total_reads, + fast5s_dir, 'genomic position') + + reads_index.replace_index(filt_reads_index) + reads_index.write_index_file() + + return + + +################################### +###### Filter Main Functions ###### +################################### + +def _clear_filters_main(args): + for fast5s_dir in args.fast5_basedirs: + clear_filters(fast5s_dir, args.corrected_group) + + return + +def _filter_stuck_main(args): + obs_filter = th.parse_obs_filter(args.obs_per_base_filter) + for fast5s_dir in args.fast5_basedirs: + filter_reads_for_stuck(fast5s_dir, args.corrected_group, obs_filter) + + return + +def _filter_coverage_main(args): + if not 0 < args.percent_to_filter < 100: + th.error_message_and_exit( + '--percent-to-filter must be between 0 and 100.') + + for fast5s_dir in args.fast5_basedirs: + filter_reads_for_coverage( + fast5s_dir, args.corrected_group, args.percent_to_filter / 100.0) + + return + +def _filter_q_score_main(args): + if not 0 < args.q_score < 40: + th.error_message_and_exit('--q-score must be between 0 and 40.') + + for fast5s_dir in args.fast5_basedirs: + filter_reads_for_qscore( + fast5s_dir, args.basecall_group, args.corrected_group, args.q_score) + + return + +def _filter_signal_matching_main(args): + 
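``filter_reads_for_genome_pos`` above reduces to simple interval arithmetic: with ``--include-partial-overlap`` a read passes if it overlaps any included interval at all; otherwise it must be completely contained in one. The same test as a standalone sketch (mirroring the ``read_included`` helper above)::

    def read_included(start, end, chrm_include_regs, include_partial=False):
        # chrm_include_regs: (i_start, i_end) intervals on the read's chromosome
        if chrm_include_regs is None:
            return True
        if include_partial:
            # intervals overlap unless one ends before the other starts
            return any(not (start > i_end or end < i_start)
                       for i_start, i_end in chrm_include_regs)
        # otherwise require complete containment within a single interval
        return any(start >= i_start and end <= i_end
                   for i_start, i_end in chrm_include_regs)

    assert read_included(10, 20, [(0, 15)], include_partial=True)
    assert not read_included(10, 20, [(0, 15)])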
+    if not 0 < args.signal_matching_score < 10:
+        th.error_message_and_exit(
+            '--signal-matching-score must be between 0 and 10.')
+
+    for fast5s_dir in args.fast5_basedirs:
+        filter_reads_for_signal_matching(
+            fast5s_dir, args.corrected_group, args.signal_matching_score)
+
+    return
+
+def _filter_genome_pos_main(args):
+    include_regs = th.parse_genome_regions(args.include_regions)
+
+    for fast5s_dir in args.fast5_basedirs:
+        filter_reads_for_genome_pos(
+            fast5s_dir, args.corrected_group, include_regs,
+            args.include_partial_overlap)
+
+    return
+
+def filter_main(args):
+    global VERBOSE
+    VERBOSE = not args.quiet
+    th.VERBOSE = VERBOSE
+
+    if args.action_command == 'clear_filters':
+        _clear_filters_main(args)
+    elif args.action_command == 'genome_locations':
+        _filter_genome_pos_main(args)
+    elif args.action_command == 'stuck':
+        _filter_stuck_main(args)
+    elif args.action_command == 'level_coverage':
+        _filter_coverage_main(args)
+    elif args.action_command == 'q_score':
+        _filter_q_score_main(args)
+    elif args.action_command == 'raw_signal_matching':
+        _filter_signal_matching_main(args)
+    else:
+        th.error_message_and_exit('Invalid Tombo filter command.')
+
+    return
+
+
+if __name__ == '__main__':
+    sys.stderr.write('This is a module. See commands with `tombo -h`')
+    sys.exit(1)
diff --git a/tombo/_option_parsers.py b/tombo/_option_parsers.py
index 9f5221f..514c8de 100644
--- a/tombo/_option_parsers.py
+++ b/tombo/_option_parsers.py
@@ -11,7 +11,8 @@ from ._default_parameters import (
     SEG_PARAMS_TABLE, ALGN_PARAMS_TABLE, LLR_THRESH, SAMP_COMP_THRESH,
     DE_NOVO_THRESH, ALTERNATE_MODELS, MAX_SCALING_ITERS, ALT_EST_PCTL,
-    COV_DAMP_COUNTS, SIG_MATCH_THRESH)
+    COV_DAMP_COUNTS, SIG_MATCH_THRESH, FM_OFFSET_DEFAULT, OUTLIER_THRESH,
+    DNA_SAMP_TYPE, RNA_SAMP_TYPE, MEAN_PRIOR_CONST, SD_PRIOR_CONST)
 
 ALT_BASES = tuple(set(alt_name.split('_')[1] for alt_name in ALTERNATE_MODELS))
 
@@ -20,7 +21,7 @@
 ###### Positional arguments ######
 ##################################
 
-basedir_opt=('fast5_basedir', {
+basedir_opt=('fast5s_basedir', {
     'type':unicode,
     'help':'Directory containing fast5 files. All files ending in "fast5" ' +
     'found recursively within this base directory will be processed.'})
@@ -219,10 +220,12 @@
 proc_opt=('--processes', {
     'type':int, 'help':'Number of processes. Default: %(default)d'})
 thrpp_opt=('--threads-per-process', {
-    'type':int,
+    'type':int, 'default':1,
     'help':'Number of file input/output and mapping threads per compute ' +
     'process [--processes]. This should likely be left at 1, but may ' +
     'improve performance on some systems. Default: %(default)d'})
+hidthrpp_opt=('--threads-per-process', {
+    'type':int, 'default':1, 'help':argparse.SUPPRESS})
 alignproc_opt=('--align-processes', {
     'type':int, 'default':1,
@@ -249,6 +252,12 @@
     'help':'Size of regions over which to multiprocess statistics ' +
     'computation. For very deep samples a smaller value is recommended ' +
     'in order to control memory consumption. Default: %(default)d'})
+mstsgnf_opt=('--num-most-significant-stored', {
+    'default':100000, 'type':int,
+    'help':'Number of the most significant sites to store for faster access. ' +
+    'If a longer list of most significant sites is required the list must be ' +
+    're-computed from all batches. Very large values can increase RAM usage. ' +
+    'Default: %(default)d'})
 timeout_opt=('--timeout', {
     'type':int,
     'help':'Timeout in seconds for processing a single read. ' +
@@ -297,7 +306,7 @@
     'instead of raw signal. Default: %(default)d'})
 fmo_opt=('--fishers-method-context', {
-    'type':int, 'default':1,
+    'type':int, 'default':FM_OFFSET_DEFAULT,
     'help':'Number of context bases up and downstream over which to compute ' +
     "Fisher's method combined p-values. Note: Not applicable " +
     "for alternative model likelihood ratio tests. Default: %(default)d."})
@@ -306,24 +315,33 @@
     'help':'Number of reads required at a position to perform significance ' +
     'testing or contribute to model estimation. Default: %(default)d'})
+allspb_opt=('--statistics-per-block', {
+    'type':int,
+    'help':'Number of randomly selected per-read, per-base statistics to ' +
+    'extract from each genomic block for plotting. Default: Include all stats'})
 spb_opt=('--statistics-per-block', {
-    'type':int, 'default':100000,
+    'type':int,
     'help':'Number of randomly selected per-read, per-base statistics to ' +
     'extract from each genomic block for plotting. Default: %(default)d'})
 tsl_opt=('--total-statistics-limit', {
-    'type':int, 'default':5000000,
+    'type':int,
     'help':'Total per-read statistics to be extracted for plotting. ' +
     'Avoids memory overflow for large runs. Default: %(default)d'})
+dynerr_opt=('--num-most-common-errors', {
+    'type':int, 'default':0,
+    'help':'Dynamically show this many of the most common errors so far ' +
+    'through the run. Default: 0; just show progress'})
 segpars_opt=('--segmentation-parameters', {
-    'type':int, 'nargs':3,
-    'help':'Specify the 3 parameters for segmentation 1) running neighboring ' +
+    'type':int, 'nargs':len(next(iter(SEG_PARAMS_TABLE.values()))),
+    'help':'Specify parameters for segmentation 1) running neighboring ' +
     'windows width 2) minimum raw observations per genomic base 3) mean raw ' +
     'observations per event. Sample type defaults: ' +
     ' || '.join((bst + ' : ' + ' '.join(map(str, params)))
                 for bst, params in SEG_PARAMS_TABLE.items())})
 hidsegpars_opt=('--segmentation-parameters', {
-    'type':int, 'nargs':3, 'help':argparse.SUPPRESS})
+    'type':int, 'nargs':len(next(iter(SEG_PARAMS_TABLE.values()))),
+    'help':argparse.SUPPRESS})
 segpars2_opt=('--segmentation-parameters', {
     'type':int, 'nargs':2,
     'help':'Specify the 2 parameters for segmentation 1) running neighboring ' +
@@ -400,6 +418,10 @@
     'help':'Use a standard log likelihood ratio (LLR) statistic. Default ' +
     'is to use an outlier-robust LLR-like statistic. Detail in full ' +
     'online documentation.'})
+prtovlp_opt=('--include-partial-overlap', {
+    'default':False, 'action':'store_true',
+    'help':'Include reads that partially overlap the specified region. ' +
+    'Default: Only include reads completely contained in a specified region'})
 
 readmean_opt=('--read-mean', {
     'default':False, 'action':'store_true',
@@ -417,6 +439,11 @@
 pstdmod_opt=('--plot-standard-model', {
     'default':False, 'action':'store_true',
     'help':"Add default standard model distribution to the plot."})
+samponly_opt=('--sample-only-estimates', {
+    'default':False, 'action':'store_true',
+    'help':"Only use canonical sample to estimate expected signal level and " +
+    "spread. Default: Use canonical model to improve estimates (esp. for " +
+    "low coverage regions) using Bayesian posterior estimates."})
 
 quiet_opt=(('--quiet', '-q'), {
     'default':False, 'action':'store_true',
@@ -428,11 +455,11 @@
 ##############################
 
 otlthresh_opt=('--outlier-threshold', {
-    'default':5, 'type':float,
+    'type':float, 'default':OUTLIER_THRESH,
     'help':'Winsorize the signal at this number of scale values. ' +
     'Negative value disables outlier clipping. Default: %(default)f'})
 hidotlthresh_opt=('--outlier-threshold', {
-    'default':5, 'type':float, 'help':argparse.SUPPRESS})
+    'type':float, 'default':OUTLIER_THRESH, 'help':argparse.SUPPRESS})
 
 snglrdthrsh_opt=('--single-read-threshold', {
     'type':float, 'nargs':'+',
@@ -495,8 +522,8 @@
     'Default: %(default)f'})
 sms_opt=('--signal-matching-score', {
     'type':float,
-    'help':'Mean half normal z-score threshold for filtering reads with ' +
-    'poor raw to expected signal matching. Signal type defaults: ' +
+    'help':'Observed to expected signal matching score (higher score ' +
+    'indicates poor matching). Sample type defaults: ' +
     ' || '.join(bst + ' : ' + str(params)
                 for bst, params in SIG_MATCH_THRESH.items())})
 fxdscl_opt=('--fixed-scale', {
@@ -507,20 +534,27 @@
 cvgdmp_opt=('--coverage-dampen-counts', {
     'type':float, 'nargs':2, 'default':COV_DAMP_COUNTS,
     'help':'Dampen fraction modified estimates for low coverage sites. Two ' +
-    'parameters are unmodified and modified psuedo read counts. This is ' +
+    'parameters are unmodified and modified pseudo read counts. This is ' +
     'equivalent to a beta prior on the fraction estimate. Set to "0 0" to ' +
     'disable dampened fraction estimation. Default: %(default)s'})
+prwht_opt=('--model-prior-weights', {
+    'type':float, 'nargs':2, 'default':[MEAN_PRIOR_CONST, SD_PRIOR_CONST],
+    'help':'Prior weights (one each for mean and spread) applied to ' +
+    'canonical base model for estimating posterior model parameters for ' +
+    'sample comparison. Default: %(default)s'})
 sigapars_opt=('--signal-align-parameters', {
-    'type':float, 'nargs':5,
-    'help':'Specify the 4 parameters for signal to genome sequence alignment ' +
+    'type':float, 'nargs':len(next(iter(ALGN_PARAMS_TABLE.values()))),
+    'help':'Specify the parameters for signal to genome sequence alignment ' +
     'algorithm 1) match expected value 2) skip penalty 3) bandwidth 4) save ' +
     'bandwidth (if read fails with bandwidth) 5) z-score winsorizing ' +
-    'threshold. Sample type defaults: ' + ' || '.join(
-        (bst + ' : ' + ' '.join(map(str, params)))
-        for bst, params in ALGN_PARAMS_TABLE.items())})
+    'threshold 6) bandwidth boundary threshold 7) start bandwidth 8) start ' +
+    'save bandwidth 9) start num bases. Sample type defaults: ' +
+    ' || '.join((bst + ' : ' + ' '.join(map(str, params)))
+                for bst, params in ALGN_PARAMS_TABLE.items())})
 hidsigapars_opt=('--signal-align-parameters', {
-    'type':float, 'nargs':5, 'help':argparse.SUPPRESS})
+    'type':float, 'nargs':len(next(iter(ALGN_PARAMS_TABLE.values()))),
+    'help':argparse.SUPPRESS})
 
 
 ##############################
@@ -566,11 +600,11 @@
     'Default: "coverage"'})
 
 dna_opt=('--dna', {
-    'dest':'bio_sample_type', 'action':'store_const', 'const':'DNA',
+    'dest':'seq_sample_type', 'action':'store_const', 'const':DNA_SAMP_TYPE,
     'help':'Explicitly select canonical DNA model. Default: Automatically ' +
     'determine from FAST5s'})
 rna_opt=('--rna', {
-    'dest':'bio_sample_type', 'action':'store_const', 'const':'RNA',
+    'dest':'seq_sample_type', 'action':'store_const', 'const':RNA_SAMP_TYPE,
     'help':'Explicitly select canonical RNA model. Default: Automatically ' +
     'determine from FAST5s'})
@@ -600,6 +634,7 @@ def add_misc_args(parser):
 
 def add_common_testing_args(parser):
     io_args = parser.add_argument_group('Output Argument')
     io_args.add_argument(prstatbn_opt[0], **prstatbn_opt[1])
+    io_args.add_argument(mstsgnf_opt[0], **mstsgnf_opt[1])
 
     multi_args = parser.add_argument_group('Multiprocessing Arguments')
     multi_args.add_argument(mpreg_opt[0], **mpreg_opt[1])
@@ -651,7 +686,7 @@ def get_resquiggle_parser():
     multi_args = parser.add_argument_group('Multiprocessing Arguments')
     multi_args.add_argument(proc_opt[0], default=1, **proc_opt[1])
-    multi_args.add_argument(thrpp_opt[0], default=1, **thrpp_opt[1])
+    multi_args.add_argument(hidthrpp_opt[0], **hidthrpp_opt[1])
 
     fast5_args = parser.add_argument_group('FAST5 Data Arguments')
     fast5_args.add_argument(corrgrp_opt[0], **corrgrp_opt[1])
@@ -661,6 +696,7 @@ def get_resquiggle_parser():
     io_args = parser.add_argument_group('Input/Output Arguments')
     io_args.add_argument(failed_opt[0], **failed_opt[1])
+    io_args.add_argument(dynerr_opt[0], **dynerr_opt[1])
 
     hid_args = parser.add_argument_group('Advanced Arguments')
     hid_args.add_argument(printadv_opt[0], **printadv_opt[1])
@@ -696,6 +732,7 @@ def print_advanced_resquiggle():
     hid_args.add_argument(skpidx_opt[0], **skpidx_opt[1])
     hid_args.add_argument(incldsd_opt[0], **incldsd_opt[1])
     hid_args.add_argument(ignrlock_opt[0], **ignrlock_opt[1])
+    hid_args.add_argument(thrpp_opt[0], **thrpp_opt[1])
 
     hid_args.add_argument(*help_opt[0], **help_opt[1])
@@ -878,7 +915,7 @@ def get_estimate_scale_parser():
 
 def get_de_novo_test_signif_parser():
     parser = argparse.ArgumentParser(
         description='Test for significant shifts in raw nanopore signal ' +
-        'against either a canonical model.', add_help=False)
+        'away from a canonical base expected signal model.', add_help=False)
     req_args = parser.add_argument_group('Required Argument')
     req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1])
     req_args.add_argument(statbsnm_opt[0], required=True, **statbsnm_opt[1])
@@ -892,6 +929,7 @@ def get_de_novo_test_signif_parser():
     test_args.add_argument(fmo_opt[0], **fmo_opt[1])
     test_args.add_argument(minreads_opt[0], default=1, **minreads_opt[1])
     test_args.add_argument(dnthresh_opt[0], **dnthresh_opt[1])
+    test_args.add_argument(cvgdmp_opt[0], **cvgdmp_opt[1])
 
     io_args, multi_args = add_common_testing_args(parser)
     fast5_args, misc_args, parser = add_default_args(parser)
@@ -901,7 +939,7 @@
 def get_alt_test_signif_parser():
     parser = argparse.ArgumentParser(
         description='Test for significant shifts in raw nanopore signal ' +
-        'which match a specific non-canonical base model.', add_help=False)
+        'specifically matching a non-canonical base model.', add_help=False)
     req_args = parser.add_argument_group('Required Argument')
     req_args.add_argument(fast5dir_opt[0], **fast5dir_opt[1])
     req_args.add_argument(statbsnm_opt[0], **statbsnm_opt[1])
@@ -918,6 +956,7 @@ def get_alt_test_signif_parser():
     test_args.add_argument(minreads_opt[0], default=1, **minreads_opt[1])
     test_args.add_argument(altthresh_opt[0], **altthresh_opt[1])
     test_args.add_argument(stdllhr_opt[0], **stdllhr_opt[1])
+    test_args.add_argument(cvgdmp_opt[0], **cvgdmp_opt[1])
 
     io_args, multi_args = add_common_testing_args(parser)
     fast5_args, misc_args, parser = add_default_args(parser)
@@ -927,18 +966,26 @@
 def get_samp_comp_test_signif_parser():
     parser = argparse.ArgumentParser(
         description='Test for significant shifts in raw nanopore 
signal ' + - 'against either a model, a set of two models or another sequencing ' + - 'sample.', add_help=False) + 'away from a control/canonical base only sample (usually ' + + 'PCR for DNA or IVT for RNA).', add_help=False) req_args = parser.add_argument_group('Required Argument') req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) req_args.add_argument(statbsnm_opt[0], required=True, **statbsnm_opt[1]) req_args.add_argument( ctrlfast5dir_opt[0], required=True, **ctrlfast5dir_opt[1]) + alt_args = parser.add_argument_group('Model Prior Arguments') + alt_args.add_argument(samponly_opt[0], **samponly_opt[1]) + alt_args.add_argument(prwht_opt[0], **prwht_opt[1]) + alt_args.add_argument(dna_opt[0], **dna_opt[1]) + alt_args.add_argument(rna_opt[0], **rna_opt[1]) + alt_args.add_argument(hidden_tbmod_opt[0], **hidden_tbmod_opt[1]) + test_args = parser.add_argument_group('Significance Test Arguments') test_args.add_argument(fmo_opt[0], **fmo_opt[1]) test_args.add_argument(minreads_opt[0], default=1, **minreads_opt[1]) test_args.add_argument(scompthresh_opt[0], **scompthresh_opt[1]) + test_args.add_argument(cvgdmp_opt[0], **cvgdmp_opt[1]) io_args, multi_args = add_common_testing_args(parser) fast5_args, misc_args, parser = add_default_args(parser) @@ -951,11 +998,18 @@ def get_aggregate_per_read_parser(): '(genomic base) statistics file.', add_help=False) req_args = parser.add_argument_group('Required Argument') req_args.add_argument(prstat_opt[0], required=True, **prstat_opt[1]) - req_args.add_argument(statbsnm_opt[0], required=True, **statbsnm_opt[1]) + req_args.add_argument(statfn_opt[0], required=True, **statfn_opt[1]) req_args.add_argument(snglrdthrsh_opt[0], required=True, **snglrdthrsh_opt[1]) test_args = parser.add_argument_group('Significance Test Arguments') test_args.add_argument(minreads_opt[0], default=1, **minreads_opt[1]) + test_args.add_argument(cvgdmp_opt[0], **cvgdmp_opt[1]) + + io_args = parser.add_argument_group('Output Argument') + io_args.add_argument(mstsgnf_opt[0], **mstsgnf_opt[1]) + + multi_args = parser.add_argument_group('Multiprocessing Arguments') + multi_args.add_argument(proc_opt[0], default=1, **proc_opt[1]) fast5_args, misc_args, parser = add_default_args(parser) @@ -1055,7 +1109,10 @@ def get_filter_genome_pos_parser(): req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) req_args.add_argument(incldreg_opt[0], **incldreg_opt[1]) - fast5_args = parser.add_argument_group('FAST5 Data Arguments') + filt_args = parser.add_argument_group('Filter Argument') + filt_args.add_argument(prtovlp_opt[0], **prtovlp_opt[1]) + + fast5_args = parser.add_argument_group('FAST5 Data Argument') fast5_args.add_argument(corrgrp_opt[0], **corrgrp_opt[1]) misc_args, parser = add_misc_args(parser) @@ -1306,14 +1363,12 @@ def get_roc_parser(): out_args = parser.add_argument_group('Output Arguments') out_args.add_argument(pdf_opt[0], - default=OUTPUT_BASE + '.roc.pdf', - **pdf_opt[1]) - - filt_args = parser.add_argument_group('Filtering Arguments') - filt_args.add_argument(minreads_opt[0], default=1, **minreads_opt[1]) + default=OUTPUT_BASE + '.roc.pdf', + **pdf_opt[1]) - stat_args = parser.add_argument_group('Statistical Argument') - stat_args.add_argument(cvgdmp_opt[0], **cvgdmp_opt[1]) + limit_args = parser.add_argument_group('Down-sampling Arguments') + limit_args.add_argument(allspb_opt[0], **allspb_opt[1]) + limit_args.add_argument(tsl_opt[0], default=5000000, **tsl_opt[1]) misc_args, parser = add_misc_args(parser) @@ -1329,8 +1384,8 @@ def 
get_per_read_roc_parser(): req_args.add_argument(fasta_opt[0], required=True, **fasta_opt[1]) limit_args = parser.add_argument_group('Down-sampling Arguments') - limit_args.add_argument(spb_opt[0], **spb_opt[1]) - limit_args.add_argument(tsl_opt[0], **tsl_opt[1]) + limit_args.add_argument(spb_opt[0], default=100000, **spb_opt[1]) + limit_args.add_argument(tsl_opt[0], default=5000000, **tsl_opt[1]) out_args = parser.add_argument_group('Output Arguments') out_args.add_argument(pdf_opt[0], @@ -1391,9 +1446,6 @@ def get_browser_files_parser(): out_args.add_argument(brsrfn_opt[0], **brsrfn_opt[1]) out_args.add_argument(ftypes_opt[0], **ftypes_opt[1]) - stat_args = parser.add_argument_group('Statistical Argument') - stat_args.add_argument(cvgdmp_opt[0], **cvgdmp_opt[1]) - fast5_args, misc_args, parser = add_default_args(parser) return parser @@ -1420,14 +1472,11 @@ def get_write_signif_diff_parser(): seqs_opt[0], default=OUTPUT_BASE + '.significant_regions.fasta', **seqs_opt[1]) - stat_args = parser.add_argument_group('Statistical Argument') - stat_args.add_argument(cvgdmp_opt[0], **cvgdmp_opt[1]) - fast5_args, misc_args, parser = add_default_args(parser) return parser if __name__ == '__main__': - raise NotImplementedError( - 'This is a module. See commands with `tombo -h`') + sys.stderr.write('This is a module. See commands with `tombo -h`') + sys.exit(1) diff --git a/tombo/plot_commands.py b/tombo/_plot_commands.py similarity index 61% rename from tombo/plot_commands.py rename to tombo/_plot_commands.py index 20515d3..08563c0 100644 --- a/tombo/plot_commands.py +++ b/tombo/_plot_commands.py @@ -27,7 +27,8 @@ from . import tombo_stats as ts from . import tombo_helper as th -from ._default_parameters import SMALLEST_PVAL +from ._default_parameters import SMALLEST_PVAL, PLOT_PVAL_MAX, PLOT_LLR_MAX + VERBOSE = False @@ -43,9 +44,10 @@ try: from rpy2 import robjects as r from rpy2.robjects.packages import importr + _R_DF = r.DataFrame(()) except: - # pass here and raise error when main functions are actually called - # in order to give specific error message + # pass here and print detailed error when main functions are actually called + # in to give specific error message pass _PROFILE_PLOT_MAX = False @@ -55,79 +57,83 @@ #### ROC Curves #### #################### -def parse_motif_descs(stat_motif_descs): - parsed_motif_descs = [] - try: - for motif_desc in stat_motif_descs.split('::'): - raw_motif, mod_pos, mod_name = motif_desc.split(':') - motif = th.TomboMotif(raw_motif, int(mod_pos)) - parsed_motif_descs.append((motif, mod_name)) - except: - th._error_message_and_exit( - 'Invalid motif decriptions format. 
Format descriptions as: ' + - '"motif:mod_pos:name[::motif2:mod_pos2:name2...]".') - - return parsed_motif_descs - -def plot_roc(stats_fns, motif_descs, fasta_fn, min_reads, pdf_fn, - cov_damp_counts): +def plot_roc( + stats_fns, motif_descs, fasta_fn, pdf_fn, stats_per_block, + total_stats_limit): if len(motif_descs) != len(stats_fns): - th._error_message_and_exit( + th.error_message_and_exit( 'Must provide exactly one set of motif descriptions for ' + 'each statistics file.') - if VERBOSE: th._status_message('Parsing motifs.') - motif_descs = [parse_motif_descs(stat_motif_descs) + if VERBOSE: th.status_message('Parsing motifs.') + motif_descs = [th.parse_motif_descs(stat_motif_descs) for stat_motif_descs in motif_descs] mod_names = [mod_name for stat_mds in motif_descs for _, mod_name in stat_mds] if len(mod_names) != len(set(mod_names)): - th._error_message_and_exit('Modified base names are not unique.') + th.error_message_and_exit('Modified base names are not unique.') - if VERBOSE: th._status_message('Parsing genome.') + if VERBOSE: th.status_message('Parsing genome.') genome_index = th.Fasta(fasta_fn) - if VERBOSE: th._status_message('Computing accuracy statistics.') - tp_rates, fp_rates, precisions, mod_names_for_r = [], [], [], [] + all_motif_stats = {} + all_motif_stats_for_r = {} for stats_fn, stat_motif_descs in zip(stats_fns, motif_descs): if not os.path.isfile(stats_fn): - th._warning_message('Statistics file does not exist. Skipping: ' + + th.warning_message('Statistics file does not exist. Skipping: ' + stats_fn) continue - stats = ts.TomboStats(stats_fn) - stats.filter_coverage(min_reads) - if stats.is_empty(): - th._warning_message( - 'No locations pass coverage threshold. Skipping: ' + stats_fn) + try: + stats = ts.TomboStats(stats_fn) + except Exception as e: + th.warning_message( + 'Unexpected error parsing ' + stats_fn + '. Continuing ' + + 'without processing this file. \n\tError code:\n\t\t' + + str(e) + '\n') continue - stats.order_by_frac(cov_damp_counts) - - for motif, mod_name in stat_motif_descs: - if (stats.stat_type == ts.ALT_MODEL_TXT and - next(stats.iter_stat_seqs( - genome_index, motif.mod_pos - 1, - motif.motif_len - motif.mod_pos))[ - motif.mod_pos - 1] != motif.mod_base): - th._warning_message( - 'Cannot assess modified base accuracy with alternative ' + - 'model testing to another canonical base. 
Skipping: ' + - mod_name) - continue - mod_tp_rate, mod_fp_rate, mod_precision = ts.get_motif_stats( - motif, stats, genome_index) - # print auc and average precision - auc = np.sum(mod_tp_rate[:-1] * - (mod_fp_rate[1:] - mod_fp_rate[:-1])) - # TODO compute precision recall summary stat - if VERBOSE: sys.stderr.write('\t'.join(( - '', mod_name.ljust(30), 'AUC:', - '{:.4f}'.format(auc))) + '\n') - tp_rates.extend(mod_tp_rate) - fp_rates.extend(mod_fp_rate) - precisions.extend(mod_precision) - mod_names_for_r.extend(repeat(mod_name, len(mod_tp_rate))) - - if VERBOSE: th._status_message('Plotting.') + for mod_name, mod_stats in stats.compute_motif_stats( + stat_motif_descs, genome_index, stats_per_block, + total_stats_limit).items(): + all_motif_stats[mod_name] = mod_stats + stats.close() + + for mod_name, stats in all_motif_stats.items(): + unzip_stats = list(zip(*stats)) + all_motif_stats_for_r[mod_name] = r.DataFrame({ + 'stat':r.FloatVector(unzip_stats[0]), + 'motif_match':r.BoolVector(unzip_stats[1])}) + + # python2 rpy2 ListVector can't take unicode keys + if sys.version_info[0] < 3: + conv_all_motif_stats_for_r = {} + for k, v in all_motif_stats_for_r.items(): + conv_all_motif_stats_for_r[k.encode()] = v + all_motif_stats_for_r = conv_all_motif_stats_for_r + all_motif_stats_for_r = r.ListVector(all_motif_stats_for_r) + + if VERBOSE: th.status_message('Computing accuracy statistics.') + tp_rates, fp_rates, precisions, mod_names_for_r = [], [], [], [] + if VERBOSE: + sys.stderr.write(' {:<30}{:<6} {:<6}\n'.format( + 'Statistic Type', 'AUC', 'mean AP')) + sys.stderr.write(' {:<30}{:<6} {:<6}\n'.format( + '--------------', '---', '-------')) + for mod_name, mod_stats in all_motif_stats.items(): + # extract motif_match (bool) ordered by stat values + ordered_mod_tf = list(zip(*sorted(mod_stats)))[1] + mod_tp_rate, mod_fp_rate, mod_precision = ts.compute_accuracy_rates( + ordered_mod_tf) + auc = ts.compute_auc(mod_tp_rate, mod_fp_rate) + mean_ap = ts.compute_mean_avg_precison(mod_tp_rate, mod_precision) + if VERBOSE: + sys.stderr.write(' {:<30}{:6.4f} {:6.4f}\n'.format( + mod_name, auc, mean_ap)) + tp_rates.extend(mod_tp_rate) + fp_rates.extend(mod_fp_rate) + precisions.extend(mod_precision) + mod_names_for_r.extend(repeat(mod_name, len(mod_tp_rate))) + + if VERBOSE: th.status_message('Plotting.') rocDat = r.DataFrame({ 'TP':r.FloatVector(tp_rates), 'FP':r.FloatVector(fp_rates), @@ -141,74 +147,45 @@ def plot_roc(stats_fns, motif_descs, fasta_fn, min_reads, pdf_fn, return def plot_per_read_roc( - pr_stats_fns, motif_descs, fasta_fn, pdf_fn, - stats_per_block, total_stats_limit): + pr_stats_fns, motif_descs, fasta_fn, pdf_fn, stats_per_block, + total_stats_limit): if len(motif_descs) != len(pr_stats_fns): - th._error_message_and_exit( + th.error_message_and_exit( 'Must provide exactly one set of motif descriptions for ' + 'each statistics file.') - if VERBOSE: th._status_message('Parsing motifs.') - motif_descs = [parse_motif_descs(stat_motif_descs) + if VERBOSE: th.status_message('Parsing motifs.') + motif_descs = [th.parse_motif_descs(stat_motif_descs) for stat_motif_descs in motif_descs] mod_names = [mod_name for stat_mds in motif_descs for _, mod_name in stat_mds] if len(mod_names) != len(set(mod_names)): - th._error_message_and_exit('Modified base names are not unique.') + th.error_message_and_exit('Modified base names are not unique.') - if VERBOSE: th._status_message('Parsing genome.') + if VERBOSE: th.status_message('Parsing genome.') genome_index = th.Fasta(fasta_fn) - if VERBOSE: 
th._status_message('Extracting per-read statistics.') + if VERBOSE: th.status_message('Extracting per-read statistics.') all_motif_stats = {} all_motif_stats_for_r = {} for pr_stats_fn, stat_motif_descs in zip(pr_stats_fns, motif_descs): if not os.path.isfile(pr_stats_fn): - th._warning_message('Statistics file does not exist. Skipping: ' + + th.warning_message('Statistics file does not exist. Skipping: ' + pr_stats_fn) continue - pr_stats = ts.PerReadStats(pr_stats_fn) - for motif, mod_name in stat_motif_descs: - all_motif_stats[mod_name] = [] - before_bases = max((motif.mod_pos for motif, _ in stat_motif_descs)) - 1 - after_bases = max((motif.motif_len - motif.mod_pos - for motif, _ in stat_motif_descs)) - total_num_stats = 0 - for chrm, strand, start, end, block_stats in pr_stats: - if strand == '+': - seq_start = max(start - before_bases, 0) - seq_end = end + after_bases - else: - seq_start = max(start - after_bases, 0) - seq_end = end + before_bases - - reg_seq = genome_index.get_seq(chrm, seq_start, seq_end) - # randomly sub-sample per-read stats here - if block_stats.shape[0] > stats_per_block: - block_stats = block_stats[np.random.choice( - block_stats.shape[0], stats_per_block, replace=False)] - total_num_stats += block_stats.shape[0] - for r_pos_stat in block_stats: - # extract position sequence - if strand == '+': - r_pos_seq = reg_seq[ - r_pos_stat['pos'] - seq_start - before_bases: - r_pos_stat['pos'] - seq_start + after_bases + 1] - else: - r_pos_seq = th.rev_comp(reg_seq[ - r_pos_stat['pos'] - seq_start - after_bases: - r_pos_stat['pos'] - seq_start + before_bases + 1]) - - # add statistic and whether the sequence matches each motif - for motif, mod_name in stat_motif_descs: - if r_pos_seq[before_bases] != motif.mod_base: continue - all_motif_stats[mod_name].append(( - r_pos_stat['stat'], - bool(motif.motif_pat.match( - r_pos_seq[before_bases - motif.mod_pos + 1:])))) - - if total_num_stats >= total_stats_limit: - break + try: + pr_stats = ts.PerReadStats(pr_stats_fn) + except Exception as e: + th.warning_message( + 'Unexpected error parsing ' + pr_stats_fn + '. Continuing ' + + 'without processing this file. 
\n\tError code:\n\t\t' + + str(e) + '\n') + continue + for mod_name, mod_stats in pr_stats.compute_motif_stats( + stat_motif_descs, genome_index, stats_per_block, + total_stats_limit).items(): + all_motif_stats[mod_name] = mod_stats + pr_stats.close() for mod_name, stats in all_motif_stats.items(): unzip_stats = list(zip(*stats)) @@ -224,17 +201,23 @@ def plot_per_read_roc( all_motif_stats_for_r = conv_all_motif_stats_for_r all_motif_stats_for_r = r.ListVector(all_motif_stats_for_r) - if VERBOSE: th._status_message('Computing accuracy statistics.') + if VERBOSE: th.status_message('Computing accuracy statistics.') tp_rates, fp_rates, precisions, mod_names_for_r = [], [], [], [] + if VERBOSE: + sys.stderr.write(' {:<30}{:<6} {:<6}\n'.format( + 'Statistic Type', 'AUC', 'mean AP')) + sys.stderr.write(' {:<30}{:<6} {:<6}\n'.format( + '--------------', '---', '-------')) for mod_name, mod_stats in all_motif_stats.items(): + # extract motif_match (bool) ordered by stat values ordered_mod_tf = list(zip(*sorted(mod_stats)))[1] mod_tp_rate, mod_fp_rate, mod_precision = ts.compute_accuracy_rates( ordered_mod_tf) - auc = np.sum(mod_tp_rate[:-1] * (mod_fp_rate[1:] - mod_fp_rate[:-1])) - # TODO compute precision recall summary stat - if VERBOSE: sys.stderr.write('\t'.join(( - '', mod_name.ljust(30), 'AUC:', - '{:.4f}'.format(auc))) + '\n') + auc = ts.compute_auc(mod_tp_rate, mod_fp_rate) + mean_ap = ts.compute_mean_avg_precison(mod_tp_rate, mod_precision) + if VERBOSE: + sys.stderr.write(' {:<30}{:6.4f} {:6.4f}\n'.format( + mod_name, auc, mean_ap)) tp_rates.extend(mod_tp_rate) fp_rates.extend(mod_fp_rate) precisions.extend(mod_precision) @@ -246,7 +229,7 @@ def plot_per_read_roc( 'Precision':r.FloatVector(precisions), 'Comparison':r.StrVector(mod_names_for_r)}) - if VERBOSE: th._status_message('Plotting.') + if VERBOSE: th.status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotROCPerRead.R').decode()) r.r('pdf("' + pdf_fn + '", height=4, width=6)') r.globalenv[str('plotROCPerRead')](rocDat, all_motif_stats_for_r) @@ -260,20 +243,19 @@ def plot_per_read_roc( ################################### def plot_kmer_dist( - f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, - read_mean, upstrm_bases, dnstrm_bases, kmer_thresh, num_reads, - r_struct_fn, dont_plot): + fast5s_dirs, corr_grp, bc_subgrps, pdf_fn, read_mean, + upstrm_bases, dnstrm_bases, kmer_thresh, num_reads, r_struct_fn, + dont_plot): kmer_width = upstrm_bases + dnstrm_bases + 1 reads_added = 0 all_kmers = defaultdict(list) - raw_read_coverage = th.parse_fast5s( - f5_dirs1, corrected_group, basecall_subgroups) - if VERBOSE: th._status_message('Extracting read levels.') - files = [r_data for cs_r_data in raw_read_coverage.values() - for r_data in cs_r_data] - np.random.shuffle(files) - for r_data in files: + reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) + + if VERBOSE: th.status_message('Extracting read levels.') + all_reads = list(reads_index.iter_reads()) + np.random.shuffle(all_reads) + for r_data in all_reads: r_means, r_seq = th.get_multiple_slots_read_centric( r_data, ['norm_mean', 'base']) if r_means is None: continue @@ -304,17 +286,18 @@ def plot_kmer_dist( break if reads_added in (0,1): - th._error_message_and_exit( + th.error_message_and_exit( 'No valid reads present.\n\t\tCheck that [--corrected-group] ' + - 'matches value used in resquiggle.\n\t\tAlso consider lowering ' + - '[--num-kmer-threshold] especially for k-mer lengths greater than 4.') + 'matches value used in resquiggle.\n\t\tAlso consider 
' + + 'lowering [--num-kmer-threshold] especially for k-mer lengths ' + + 'greater than 4.') if reads_added < num_reads: - th._warning_message( + th.warning_message( 'Fewer valid reads present than requested.\n\tConsider ' + 'lowering [--num-kmer-threshold] especially for k-mer lengths ' + 'greater than 4.') - if VERBOSE: th._status_message('Preparing plot data.') + if VERBOSE: th.status_message('Preparing plot data.') kmer_levels = [kmer for means, kmer in sorted([ (np.mean(list(map(itemgetter(0), means))), kmer) for kmer, means in all_kmers.items()])] @@ -345,7 +328,7 @@ def plot_kmer_dist( i - upstrm_bases for kmer in kmer_levels for i in range(kmer_width)])}) except: - th._warning_message( + th.warning_message( 'Install R package `gridExtra` for ' + 'visual kmer display. Using text kmer display.') baseDat = r.NA_Character @@ -356,7 +339,7 @@ def plot_kmer_dist( r_struct_fn = r.StrVector([r_struct_fn,]) dont_plot_r = r.BoolVector([dont_plot,]) - if VERBOSE: th._status_message('Plotting.') + if VERBOSE: th.status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotKmerDist.R').decode()) if not dont_plot: r.r('pdf("' + pdf_fn + '", height=7, width=10)') if read_mean: @@ -371,6 +354,82 @@ def plot_kmer_dist( +########################## +#### Plot Single Read #### +########################## + +def plot_single_read( + read_data=None, fast5_fn=None, norm_type='median', scale_values=None, + add_vlines=False, png_fn='single_read_raw_signal.png', + corr_grp='RawGenomeCorrected_000', rna=False, manual_vlines=None, + num_obs=None, highlight_pos=None, highlight_ranges=None, + second_track_data=None): + vlineDat = r.DataFrame({'Position':r.IntVector([]),}) + if read_data is not None: + fast5_fn = read_data.fn + corr_grp = read_data.corr_group + rna = read_data.rna + + import h5py + with h5py.File(fast5_fn, 'r') as fast5_data: + raw_signal = th.get_raw_read_slot(fast5_data)['Signal'][:] + if num_obs is not None: + raw_signal = raw_signal[:num_obs] + if add_vlines: + corr_subgrp = fast5_data['/Analyses/' + corr_grp] + event_starts = corr_subgrp['Events']['start'] + events_end = event_starts[-1] + corr_subgrp['Events']['length'][-1] + raw_start = corr_subgrp['Events'].attrs.get('read_start_rel_to_raw') + raw_end = raw_start + events_end + if rna: + tmp_raw_end = raw_signal.shape[0] - raw_start + raw_start = raw_signal.shape[0] - raw_end + raw_end = tmp_raw_end + vlineDat = r.DataFrame({'Position':r.IntVector([ + raw_start, raw_end]),}) + elif manual_vlines is not None: + vlineDat = r.DataFrame({'Position':r.IntVector(manual_vlines),}) + + norm_signal, _ = ts.normalize_raw_signal( + raw_signal, norm_type=norm_type, scale_values=scale_values) + sigDat = r.DataFrame({ + 'Position':r.IntVector(range(norm_signal.shape[0])), + 'Signal':r.FloatVector(norm_signal)}) + + hDat = r.r('NULL') + if highlight_pos is not None: + if num_obs is not None: + highlight_pos = highlight_pos[highlight_pos < num_obs] + hDat = r.DataFrame({ + 'Position':r.IntVector(highlight_pos), + 'Signal':r.FloatVector(norm_signal[highlight_pos])}) + hrDat = r.r('NULL') + if highlight_ranges is not None: + if num_obs is not None: + highlight_ranges = highlight_ranges[highlight_ranges[:,1] < num_obs,] + hrDat = r.DataFrame({ + 'Position':r.IntVector(highlight_ranges[:,0]), + 'PositionEnd':r.IntVector(highlight_ranges[:,1])}) + + stDat = r.r('NULL') + if second_track_data is not None: + importr(str('cowplot')) + if num_obs is not None: + second_track_data = second_track_data[:num_obs] + stDat = r.DataFrame({ + 
'Position':r.IntVector(range(len(second_track_data))), + 'Value':r.FloatVector(second_track_data)}) + + if VERBOSE: th.status_message('Plotting.') + r.r(resource_string(__name__, 'R_scripts/plotSingleRead.R').decode()) + r.r('png("' + png_fn + '", width=3000, height=1400)') + r.globalenv[str('plotSingleRead')](sigDat, vlineDat, hDat, hrDat, stDat) + r.r('dev.off()') + + return + + + ######################################## #### General data parsing functions #### ######################################## @@ -397,7 +456,7 @@ def get_read_correction_data( elif reg_type == 'random': reg_start = np.random.randint(0, events_end - num_obs) else: - raise NotImplementedError( + raise th.TomboError( 'Invalid reg_type (int or str) to extract read correction data') norm_reg_signal, _ = ts.normalize_raw_signal( @@ -516,52 +575,6 @@ def get_read_correction_data( return old_dat, new_dat, sig_dat, diff_dat -def get_read_reg_events(r_data, int_start, int_end): - r_means = th.get_single_slot_genome_centric(r_data, 'norm_mean') - if r_means is None: return None - if r_data.start > int_start and r_data.end < int_end: - # handle reads that are contained in a region - start_overlap = int_end - r_data.start - end_overlap = r_data.end - int_start - # create region with nan values - region_means = np.empty(int_end - int_start) - region_means[:] = np.NAN - region_means[-start_overlap:end_overlap] = r_means[ - -end_overlap:start_overlap] - elif r_data.start > int_start: - # handle reads that start in middle of region - start_overlap = int_end - r_data.start - # create region with nan values - region_means = np.empty(int_end - int_start) - region_means[:] = np.NAN - region_means[-start_overlap:] = r_means[:start_overlap] - elif r_data.end < int_end: - # handle reads that end inside region - end_overlap = r_data.end - int_start - # create region with nan values - region_means = np.empty(int_end - int_start) - region_means[:] = np.NAN - region_means[:end_overlap] = r_means[-end_overlap:] - else: - region_means = r_means[ - int_start - r_data.start:int_end - r_data.start] - - return region_means - -def get_reg_events(reg_reads, int_start, int_end, strand, - read_rows=False, num_reads=None): - reg_events = [ - get_read_reg_events(r_data, int_start, int_end) - for r_data in reg_reads if strand is None or r_data.strand == strand] - reg_events = [r_means for r_means in reg_events - if r_means is not None] - if num_reads is not None: - reg_events = reg_events[:num_reads] - - if read_rows: - return np.row_stack(reg_events) - return np.column_stack(reg_events) - def get_r_event_data( all_reg_data, plot_types, overplot_thresh, group_num='Group1'): Position, Signal, Strand, Region = [], [], [], [] @@ -572,8 +585,7 @@ def get_r_event_data( if sum(r_data.strand == strand for r_data in reg_data.reads) == 0: continue - reg_events = get_reg_events( - reg_data.reads, reg_data.start, reg_data.end, strand) + reg_events = reg_data.get_base_levels() for pos, base_read_means in enumerate(reg_events): # skip bases with zero or 1 read as ggplot won't # be able to estimate the density @@ -611,8 +623,7 @@ def get_r_boxplot_data( if sum(r_data.strand == strand for r_data in reg_data.reads) == 0: continue - reg_events = get_reg_events( - reg_data.reads, reg_data.start, reg_data.end, strand) + reg_events = reg_data.get_base_levels() for pos, base_read_means in enumerate(reg_events): # skip regions with no coverage if sum(~np.isnan(base_read_means)) == 0: @@ -655,8 +666,7 @@ def get_r_quant_data( if sum(r_data.strand == strand for r_data in 
reg_data.reads) == 0:
             continue
 
-        reg_events = get_reg_events(
-            reg_data.reads, reg_data.start, reg_data.end, strand)
+        reg_events = reg_data.get_base_levels()
         for pos, base_read_means in enumerate(reg_events):
             # skip regions with no coverage
             if sum(~np.isnan(base_read_means)) == 0:
@@ -687,7 +697,8 @@
     'Group':r.StrVector(list(repeat(group_num, len(Position))))})
 
 def get_r_raw_signal_data(
-        all_reg_data, plot_types, overplot_thresh, group_num='Group1'):
+        all_reg_data, plot_types, overplot_thresh, group_num='Group1',
+        genome_centric=True):
     not_warned = True
     Position, Signal, Read, Strand, Region = [], [], [], [], []
     for reg_plot_sig, reg_data in zip(plot_types, all_reg_data):
@@ -709,20 +720,30 @@
         reg_reads = plus_reads + minus_reads
         for r_num, r_data in enumerate(reg_reads):
             try:
+                # only extract the signal that overlaps this region
                 (r_sig, overlap_seg_data, start_offset,
                  scale_vals) = th.get_raw_signal(
-                     r_data, reg_data.start, reg_data.end)
+                      r_data, reg_data.start, reg_data.end)
                 r_sig, _ = ts.normalize_raw_signal(
                     r_sig, 0, r_sig.shape[0], scale_values=scale_vals)
             except:
                 if not_warned:
                     not_warned = False
-                    th._warning_message(
+                    th.warning_message(
                         'Genome resolved raw signal could not be retrieved ' +
                         'for some reads. Ensure that reads have been ' +
                         're-squiggled and that all data slots correspond ' +
                         'accordingly.')
                 continue
+
+            if not genome_centric and r_data.strand == "-":
+                r_sig = r_sig[::-1]
+                overlap_seg_data = (
+                    overlap_seg_data[::-1] * -1) + overlap_seg_data[-1]
+                if len(overlap_seg_data) < reg_data.end - reg_data.start:
+                    start_offset = reg_data.end - reg_data.start - len(
+                        overlap_seg_data) - start_offset
+
             for base_i, (b_start, b_end) in enumerate(zip(
                     overlap_seg_data[:-1], overlap_seg_data[1:])):
                 Position.extend(
@@ -755,7 +776,8 @@
     return SignalData, QuantData, BoxData, EventData
 
-def get_base_r_data(all_reg_data, zero_start=False, is_rna=False):
+def get_base_r_data(
+        all_reg_data, zero_start=False, is_rna=False, genome_centric=True):
     BaseStart, Bases, BaseRegion, BaseStrand = [], [], [], []
     for reg_data in all_reg_data:
         # skip regions without sequence data
@@ -777,10 +799,18 @@
             base = base.translate(th.COMP_BASES)
             if is_rna and base == 'T':
                 base = 'U'
-            if zero_start:
-                BaseStart.append(unicode(i))
+            if genome_centric:
+                if zero_start:
+                    BaseStart.append(unicode(i))
+                else:
+                    BaseStart.append(unicode(i + reg_data.start))
             else:
-                BaseStart.append(unicode(i + reg_data.start))
+                if zero_start:
+                    BaseStart.append(unicode(
+                        reg_data.end - reg_data.start - i - 1))
+                else:
+                    BaseStart.append(unicode(
+                        reg_data.end - i - 1))
             Bases.append(base)
             BaseRegion.append(reg_data.reg_id)
             BaseStrand.append(REV_STRAND)
@@ -794,7 +824,7 @@
         ordered=True, levels=r.StrVector((FWD_STRAND, REV_STRAND)))})
 
-def get_model_r_data(all_reg_model_data):
+def get_model_r_data(all_reg_model_data, genome_centric=True):
     Position, Strand, Mean, SD, Region = [], [], [], [], []
     for reg_id, strand, fwd_model_data, rev_model_data in all_reg_model_data:
         if strand == '+' or strand is None:
@@ -805,12 +835,21 @@
             SD.append(base_model_sd)
             Region.append(reg_id)
         if strand == '-' or strand is None:
-            for pos, base_model_mean, base_model_sd in rev_model_data:
-                Position.append(pos)
-                Strand.append(REV_STRAND)
-
Mean.append(base_model_mean) - SD.append(base_model_sd) - Region.append(reg_id) + if genome_centric: + for pos, base_model_mean, base_model_sd in rev_model_data: + Position.append(pos) + Strand.append(REV_STRAND) + Mean.append(base_model_mean) + SD.append(base_model_sd) + Region.append(reg_id) + else: + start_pos, end_pos = rev_model_data[0][0], rev_model_data[-1][0] + for pos, base_model_mean, base_model_sd in rev_model_data: + Position.append(start_pos + end_pos - pos) + Strand.append(REV_STRAND) + Mean.append(base_model_mean) + SD.append(base_model_sd) + Region.append(reg_id) return r.DataFrame({ 'Position':r.FloatVector(Position), @@ -821,14 +860,17 @@ def get_model_r_data(all_reg_model_data): 'SD':r.FloatVector(SD), 'Region':r.StrVector(Region)}) -def get_reg_r_stats(all_reg_stats, are_pvals=True): +def get_reg_r_stats(all_reg_stats, are_pvals=False): Stats, Position, Read, Region = [], [], [], [] OrdRead, OrdRegion = [], [] - for reg_id, reg_stats in all_reg_stats: - if are_pvals: - reg_stats = -np.log10(reg_stats) + for reg_id, reg_strand, reg_stats in all_reg_stats: + # -log if p-values and trim/winsorize stats before clustering here + reg_stats = ts.transform_and_trim_stats( + reg_stats, are_pvals, PLOT_PVAL_MAX if are_pvals else PLOT_LLR_MAX) OrdRead.extend(ts.order_reads(reg_stats)) OrdRegion.extend(repeat(reg_id, reg_stats.shape[0])) + if reg_strand == '-': + reg_stats = reg_stats[:,::-1] for read_i, read_stats in enumerate(reg_stats): for pos, pos_stat in enumerate(read_stats): Stats.append(pos_stat) @@ -853,17 +895,14 @@ def get_reg_r_stats(all_reg_stats, are_pvals=True): ########################################################################### def plot_corrections( - f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, - reg_type, num_obs, num_reads): - th._warning_message('The plot_correction command may be deprecated in ' + + fast5s_dirs, corr_grp, bc_subgrps, pdf_fn, reg_type, num_obs, + num_reads): + th.warning_message('The plot_correction command may be deprecated in ' + 'future versions of Tombo.') - if VERBOSE: th._status_message('Preparing plot data.') + if VERBOSE: th.status_message('Preparing plot data.') OldSegDat, NewSegDat, SigDat, DiffDat = [], [], [], [] - raw_read_coverage = th.parse_fast5s( - f5_dirs1, corrected_group, basecall_subgroups) - files = [r_data for cs_r_data in raw_read_coverage.values() - for r_data in cs_r_data] - for r_data in files: + reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) + for r_data in reads_index.iter_reads(): old_dat, new_dat, signal_dat, diff_dat = get_read_correction_data( r_data, reg_type, num_obs) if old_dat is None: @@ -877,22 +916,22 @@ def plot_corrections( if len(OldSegDat) >= num_reads: break if len(OldSegDat) == 0: - th._error_message_and_exit( + th.error_message_and_exit( 'No reads were able to be processed. This command is ' + 'only applicable to reads processed with event_resquiggle. ' + 'Also check that --corrected-group and --basecall-subgroup ' + 'match the event_resquiggle command.') if VERBOSE and len(OldSegDat) < num_reads: - th._warning_message( + th.warning_message( 'Fewer reads than requested were able to ' + 'be processed. 
Likely too few reads provided or ' + 'those provided were not corrected.') - OldSegDat = r.DataFrame.rbind(*OldSegDat) - NewSegDat = r.DataFrame.rbind(*NewSegDat) - SigDat = r.DataFrame.rbind(*SigDat) - DiffDat = r.DataFrame.rbind(*DiffDat) + OldSegDat = _R_DF.rbind(*OldSegDat) + NewSegDat = _R_DF.rbind(*NewSegDat) + SigDat = _R_DF.rbind(*SigDat) + DiffDat = _R_DF.rbind(*DiffDat) - if VERBOSE: th._status_message('Plotting.') + if VERBOSE: th.status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotReadCorr.R').decode()) r.r('pdf("' + pdf_fn + '", height=7, width=11)') r.globalenv[str('plotReadCorr')](OldSegDat, NewSegDat, SigDat, DiffDat) @@ -901,19 +940,17 @@ def plot_corrections( return def plot_multi_corrections( - f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, - num_reads_per_plot, num_regions, num_obs, include_orig_bcs, genome_locs): - th._warning_message('The plot_multi_correction command may be deprecated ' + + fast5s_dirs, corr_grp, bc_subgrps, pdf_fn, num_reads_per_plot, + num_regions, num_obs, include_orig_bcs, genome_locs): + th.warning_message('The plot_multi_correction command may be deprecated ' + 'in future versions of Tombo.') num_regions = num_regions if num_regions % 2 == 0 else \ num_regions + 1 - raw_read_coverage = th.parse_fast5s( - f5_dirs1, corrected_group, basecall_subgroups) - read_coverage = th.get_coverage(raw_read_coverage) + reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) if genome_locs is None: coverage_regions = [] - for (chrm, strand), cs_coverage in read_coverage.items(): + for (chrm, strand), cs_coverage in reads_index.iter_cs_coverage(): reg_covs, reg_lens = zip(*[ (x, len(list(y))) for x, y in groupby(cs_coverage)]) coverage_regions.extend(zip( @@ -931,40 +968,38 @@ def plot_multi_corrections( ['{:03d}'.format(rn) for rn in range(num_regions)], coverage_regions[:num_regions])) if len(plot_locs) < num_regions: - th._warning_message( + th.warning_message( 'Fewer regions contain minimum ' + 'number of reads than requested.') else: - if VERBOSE: th._status_message('Parsing genome locations.') + if VERBOSE: th.status_message('Parsing genome locations.') parsed_locs = th.parse_genome_locations(genome_locs, default_strand='+') plot_locs = [ - ('{:03d}'.format(i), (chrm, int(pos) - 1, strand)) + ('{:03d}'.format(i), (chrm, pos, strand)) for i, (chrm, pos, strand) in enumerate(parsed_locs)] # filter regions with no coverage plot_locs = [ (reg_i, (chrm, start, strand)) for (reg_i, (chrm, start, strand)) in plot_locs - if (chrm, strand) in read_coverage and - read_coverage[(chrm, strand)][start] > 0] + if reads_index.get_coverage(chrm, start, strand) > 0] if len(plot_locs) < len(parsed_locs): - th._warning_message( + th.warning_message( 'Some regions did not contain read coverage.') if len(plot_locs) == 0: - th._error_message_and_exit( + th.error_message_and_exit( 'No regions contain minimum number of reads.') - if VERBOSE: th._status_message('Preparing plot data.') + if VERBOSE: th.status_message('Preparing plot data.') OldSegDat, NewSegDat, SigDat = [], [], [] for reg_i, (chrm, reg_center, strand) in plot_locs: reg_num_reads = 0 - ## get num_reads_per_region reads from this region - reg_reads = [ - r_data for r_data in raw_read_coverage[(chrm, strand)] - if r_data.start <= reg_center - (num_obs / 2.0) and - r_data.end > reg_center + (num_obs / 2.0) and - r_data.strand == strand] - for r_data in reg_reads: + ## get num_reads_per_plot reads from this region + for r_data in ( + rd for rd in reads_index.get_cs_reads(chrm, 
strand) + if rd.start <= reg_center - (num_obs / 2.0) and + rd.end > reg_center + (num_obs / 2.0) and + rd.strand == strand): try: old_dat, new_dat, signal_dat, diff_dat \ = get_read_correction_data( @@ -985,16 +1020,17 @@ def plot_multi_corrections( if reg_num_reads >= num_reads_per_plot: break if reg_num_reads < num_reads_per_plot: - # TODO: figure out if we should warn here + th.warning_message( + 'Fewer reads found than requested for this region.') pass try: - OldSegDat = r.DataFrame.rbind(*OldSegDat) + OldSegDat = _R_DF.rbind(*OldSegDat) except: OldSegDat = None - NewSegDat = r.DataFrame.rbind(*NewSegDat) - SigDat = r.DataFrame.rbind(*SigDat) + NewSegDat = _R_DF.rbind(*NewSegDat) + SigDat = _R_DF.rbind(*SigDat) - if VERBOSE: th._status_message('Plotting.') + if VERBOSE: th.status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotMultiReadCorr.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=11)') if include_orig_bcs and OldSegDat is not None: @@ -1010,8 +1046,9 @@ def plot_multi_corrections( #### Base plotting linker functions #### ######################################## -def get_plots_titles(all_reg_data, all_reg_data2, overplot_type, - overplot_thresh, model_plot=False): +def get_plots_titles( + all_reg_data, all_reg_data2, overplot_type, overplot_thresh, + model_plot=False, include_chrm=True, include_cov=True): strand_cov = [] for reg_i in range(len(all_reg_data)): reg_cov1 = [ @@ -1039,40 +1076,49 @@ def get_plots_titles(all_reg_data, all_reg_data2, overplot_type, titles = [] for int_i, r_cov, r_ovp in zip(all_reg_data, strand_cov, dnspl_stars): + reg_title = int_i.chrm if include_chrm else '' if all_reg_data2 is None: if int_i.strand is None: - reg_title = int_i.chrm + ' ' + int_i.reg_text + \ - " ::: Coverage: " + unicode(r_cov[0]) + r_ovp[0] + \ - " + " + unicode(r_cov[1]) + r_ovp[1] + " -" + reg_title += ' ' + int_i.reg_text + if include_cov: + reg_title += ( + " Coverage: " + unicode(r_cov[0]) + r_ovp[0] + " + " + + unicode(r_cov[1]) + r_ovp[1] + " -") else: - cov_str = unicode(r_cov[0]) + r_ovp[0] if int_i.strand == '+' \ - else unicode(r_cov[1]) + r_ovp[1] - reg_title = int_i.chrm + ( - ":" + int_i.strand if int_i.strand else '') + \ - ' ' + int_i.reg_text + " ::: Coverage: " + cov_str + if include_chrm: + reg_title += ':' + int_i.strand + reg_title += ' ' + int_i.reg_text + if include_cov: + cov_str = ( + unicode(r_cov[0]) + r_ovp[0] if int_i.strand == '+' + else unicode(r_cov[1]) + r_ovp[1]) + reg_title += " Coverage: " + cov_str if model_plot and overplot_type in ( 'Density', 'Quantile', 'Boxplot'): - reg_title += ' (Model in Black)' - titles.append(reg_title) + reg_title += ' (Model in Black)' else: if int_i.strand is None: - titles.append( - int_i.chrm + ' ' + int_i.reg_text + - " ::: Coverage: Sample (Red): " + - unicode(r_cov[0]) + r_ovp[0] + " + " + - unicode(r_cov[1]) + r_ovp[1] + " -; Control (Black): " + - unicode(r_cov[2]) + r_ovp[2] + " + " + - unicode(r_cov[3]) + r_ovp[3] + " -") + reg_title += ' ' + int_i.reg_text + if include_cov: + reg_title += ( + " Coverage: Sample (Red): " + + unicode(r_cov[0]) + r_ovp[0] + " + " + + unicode(r_cov[1]) + r_ovp[1] + " -; Control (Black): " + + unicode(r_cov[2]) + r_ovp[2] + " + " + + unicode(r_cov[3]) + r_ovp[3] + " -") else: - cov_str = ( - 'Sample (Red): ' + unicode(r_cov[0]) + r_ovp[0] + - '; Control (Black): ' + unicode(r_cov[2]) + r_ovp[2] - ) if int_i.strand == '+' else ( - 'Sample (Red): ' + unicode(r_cov[1]) + r_ovp[1] + - '; Control (Black): ' + unicode(r_cov[3]) + r_ovp[3]) - 
titles.append( - int_i.chrm + ":" + int_i.strand + ' ' + int_i.reg_text + - " ::: Coverage: " + cov_str) + if include_chrm: + reg_title += ":" + int_i.strand + reg_title += ' ' + int_i.reg_text + if include_cov: + cov_str = ( + 'Sample (Red): ' + unicode(r_cov[0]) + r_ovp[0] + + '; Control (Black): ' + unicode(r_cov[2]) + r_ovp[2] + ) if int_i.strand == '+' else ( + 'Sample (Red): ' + unicode(r_cov[1]) + r_ovp[1] + + '; Control (Black): ' + unicode(r_cov[3]) + r_ovp[3]) + reg_title += " Coverage: " + cov_str + titles.append(reg_title) Titles = r.DataFrame({ 'Title':r.StrVector(titles), @@ -1081,24 +1127,23 @@ def get_plots_titles(all_reg_data, all_reg_data2, overplot_type, return Titles, plot_types def plot_single_sample( - plot_intervals, raw_read_coverage, overplot_thresh, - overplot_type, pdf_fn): - if VERBOSE: th._status_message('Preparing plot data.') - all_reg_data = th.get_region_reads(plot_intervals, raw_read_coverage) - if len(all_reg_data) == 0: - th._error_message_and_exit('No reads in any selected regions.') - if len(all_reg_data) < len(plot_intervals): - th._warning_message('Some selected regions contain no reads.') - rna = th.is_rna(raw_read_coverage) + plot_intervals, reads_index, overplot_thresh, overplot_type, pdf_fn, + title_include_chrm=True, title_include_cov=True): + if VERBOSE: th.status_message('Preparing plot data.') + for p_int in plot_intervals: + p_int.add_reads(reads_index).add_seq() + plot_intervals = th.filter_empty_regions(plot_intervals) + rna = th.is_sample_rna(reads_index=reads_index) Titles, plot_types = get_plots_titles( - all_reg_data, None, overplot_type, overplot_thresh) + plot_intervals, None, overplot_type, overplot_thresh, + include_chrm=title_include_chrm, include_cov=title_include_cov) - BasesData = get_base_r_data(all_reg_data, is_rna=rna) + BasesData = get_base_r_data(plot_intervals, is_rna=rna) SignalData, QuantData, BoxData, EventData = get_plot_types_data( - (all_reg_data, plot_types, overplot_thresh, 'Group1')) + (plot_intervals, plot_types, overplot_thresh, 'Group1')) - if VERBOSE: th._status_message('Plotting.') + if VERBOSE: th.status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotSingleRun.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=11)') r.globalenv[str('plotSingleRun')](SignalData, QuantData, BoxData, @@ -1107,12 +1152,12 @@ def plot_single_sample( return -def filter_and_merge_group_regs(g1_data, g2_data): +def filter_and_merge_regs(g1_data, g2_data): filt_g1, filt_g2, merged_reg_data, both_no_cov = [], [], [], [] for r1, r2 in zip(g1_data, g2_data): - both_reads = r1.reads + r2.reads - if len(both_reads) > 0: - merged_reg_data.append(r1._replace(reads=both_reads)) + merged_reg = r1.merge(r2) + if len(merged_reg.reads) > 0: + merged_reg_data.append(merged_reg.add_seq()) filt_g1.append(r1) filt_g2.append(r2) else: @@ -1121,34 +1166,33 @@ def filter_and_merge_group_regs(g1_data, g2_data): r1.strand)))) if len(both_no_cov) > 0 and VERBOSE: - th._warning_message( + th.warning_message( 'Some regions include no reads: ' + '\t'.join(both_no_cov)) if len(merged_reg_data) == 0: - th._error_message_and_exit('No reads in any selected regions.') + th.error_message_and_exit('No reads in any selected regions.') return merged_reg_data, filt_g1, filt_g2 def plot_two_samples( - plot_intervals, raw_read_coverage1, raw_read_coverage2, - overplot_thresh, overplot_type, pdf_fn, seqs_fn=None): - if VERBOSE: th._status_message('Preparing plot data.') + plot_intervals, reads_index, ctrl_reads_index, overplot_thresh, + 
overplot_type, pdf_fn, seqs_fn=None, title_include_chrm=True,
+        title_include_cov=True):
+    if VERBOSE: th.status_message('Preparing plot data.')
     # get reads overlapping each region
-    all_reg_data1 = th.get_region_reads(
-        plot_intervals, raw_read_coverage1, filter_no_cov=False, add_seq=False)
-    all_reg_data2 = th.get_region_reads(
-        plot_intervals, raw_read_coverage2, filter_no_cov=False, add_seq=False)
+    all_reg_data1 = [p_int.copy().add_reads(reads_index)
+                     for p_int in plot_intervals]
+    all_reg_data2 = [p_int.copy().add_reads(ctrl_reads_index)
+                     for p_int in plot_intervals]
     # filter regions with no coverage in either read group
-    merged_reg_data, all_reg_data1, all_reg_data2 = filter_and_merge_group_regs(
+    merged_reg_data, all_reg_data1, all_reg_data2 = filter_and_merge_regs(
         all_reg_data1, all_reg_data2)
-    if len(merged_reg_data) < len(plot_intervals):
-        th._warning_message('Some selected regions contain no reads.')
     Titles, plot_types = get_plots_titles(
-        all_reg_data1, all_reg_data2, overplot_type, overplot_thresh)
+        all_reg_data1, all_reg_data2, overplot_type, overplot_thresh,
+        include_chrm=title_include_chrm, include_cov=title_include_cov)
-    merged_reg_data = th.add_reg_seq(merged_reg_data)
     BasesData = get_base_r_data(merged_reg_data)
     # get plotting data for either quantiles of raw signal
@@ -1157,19 +1201,19 @@
     SignalData2, QuantData2, BoxData2, EventData2 = get_plot_types_data(
         (all_reg_data2, plot_types, overplot_thresh, 'Group2'), 0.5)
-    if VERBOSE: th._status_message('Plotting.')
+    if VERBOSE: th.status_message('Plotting.')
     r.r(resource_string(__name__, 'R_scripts/plotGroupComp.R').decode())
     r.r('pdf("' + pdf_fn + '", height=5, width=11)')
     r.globalenv[str('plotGroupComp')](
-        r.DataFrame.rbind(SignalData1, SignalData2),
-        r.DataFrame.rbind(QuantData1, QuantData2),
-        r.DataFrame.rbind(BoxData1, BoxData2),
-        r.DataFrame.rbind(EventData1, EventData2),
+        _R_DF.rbind(SignalData1, SignalData2),
+        _R_DF.rbind(QuantData1, QuantData2),
+        _R_DF.rbind(BoxData1, BoxData2),
+        _R_DF.rbind(EventData1, EventData2),
         BasesData, Titles, 0.4)
     r.r('dev.off()')
     if seqs_fn is not None:
-        if VERBOSE: th._status_message('Outputting region seqeuences.')
+        if VERBOSE: th.status_message('Outputting region sequences.')
         with io.open(seqs_fn, 'wt') as seqs_fp:
             for int_i in merged_reg_data:
                 # get the interval from the base data struct
@@ -1181,48 +1225,42 @@
     return
-def get_reg_kmers(tb_model_fn, plot_intervals, raw_read_coverage,
-                  min_reg_overlap=None, alt_model_fn=None):
+def get_reg_kmers(
+        std_ref, plot_intervals, reads_index,
+        min_reg_overlap=None, alt_ref=None):
     def get_reg_reads(reads, int_start, int_end):
         """ Filter reads obtained from expanded interval
         """
         return [r_data for r_data in reads
                 if not (r_data.start >= int_end or r_data.end <= int_start)]
-    std_ref = ts.TomboModel(tb_model_fn)
     # compute kmer values to make strand specific calculations easier
     dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1
     expand_width = max(std_ref.central_pos, dnstrm_bases)
     filt_width = expand_width if min_reg_overlap is None else \
         expand_width + min_reg_overlap
-    if alt_model_fn is not None:
-        alt_ref = ts.TomboModel(alt_model_fn)
+    if alt_ref is not None:
         if (alt_ref.central_pos != std_ref.central_pos or
             alt_ref.kmer_width != alt_ref.kmer_width):
-            th._error_message_and_exit(
+            th.error_message_and_exit(
                 'Standard model not based on the same kmer position ' +
                 'as alternative model.')
-    # expand regions to get kmers at first and last positions
-
expanded_intervals = [p_int._replace(start=p_int.start - expand_width, - end=p_int.end + expand_width) - for p_int in plot_intervals] - # get reads and region sequence - expanded_intervals = th.get_region_reads( - expanded_intervals, raw_read_coverage) - expand_seqs = [int_i.seq for int_i in expanded_intervals] - rev_expand_seqs = [th.rev_comp(int_i.seq) for int_i in expanded_intervals] + # expand regions to get kmers at first and last positions, + # add reads and region sequence + for p_int in plot_intervals: + p_int.expand_interval(expand_width).add_reads(reads_index).add_seq() + expand_seqs = [p_int.seq for p_int in plot_intervals] + rev_expand_seqs = [th.rev_comp(p_int.seq) for p_int in plot_intervals] # convert back to original plot_intervals with seq from exanded intervals - all_reg_data = [ - int_i._replace(start=int_i.start + expand_width, - end=int_i.end - expand_width, - reads=get_reg_reads(int_i.reads, int_i.start + filt_width, - int_i.end - filt_width), - seq=int_i.seq[expand_width:-expand_width]) - for int_i in expanded_intervals] + for p_int in plot_intervals: + p_int.expand_interval(-expand_width) + p_int.update(reads=get_reg_reads(p_int.reads, p_int.start + filt_width, + p_int.end - filt_width), + seq=p_int.seq[expand_width:-expand_width]) all_reg_model_data, all_reg_alt_model_data = [], [] for reg_data, reg_seq, rev_seq in zip( - all_reg_data, expand_seqs, rev_expand_seqs): + plot_intervals, expand_seqs, rev_expand_seqs): clipped_reg_seq = reg_seq clipped_rev_seq = rev_seq if std_ref.central_pos > dnstrm_bases: @@ -1246,7 +1284,7 @@ def get_reg_reads(reads, int_start, int_end): std_ref.sds[kmer]) for pos, kmer in enumerate(rev_kmers) if not th.invalid_seq(kmer)])) # if alternative model is supplied add info - if alt_model_fn is not None: + if alt_ref is not None: all_reg_alt_model_data.append(( reg_data.reg_id, reg_data.strand, [(reg_data.start + pos, alt_ref.means[kmer], @@ -1258,54 +1296,54 @@ def get_reg_reads(reads, int_start, int_end): for pos, kmer in enumerate(rev_kmers) if not th.invalid_seq(kmer)])) - return all_reg_data, all_reg_model_data, all_reg_alt_model_data + return all_reg_model_data, all_reg_alt_model_data def plot_motif_centered_with_stats( - raw_read_coverage1, raw_read_coverage2, plot_intervals, - stat_locs, overplot_thresh, pdf_fn, tb_model_fn, alt_model_fn=None): - if VERBOSE: th._status_message('Preparing plot data.') + reads_index, ctrl_reads_index, plot_intervals, stat_locs, + overplot_thresh, pdf_fn, std_ref, alt_ref=None): + if VERBOSE: th.status_message('Preparing plot data.') + # note all genome_centric options so that plots are all read direction-centric ModelData = r.r('NULL') - if raw_read_coverage2 is None: - if tb_model_fn is None: - merged_reg_data = th.get_region_reads( - plot_intervals, raw_read_coverage1, filter_no_cov=False) - plot_types = ['Downsample' for _ in merged_reg_data] - SignalData, _, _, _ = get_plot_types_data( - (merged_reg_data, plot_types, overplot_thresh, 'Group1')) + plot_types = ['Downsample' for _ in plot_intervals] + if ctrl_reads_index is None: + if std_ref is None: + for p_int in plot_intervals: + p_int.add_reads(reads_index).add_seq() + SignalData = get_r_raw_signal_data( + plot_intervals, plot_types, overplot_thresh, 'Group1', + genome_centric=False) else: - (merged_reg_data, all_reg_model_data, - all_reg_alt_model_data) = get_reg_kmers( - tb_model_fn, plot_intervals, raw_read_coverage1, - alt_model_fn=alt_model_fn) - plot_types = ['Downsample' for _ in merged_reg_data] - SignalData, _, _, _ = 
get_plot_types_data( - (merged_reg_data, plot_types, overplot_thresh, 'Group1')) - ModelData = get_model_r_data(all_reg_model_data) - if alt_model_fn is not None: - AltModelData = get_model_r_data(all_reg_alt_model_data) + all_reg_model_data, all_reg_alt_model_data = get_reg_kmers( + std_ref, plot_intervals, reads_index, alt_ref=alt_ref) + SignalData = get_r_raw_signal_data( + plot_intervals, plot_types, overplot_thresh, 'Group1', + genome_centric=False) + + ModelData = get_model_r_data(all_reg_model_data, genome_centric=False) + if alt_ref is not None: + AltModelData = get_model_r_data( + all_reg_alt_model_data, genome_centric=False) else: - all_reg_data1 = th.get_region_reads( - plot_intervals, raw_read_coverage1, filter_no_cov=False, - add_seq=False) - all_reg_data2 = th.get_region_reads( - plot_intervals, raw_read_coverage2, filter_no_cov=False, - add_seq=False) - - (merged_reg_data, all_reg_data1, - all_reg_data2) = filter_and_merge_group_regs( - all_reg_data1, all_reg_data2) - plot_types = ['Downsample' for _ in merged_reg_data] - merged_reg_data = th.add_reg_seq(merged_reg_data) + all_reg_data = [p_int.copy().add_reads(reads_index) + for p_int in plot_intervals] + ctrl_reg_data = [p_int.copy().add_reads(ctrl_reads_index) + for p_int in plot_intervals] + + plot_intervals, all_reg_data, ctrl_reg_data = filter_and_merge_regs( + all_reg_data, ctrl_reg_data) # sigDat lists - SignalData1, _, _, _ = get_plot_types_data( - (all_reg_data1, plot_types, overplot_thresh, 'Group1')) - SignalData2, _, _, _ = get_plot_types_data( - (all_reg_data2, plot_types, overplot_thresh, 'Group2')) - SignalData = r.DataFrame.rbind(SignalData1, SignalData2) + plot_types = ['Downsample' for _ in plot_intervals] + SignalData = get_r_raw_signal_data( + all_reg_data, plot_types, overplot_thresh, 'Group1', + genome_centric=False) + CtrlSignalData = get_r_raw_signal_data( + ctrl_reg_data, plot_types, overplot_thresh, 'Group2', + genome_centric=False) + SignalData = _R_DF.rbind(SignalData, CtrlSignalData) - BasesData = get_base_r_data(merged_reg_data) + BasesData = get_base_r_data(plot_intervals, genome_centric=False) plot_poss, plot_stats = zip(*stat_locs) # stat lists @@ -1313,10 +1351,10 @@ def plot_motif_centered_with_stats( 'Position':r.FloatVector(plot_poss), 'Stat':r.FloatVector(plot_stats)}) - if VERBOSE: th._status_message('Plotting.') + if VERBOSE: th.status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotMotifStats.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=8)') - if alt_model_fn is None: + if alt_ref is None: r.globalenv[str('plotMotifStats')]( SignalData, BasesData, StatsData, ModelData) else: @@ -1327,32 +1365,30 @@ def plot_motif_centered_with_stats( return def plot_model_single_sample( - plot_intervals, raw_read_coverage, tb_model_fn, - overplot_type, overplot_thresh, pdf_fn, alt_model_fn=None, seqs_fn=None): - if VERBOSE: th._status_message('Preparing plot data.') + plot_intervals, reads_index, std_ref, overplot_type, overplot_thresh, + pdf_fn, alt_ref=None, seqs_fn=None, title_include_chrm=True, + title_include_cov=True): + if VERBOSE: th.status_message('Preparing plot data.') # get reads overlapping each region along with all kmers - all_reg_data, all_reg_model_data, all_reg_alt_model_data = get_reg_kmers( - tb_model_fn, plot_intervals, raw_read_coverage, - alt_model_fn=alt_model_fn) - if len(all_reg_data) == 0: - th._error_message_and_exit('No reads in any selected regions.') - if len(all_reg_data) < len(plot_intervals): - th._warning_message('Some selected regions 
contain no reads.')
-    rna = th.is_rna(raw_read_coverage)
+    all_reg_model_data, all_reg_alt_model_data = get_reg_kmers(
+        std_ref, plot_intervals, reads_index, alt_ref=alt_ref)
+    plot_intervals = th.filter_empty_regions(plot_intervals)
+    rna = th.is_sample_rna(reads_index=reads_index)
     Titles, plot_types = get_plots_titles(
-        all_reg_data, None, overplot_type, overplot_thresh, True)
+        plot_intervals, None, overplot_type, overplot_thresh, True,
+        include_chrm=title_include_chrm, include_cov=title_include_cov)
     ModelData = get_model_r_data(all_reg_model_data)
-    if alt_model_fn is not None:
+    if alt_ref is not None:
         AltModelData = get_model_r_data(all_reg_alt_model_data)
-    BasesData = get_base_r_data(all_reg_data, is_rna=rna)
+    BasesData = get_base_r_data(plot_intervals, is_rna=rna)
     SignalData, QuantData, BoxData, EventData = get_plot_types_data(
-        (all_reg_data, plot_types, overplot_thresh, 'Group1'))
+        (plot_intervals, plot_types, overplot_thresh, 'Group1'))
-    if VERBOSE: th._status_message('Plotting.')
+    if VERBOSE: th.status_message('Plotting.')
     r.r(resource_string(__name__, 'R_scripts/plotModelComp.R').decode())
     r.r('pdf("' + pdf_fn + '", height=5, width=11)')
-    if alt_model_fn is None:
+    if alt_ref is None:
         r.globalenv[str('plotModelComp')](
             SignalData, QuantData, BoxData, EventData,
             BasesData, Titles, ModelData)
@@ -1363,9 +1399,9 @@
     r.r('dev.off()')
     if seqs_fn is not None:
-        if VERBOSE: th._status_message('Outputting region seqeuences.')
+        if VERBOSE: th.status_message('Outputting region sequences.')
         with io.open(seqs_fn, 'wt') as seqs_fp:
-            for int_i in all_reg_data:
+            for int_i in plot_intervals:
                 reg_seq = int_i.seq if int_i.strand == '+' else th.rev_comp(
                     int_i.seq)
                 seqs_fp.write('>{0}::{1:d}::{2} {3}\n{4}\n'.format(
@@ -1375,12 +1411,13 @@
     return
 def plot_per_read_modification(
-        all_reg_data, all_reg_stats, are_pvals, box_center, pdf_fn):
+        plot_intervals, all_reg_stats, are_pvals, box_center, pdf_fn):
-    if VERBOSE: th._status_message('Preparing plot data.')
+    if VERBOSE: th.status_message('Preparing plot data.')
     StatData, OrdData = get_reg_r_stats(all_reg_stats, are_pvals)
-    BasesData = get_base_r_data(all_reg_data, zero_start=True)
+    BasesData = get_base_r_data(
+        plot_intervals, zero_start=True, genome_centric=False)
-    if VERBOSE: th._status_message('Plotting.')
+    if VERBOSE: th.status_message('Plotting.')
     r.r(resource_string(__name__, 'R_scripts/plotPerReadStats.R').decode())
     r.r('pdf("' + pdf_fn + '", height=5, width=11)')
     r.globalenv[str('plotPerReadStats')](
@@ -1394,77 +1431,65 @@
 #### Plot processing methods ####
 #################################
-def get_valid_model_fns(
-        tb_model_fn, plot_default_stnd, alt_model_fn,
-        plot_default_alt, raw_read_coverage, f5_dirs2=None):
-    # if no model was requested
-    if (tb_model_fn is None and not plot_default_stnd and
-        alt_model_fn is None and not plot_default_alt):
-        return None, None
-
-    if tb_model_fn is None:
-        tb_model_fn, _ = ts.get_default_standard_ref(raw_read_coverage)
-    if alt_model_fn is None and plot_default_alt is not None:
-        alt_model_fn, _ = ts.get_default_alt_ref(
-            plot_default_alt, raw_read_coverage)
-
-    if f5_dirs2 is not None and tb_model_fn is not None:
-        th._warning_message(
-            'Both a second set of FAST5s and a tombo model were ' +
-            'provided. Two samples with model plotting is not ' +
-            'currently available. 
Models requested will be ignored.') - - return tb_model_fn, alt_model_fn - def plot_max_coverage( - f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, - f5_dirs2, num_regions, num_bases, overplot_thresh, overplot_type, - tb_model_fn, alt_model_fn, plot_default_stnd, plot_default_alt): - raw_read_coverage = th.parse_fast5s( - f5_dirs1, corrected_group, basecall_subgroups) + fast5s_dirs, corr_grp, bc_subgrps, pdf_fn, ctrl_fast5s_dirs, + num_regions, num_bases, overplot_thresh, overplot_type, tb_model_fn, + alt_model_fn, plot_default_stnd, plot_default_alt, uniq_intervals=True): + reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) - tb_model_fn, alt_model_fn = get_valid_model_fns( + std_ref, alt_ref = ts.load_valid_models( tb_model_fn, plot_default_stnd, alt_model_fn, plot_default_alt, - raw_read_coverage, f5_dirs2) - if f5_dirs2 is None: + reads_index, ctrl_fast5s_dirs) + if ctrl_fast5s_dirs is None: coverage_regions = [] - for chrm, strand, cs_cov, cs_cov_starts in th.get_coverage_regions( - raw_read_coverage): + for (chrm, strand, cs_cov, + cs_cov_starts) in reads_index.iter_coverage_regions(): coverage_regions.extend(zip( cs_cov, cs_cov_starts, repeat(chrm), repeat(strand))) # max coverage plots both strands coverage - plot_intervals = [ - th.intervalData('{:03d}'.format(rn), chrm, start, start + num_bases) - for rn, (stat, start, chrm, strand) in - enumerate(sorted(coverage_regions, reverse=True)[:num_regions])] + if uniq_intervals: + plot_intervals = [ + th.intervalData(chrm=chrm, start=start, end=start + num_bases, + reg_id='{:03d}'.format(rn)) + for rn, (stat, start, chrm, strand) in + enumerate(sorted(coverage_regions, reverse=True)[:( + num_regions * 20)])] + plot_intervals = th.get_unique_intervals( + plot_intervals, num_regions=num_regions) + else: + plot_intervals = [ + th.intervalData(chrm=chrm, start=start, end=start + num_bases, + reg_id='{:03d}'.format(rn)) + for rn, (stat, start, chrm, strand) in + enumerate(sorted(coverage_regions, reverse=True)[:num_regions])] - if tb_model_fn is None: + if std_ref is None: plot_single_sample( - plot_intervals, raw_read_coverage, overplot_thresh, + plot_intervals, reads_index, overplot_thresh, overplot_type, pdf_fn) else: plot_model_single_sample( - plot_intervals, raw_read_coverage, tb_model_fn, - overplot_type, overplot_thresh, pdf_fn, alt_model_fn) + plot_intervals, reads_index, std_ref, + overplot_type, overplot_thresh, pdf_fn, alt_ref) else: - raw_read_coverage2 = th.parse_fast5s( - f5_dirs2, corrected_group, basecall_subgroups) + ctrl_reads_index = th.TomboReads(ctrl_fast5s_dirs, corr_grp, bc_subgrps) coverage_regions = [] # only process chromosomes in both read groups - for chrm, strand, cs_cov, cs_cov_starts in th.get_coverage_regions( - raw_read_coverage, raw_read_coverage2): + for (chrm, strand, cs_cov, + cs_cov_starts) in reads_index.iter_coverage_regions( + ctrl_reads_index): coverage_regions.extend(zip( cs_cov, cs_cov_starts, repeat(chrm), repeat(strand))) # max coverage plots both strands coverage plot_intervals = [ - th.intervalData('{:03d}'.format(rn), chrm, start, start + num_bases) + th.intervalData(chrm=chrm, start=start, end=start + num_bases, + reg_id='{:03d}'.format(rn)) for rn, (stat, start, chrm, strand) in enumerate(sorted(coverage_regions, reverse=True)[:num_regions])] - plot_two_samples( - plot_intervals, raw_read_coverage, raw_read_coverage2, + plot_intervals, reads_index, ctrl_reads_index, overplot_thresh, overplot_type, pdf_fn) return @@ -1473,86 +1498,81 @@ def plot_max_coverage( 
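The max-coverage selection in ``plot_max_coverage`` above reduces to a simple pattern: coverage runs are flattened into ``(coverage, start, chrm, strand)`` tuples, sorted in descending order so the deepest-coverage windows come first, and the top windows become fixed-width plotting intervals. A minimal sketch of that pattern, assuming plain tuples in place of Tombo's ``intervalData`` objects (illustrative names only):

::

    # pick the num_regions deepest-coverage windows of num_bases each;
    # coverage_regions holds (coverage, start, chrm, strand) tuples
    def top_coverage_windows(coverage_regions, num_bases, num_regions):
        return [(chrm, start, start + num_bases, '{:03d}'.format(rn))
                for rn, (cov, start, chrm, strand) in enumerate(
                    sorted(coverage_regions, reverse=True)[:num_regions])]

    # e.g. top_coverage_windows(
    #     [(12, 100, 'chr1', '+'), (40, 5000, 'chr1', '-')], 20, 1)
    # -> [('chr1', 5000, 5020, '000')]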
_plot_max_wrapper = plot_max_coverage def plot_max_coverage(*args, **kwargs): import cProfile - cProfile.runctx('_plot_max_wrapper(*args, **kwargs)', globals(), locals(), + cProfile.runctx('_plot_max_wrapper(*args, **kwargs)', + globals(), locals(), filename='plot_max_cov.prof') return def plot_genome_locations( - f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, - f5_dirs2, num_bases, overplot_thresh, overplot_type, + fast5s_dirs, corr_grp, bc_subgrps, pdf_fn, + ctrl_fast5s_dirs, num_bases, overplot_thresh, overplot_type, genome_locs, tb_model_fn, alt_model_fn, plot_default_stnd, plot_default_alt): - if VERBOSE: th._status_message('Parsing genome locations.') + if VERBOSE: th.status_message('Parsing genome locations.') # minus one here as all python internal coords are 0-based, but # genome is generally 1-based plot_intervals = [] for i, (chrm, pos, strand) in enumerate(th.parse_genome_locations( genome_locs)): - int_start = max( - 0, int(int(pos) - np.floor(num_bases / 2.0) - 1)) + int_start = max(0, int(pos - np.floor(num_bases / 2.0))) plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, int_start, int_start + num_bases, strand)) + chrm=chrm, start=int_start, end=int_start + num_bases, strand=strand, + reg_id='{:03d}'.format(i))) - raw_read_coverage = th.parse_fast5s( - f5_dirs1, corrected_group, basecall_subgroups) - tb_model_fn, alt_model_fn = get_valid_model_fns( + reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) + std_ref, alt_ref = ts.load_valid_models( tb_model_fn, plot_default_stnd, alt_model_fn, plot_default_alt, - raw_read_coverage, f5_dirs2) + reads_index, ctrl_fast5s_dirs) - if f5_dirs2 is None: - if tb_model_fn is None: + if ctrl_fast5s_dirs is None: + if std_ref is None: plot_single_sample( - plot_intervals, raw_read_coverage, overplot_thresh, + plot_intervals, reads_index, overplot_thresh, overplot_type, pdf_fn) else: plot_model_single_sample( - plot_intervals, raw_read_coverage, tb_model_fn, - overplot_type, overplot_thresh, pdf_fn, alt_model_fn) + plot_intervals, reads_index, std_ref, overplot_type, + overplot_thresh, pdf_fn, alt_ref) else: - raw_read_coverage2 = th.parse_fast5s( - f5_dirs2, corrected_group, basecall_subgroups) + ctrl_reads_index = th.TomboReads(ctrl_fast5s_dirs, corr_grp, bc_subgrps) plot_two_samples( - plot_intervals, raw_read_coverage, raw_read_coverage2, + plot_intervals, reads_index, ctrl_reads_index, overplot_thresh, overplot_type, pdf_fn) return def plot_per_read_mods_genome_location( - f5_dirs, corrected_group, basecall_subgroups, pdf_fn, - per_read_stats_fn, genome_locs, num_bases, num_reads, box_center, - fasta_fn): - if VERBOSE: th._status_message('Parsing genome locations.') + fast5s_dirs, corr_grp, bc_subgrps, pdf_fn, per_read_stats_fn, + genome_locs, num_bases, num_reads, box_center, fasta_fn): + if VERBOSE: th.status_message('Parsing genome locations.') plot_intervals = [] for i, (chrm, pos, strand) in enumerate(th.parse_genome_locations( genome_locs, default_strand='+')): - int_start = max( - 0, int(int(pos) - np.floor(num_bases / 2.0) - 1) + 1) + int_start = max(0, int(pos - np.floor(num_bases / 2.0))) plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, int_start, int_start + num_bases, strand)) + chrm=chrm, start=int_start, end=int_start + num_bases, + strand=strand, reg_id='{:03d}'.format(i))) # add sequence to each region if fast5s or fasta are provided if fasta_fn is not None: genome_index = th.Fasta(fasta_fn) - plot_intervals_w_seq = [] for int_data in plot_intervals: - 
plot_intervals_w_seq.append( - int_data._replace(seq=genome_index.get_seq( - int_data.chrm, int_data.start, int_data.end))) - plot_intervals = plot_intervals_w_seq - elif f5_dirs is not None: - raw_read_coverage = th.parse_fast5s( - f5_dirs, corrected_group, basecall_subgroups) - plot_intervals = th.get_region_reads(plot_intervals, raw_read_coverage) + int_data.add_seq(genome_index) + elif fast5s_dirs is not None: + reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) + for p_int in plot_intervals: + p_int.add_reads(reads_index).add_seq() else: - th._warning_message( + th.warning_message( 'No read FAST5 directory or genome FASTA file provided. ' + 'Plotting without sequence.') - if VERBOSE: th._status_message('Parsing per read statistics.') + if VERBOSE: th.status_message('Parsing per read statistics.') per_read_stats = ts.PerReadStats(per_read_stats_fn) interval_stats = [] for int_data in plot_intervals: - int_stats = per_read_stats.get_region_per_read_stats(int_data, num_reads) + int_stats = per_read_stats.get_region_per_read_stats( + int_data, num_reads) if int_stats is not None: # convert long form stats to matrix form (so they can be clustered) # regular sort doesn't seem to work for string (object) types @@ -1561,7 +1581,7 @@ def plot_per_read_mods_genome_location( # use interval data instead of stats dimensions since regDat is # used to compute some window distances in R, so it must be full # matrix for the region with NAs - int_len = int_data.end - int_data.start + 1 + int_len = int_data.end - int_data.start all_read_stats = np.split( int_stats, np.where(int_stats['read_id'][:-1] != int_stats['read_id'][1:])[0] + 1) @@ -1572,7 +1592,8 @@ def plot_per_read_mods_genome_location( np.put(read_stats_mat[read_i,:], read_int_stats['pos'] - int_data.start, read_int_stats['stat']) - interval_stats.append((int_data.reg_id, read_stats_mat)) + interval_stats.append(( + int_data.reg_id, int_data.strand, read_stats_mat)) are_pvals = per_read_stats.are_pvals per_read_stats.close() @@ -1583,11 +1604,11 @@ def plot_per_read_mods_genome_location( return def plot_motif_centered( - f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, - f5_dirs2, num_regions, num_bases, overplot_thresh, overplot_type, + fast5s_dirs, corr_grp, bc_subgrps, pdf_fn, ctrl_fast5s_dirs, + num_regions, num_bases, overplot_thresh, overplot_type, motif, fasta_fn, deepest_coverage, tb_model_fn, alt_model_fn, plot_default_stnd, plot_default_alt): - if VERBOSE: th._status_message('Identifying genomic k-mer locations.') + if VERBOSE: th.status_message('Identifying genomic k-mer locations.') genome_index = th.Fasta(fasta_fn) motif = th.TomboMotif(motif) @@ -1605,10 +1626,10 @@ def get_motif_locs(covered_chrms): motif_locs.append((chrm, motif_loc.start(), '-')) if len(motif_locs) == 0: - th._error_message_and_exit( + th.error_message_and_exit( 'Motif (' + motif.raw_motif + ') not found in genome.') elif len(motif_locs) < num_regions: - th._warning_message( + th.warning_message( 'Motif (' + motif.raw_motif + ') only found ' + unicode(len(motif_locs)) + ' times in genome.') num_region = len(motif_locs) @@ -1616,16 +1637,12 @@ def get_motif_locs(covered_chrms): return motif_locs - def get_pos_cov(chrm, pos, strand, read_coverage, read_coverage2=None): + def get_pos_cov(chrm, pos, strand, reads_index, ctrl_reads_index): + """Compute minimum coverage between the 2 samples + """ def get_strand_cov(cov_strand): - try: - if read_coverage2 is None: - return read_coverage[(chrm, cov_strand)][pos] - else: - return 
min(read_coverage[(chrm, cov_strand)][pos], - read_coverage2[(chrm, cov_strand)][pos]) - except (IndexError, KeyError): - return 0 + return min(reads_index.get_coverage(chrm, pos, cov_strand), + ctrl_reads_index.get_coverage(chrm, pos, cov_strand)) # if strand is not specified get max coverage over both strands if strand is None: @@ -1634,24 +1651,20 @@ def get_strand_cov(cov_strand): return get_strand_cov(strand) - raw_read_coverage = th.parse_fast5s( - f5_dirs1, corrected_group, basecall_subgroups) - tb_model_fn, alt_model_fn = get_valid_model_fns( + reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) + std_ref, alt_ref = ts.load_valid_models( tb_model_fn, plot_default_stnd, alt_model_fn, plot_default_alt, - raw_read_coverage, f5_dirs2) + reads_index, ctrl_fast5s_dirs) - if deepest_coverage: - read_coverage = th.get_coverage(raw_read_coverage) - if f5_dirs2 is None: - covered_chrms = set(map(itemgetter(0), raw_read_coverage)) + if ctrl_fast5s_dirs is None: + covered_chrms = set(map(itemgetter(0), reads_index.get_all_cs())) # filter out motif_locs to chromosomes not covered motif_locs = get_motif_locs(covered_chrms) if deepest_coverage: - if VERBOSE: th._status_message('Finding deepest coverage regions.') + if VERBOSE: th.status_message('Finding deepest coverage regions.') motif_locs_cov = sorted([ - (get_pos_cov(chrm, pos, strand, read_coverage), - chrm, pos, strand) + (reads_index.get_coverage(chrm, pos, strand), chrm, pos, strand) for chrm, pos, strand in motif_locs], reverse=True) plot_intervals = [] for i, (cov, chrm, pos, strand) in enumerate(motif_locs_cov): @@ -1659,57 +1672,48 @@ def get_strand_cov(cov_strand): 0, pos - int((num_bases - motif.motif_len + 1) / 2.0)) int_end = int_start + num_bases plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, int_start, int_end, strand)) + chrm=chrm, start=int_start, end=int_end, strand=strand, + reg_id='{:03d}'.format(i))) if len(plot_intervals) >= num_regions: break # plot random covered regions else: # iterate over regions and check if they have any coverage plot_intervals = [] for i, (chrm, pos, strand) in enumerate(motif_locs): - int_start = max( - 0, pos - int((num_bases - motif.motif_len + 1) / 2.0)) - int_end = int_start + num_bases - if strand is None and any( - ((chrm, s) in raw_read_coverage and - any(r_data.start < pos < r_data.end - for r_data in raw_read_coverage[(chrm, s)])) - for s in ('+', '-')): - plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, int_start, int_end, strand)) - elif ((chrm, strand) in raw_read_coverage and - any(r_data.start < pos < r_data.end - for r_data in raw_read_coverage[(chrm, strand)])): + if reads_index.get_coverage(chrm, pos, strand) > 0: + int_start = max( + 0, pos - int((num_bases - motif.motif_len + 1) / 2.0)) plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, int_start, int_end, strand)) + chrm=chrm, start=int_start, end=int_start + num_bases, + strand=strand, reg_id='{:03d}'.format(i))) if len(plot_intervals) >= num_regions: break - if tb_model_fn is None: + if std_ref is None: plot_single_sample( - plot_intervals, raw_read_coverage, overplot_thresh, + plot_intervals, reads_index, overplot_thresh, overplot_type, pdf_fn) else: plot_model_single_sample( - plot_intervals, raw_read_coverage, tb_model_fn, - overplot_type, overplot_thresh, pdf_fn, alt_model_fn) + plot_intervals, reads_index, std_ref, overplot_type, + overplot_thresh, pdf_fn, alt_ref) # two sample plot else: - raw_read_coverage2 = th.parse_fast5s( - f5_dirs2, corrected_group, 
basecall_subgroups) + ctrl_reads_index = th.TomboReads(ctrl_fast5s_dirs, corr_grp, bc_subgrps) - covered_chrms = set(map(itemgetter(0), raw_read_coverage)).intersection( - map(itemgetter(0), raw_read_coverage2)) + covered_chrms = set( + map(itemgetter(0), reads_index.get_all_cs())).intersection( + map(itemgetter(0), ctrl_reads_index.get_all_cs())) # filter out motif_locs to chromosomes not covered motif_locs = get_motif_locs(covered_chrms) if deepest_coverage: - read_coverage2 = th.get_coverage(raw_read_coverage2) - if VERBOSE: th._status_message('Finding deepest coverage regions.') + if VERBOSE: th.status_message('Finding deepest coverage regions.') motif_locs_cov = sorted([ - (get_pos_cov(chrm, pos, strand, read_coverage, read_coverage2), + (get_pos_cov(chrm, pos, strand, reads_index, ctrl_reads_index), chrm, pos, strand) for chrm, pos, strand in motif_locs], reverse=True) if motif_locs_cov[0][0] == 0: - th._error_message_and_exit( + th.error_message_and_exit( 'Motif not covered by both groups at any positions.') plot_intervals = [] @@ -1717,160 +1721,120 @@ def get_strand_cov(cov_strand): int_start = max( 0, pos - int((num_bases - motif.motif_len + 1) / 2.0)) plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, int_start, - int_start + num_bases, strand)) + chrm=chrm, start=int_start, end=int_start + num_bases, + strand=strand, reg_id='{:03d}'.format(i))) if len(plot_intervals) >= num_regions: break # plot random covered regions else: # iterate over regions and check if they have any coverage plot_intervals = [] for i, (chrm, pos, strand) in enumerate(motif_locs): - int_start = max( - 0, pos - int((num_bases - motif.motif_len + 1) / 2.0)) - int_end = int_start + num_bases - if strand is None and any(( - (chrm, s) in raw_read_coverage and - (chrm, s) in raw_read_coverage2 and - any(r_data.start < pos < r_data.end - for r_data in raw_read_coverage[(chrm, s)]) and - any(r_data2.start < pos < r_data2.end - for r_data2 in raw_read_coverage2[(chrm, s)])) - for s in ('+', '-')): + if get_pos_cov(chrm, pos, strand, + reads_index, ctrl_reads_index) > 0: + int_start = max( + 0, pos - int((num_bases - motif.motif_len + 1) / 2.0)) plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, int_start, int_end, strand)) - elif ((chrm, strand) in raw_read_coverage and - (chrm, strand) in raw_read_coverage2 and - any(r_data.start < pos < r_data.end - for r_data in raw_read_coverage[(chrm, strand)]) and - any(r_data2.start < pos < r_data2.end - for r_data2 in raw_read_coverage2[(chrm, strand)])): - plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, int_start, int_end, strand)) - - if len(plot_intervals) >= num_regions: break + chrm=chrm, start=int_start, end=int_start + num_bases, + strand=strand, reg_id='{:03d}'.format(i))) + if len(plot_intervals) >= num_regions: break if len(plot_intervals) == 0: - th._error_message_and_exit( + th.error_message_and_exit( 'Motif not covered by both groups at any positions.') plot_two_samples( - plot_intervals, raw_read_coverage, raw_read_coverage2, + plot_intervals, reads_index, ctrl_reads_index, overplot_thresh, overplot_type, pdf_fn) return def plot_max_diff( - f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, - f5_dirs2, num_regions, num_bases, overplot_thresh, overplot_type, - seqs_fn): - raw_read_coverage1 = th.parse_fast5s( - f5_dirs1, corrected_group, basecall_subgroups) - raw_read_coverage2 = th.parse_fast5s( - f5_dirs2, corrected_group, basecall_subgroups) - - if VERBOSE: th._status_message('Getting largest mean 
signal differences.') + fast5s_dirs, corr_grp, bc_subgrps, pdf_fn, ctrl_fast5s_dirs, + num_regions, num_bases, overplot_thresh, overplot_type, seqs_fn): + reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) + ctrl_reads_index = th.TomboReads(ctrl_fast5s_dirs, corr_grp, bc_subgrps) + + if VERBOSE: th.status_message('Getting largest mean signal differences.') plot_intervals = [ th.intervalData( - '{:03d}'.format(rn), chrm, start, start + num_bases, strand, - '(Mean diff: {:.2f})'.format(stat)) + chrm=chrm, start=start, end=start + num_bases, strand=strand, + reg_id='{:03d}'.format(rn), + reg_text='(Mean diff: {:.2f})'.format(stat)) for rn, (stat, start, chrm, strand) in enumerate(th.get_largest_signal_differences( - raw_read_coverage1, raw_read_coverage2, num_regions, num_bases))] + reads_index, ctrl_reads_index, num_regions, num_bases))] plot_two_samples( - plot_intervals, raw_read_coverage1, raw_read_coverage2, - overplot_thresh, overplot_type, pdf_fn, seqs_fn) + plot_intervals, reads_index, ctrl_reads_index, overplot_thresh, + overplot_type, pdf_fn, seqs_fn) return def plot_most_signif( - f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, - f5_dirs2, num_regions, overplot_thresh, seqs_fn, num_bases, - overplot_type, stats_fn, tb_model_fn, alt_model_fn, - plot_default_stnd, plot_default_alt, cov_damp_counts): - if VERBOSE: th._status_message('Loading statistics from file.') + fast5s_dirs, corr_grp, bc_subgrps, pdf_fn, ctrl_fast5s_dirs, + num_regions, overplot_thresh, seqs_fn, num_bases, overplot_type, + stats_fn, tb_model_fn, alt_model_fn, + plot_default_stnd, plot_default_alt): + if VERBOSE: th.status_message('Loading statistics from file.') plot_intervals = ts.TomboStats(stats_fn).get_most_signif_regions( - num_bases, num_regions, cov_damp_counts=cov_damp_counts) + num_bases, num_regions, prepend_loc_to_text=True) - raw_read_coverage = th.parse_fast5s( - f5_dirs1, corrected_group, basecall_subgroups) - tb_model_fn, alt_model_fn = get_valid_model_fns( + reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) + std_ref, alt_ref = ts.load_valid_models( tb_model_fn, plot_default_stnd, alt_model_fn, plot_default_alt, - raw_read_coverage, f5_dirs2) + reads_index, ctrl_fast5s_dirs) - if f5_dirs2 is None: - if tb_model_fn is None: + if ctrl_fast5s_dirs is None: + if std_ref is None: plot_single_sample( - plot_intervals, raw_read_coverage, overplot_thresh, - overplot_type, pdf_fn) + plot_intervals, reads_index, overplot_thresh, + overplot_type, pdf_fn, title_include_chrm=False) else: plot_model_single_sample( - plot_intervals, raw_read_coverage, tb_model_fn, - overplot_type, overplot_thresh, pdf_fn, alt_model_fn) + plot_intervals, reads_index, std_ref, overplot_type, + overplot_thresh, pdf_fn, alt_ref, + title_include_chrm=False) else: - raw_read_coverage2 = th.parse_fast5s( - f5_dirs2, corrected_group, basecall_subgroups) + ctrl_reads_index = th.TomboReads(ctrl_fast5s_dirs, corr_grp, bc_subgrps) plot_two_samples( - plot_intervals, raw_read_coverage, raw_read_coverage2, - overplot_thresh, overplot_type, pdf_fn, seqs_fn) + plot_intervals, reads_index, ctrl_reads_index, + overplot_thresh, overplot_type, pdf_fn, seqs_fn, + title_include_chrm=False) return -def get_unique_intervals(plot_intervals, covered_poss=None, num_regions=None): - # unique genomic regions filter - uniq_p_intervals = [] - used_intervals = defaultdict(set) - for int_i in plot_intervals: - # could have significant region immediately next to - # beginning/end of reads - interval_poss = list(range(int_i.start, 
int_i.end))
-        if int_i.start not in used_intervals[(int_i.chrm, int_i.strand)] and (
-                covered_poss is None or all(
-                    pos in covered_poss[(int_i.chrm, int_i.strand)]
-                    for pos in interval_poss)):
-            uniq_p_intervals.append(int_i)
-            used_intervals[(int_i.chrm, int_i.strand)].update(interval_poss)
-        if num_regions is not None and len(uniq_p_intervals) >= num_regions:
-            break
-
-    return uniq_p_intervals
-
 def plot_motif_centered_signif(
-        f5_dirs1, corrected_group, basecall_subgroups, pdf_fn,
-        f5_dirs2, num_regions, overplot_thresh, motif, stats_fn,
-        context_width, num_stats, tb_model_fn, alt_model_fn,
-        plot_default_stnd, plot_default_alt, fasta_fn, cov_damp_counts):
+        fast5s_dirs, corr_grp, bc_subgrps, pdf_fn, ctrl_fast5s_dirs,
+        num_regions, overplot_thresh, motif, stats_fn, context_width,
+        num_stats, tb_model_fn, alt_model_fn,
+        plot_default_stnd, plot_default_alt, fasta_fn):
     try:
         importr(str('gridExtra'))
     except:
-        th._error_message_and_exit(
+        th.error_message_and_exit(
            'Must have R packge `gridExtra` installed in order to ' +
            'create motif centered plots.')
     motif = th.TomboMotif(motif)
     genome_index = th.Fasta(fasta_fn)
-    if VERBOSE: th._status_message('Loading statistics from file.')
+    if VERBOSE: th.status_message('Loading statistics from file.')
     all_stats = ts.TomboStats(stats_fn)
-    if VERBOSE: th._status_message('Sorting statistics.')
-    all_stats.order_by_frac(cov_damp_counts)
-    raw_read_coverage1 = th.parse_fast5s(
-        f5_dirs1, corrected_group, basecall_subgroups)
-    raw_read_coverage2 = th.parse_fast5s(
-        f5_dirs2, corrected_group, basecall_subgroups) \
-        if f5_dirs2 is not None else None
+    reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps)
+    ctrl_reads_index = th.TomboReads(ctrl_fast5s_dirs, corr_grp, bc_subgrps) \
+        if ctrl_fast5s_dirs is not None else None
-    tb_model_fn, alt_model_fn = get_valid_model_fns(
+    std_ref, alt_ref = ts.load_valid_models(
         tb_model_fn, plot_default_stnd, alt_model_fn, plot_default_alt,
-        raw_read_coverage1, f5_dirs2)
+        reads_index, ctrl_fast5s_dirs)
-    if VERBOSE: th._status_message('Finding most signficant regions with motif.')
+    if VERBOSE: th.status_message('Finding most significant regions with motif.')
     motif_regions_data = []
     search_width = ((context_width + motif.motif_len) * 2) - 1
     for reg_seq, chrm, strand, start, end in all_stats.iter_stat_seqs(
             genome_index, motif.motif_len + context_width - 1,
-            motif.motif_len + context_width - 1, include_pos=True):
+            motif.motif_len + context_width - 1):
         reg_match = motif.motif_pat.search(reg_seq)
         if reg_match:
             offset = reg_match.start()
@@ -1886,11 +1850,12 @@
             break
     if len(motif_regions_data) == 0:
-        th._error_message_and_exit(
+        th.error_message_and_exit(
             'No covered and tested sites contain motif of interest.')
     if len(motif_regions_data) < num_stats:
-        th._warning_message(
-            'Fewer covered and tested motif sites found than requested.')
+        th.warning_message(
+            'Fewer covered and tested motif sites found than requested. ' +
+            'Proceeding with ' + str(len(motif_regions_data)) + ' regions.')
     plot_width = motif.motif_len + (context_width * 2)
     def get_stat_pos(start, chrm, strand):
@@ -1906,84 +1871,79 @@
         return reg_pos_fracs
-    if VERBOSE: th._status_message('Getting all regions statistics.')
+    if VERBOSE: th.status_message('Getting all regions statistics.')
     stat_locs = [
         loc_stat for motif_loc in motif_regions_data
         for loc_stat in get_stat_pos(*motif_loc)]
-    # TODO: Fix so that negative strand reads are plotted too.
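The motif handling above works on sequence windows around the most significant sites: ``motif.motif_pat.search(reg_seq)`` finds the motif occurrence, and its offset positions a plot window with ``context_width`` bases of flanking sequence on each side. A rough standalone sketch of that window arithmetic, using a plain ``re`` pattern in place of ``th.TomboMotif`` (names are illustrative, not Tombo's API):

::

    import re

    def motif_window(reg_seq, reg_start, motif_pat, motif_len, context_width):
        # center the window on the first motif hit with equal flanking context
        match = motif_pat.search(reg_seq)
        if match is None:
            return None
        win_start = reg_start + match.start() - context_width
        return win_start, win_start + motif_len + (context_width * 2)

    # CCWGG with W expanded to [AT]; 5 motif bases + 2 * 2 context = 9 bases
    print(motif_window('AACCAGGTT', 1000, re.compile('CC[AT]GG'), 5, 2))
    # -> (1000, 1009)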
- # requires adding "don't reverse signal" option in getting plot data plot_intervals = [] for i, (reg_start, chrm, strand) in enumerate(motif_regions_data): - if strand == '-': continue plot_intervals.append(th.intervalData( - '{:03d}'.format(i), chrm, reg_start, reg_start + plot_width, - strand)) + chrm=chrm, start=reg_start, end=reg_start + plot_width, + strand=strand, reg_id='{:03d}'.format(i))) if len(plot_intervals) >= num_regions: break plot_motif_centered_with_stats( - raw_read_coverage1, raw_read_coverage2, plot_intervals, - stat_locs, overplot_thresh, pdf_fn, tb_model_fn, alt_model_fn) + reads_index, ctrl_reads_index, plot_intervals, + stat_locs, overplot_thresh, pdf_fn, std_ref, alt_ref) return def cluster_most_signif( - f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, - f5_dirs2, num_regions, num_bases, - r_struct_fn, num_processes, fasta_fn, stats_fn, slide_span): - if VERBOSE: th._status_message('Loading statistics from file.') + fast5s_dirs, corr_grp, bc_subgrps, pdf_fn, ctrl_fast5s_dirs, + num_regions, num_bases, r_struct_fn, num_processes, fasta_fn, + stats_fn, slide_span): + if VERBOSE: th.status_message('Loading statistics from file.') plot_intervals = ts.TomboStats(stats_fn).get_most_signif_regions( num_bases + (slide_span * 2), num_regions) - raw_read_coverage1 = th.parse_fast5s( - f5_dirs1, corrected_group, basecall_subgroups) - raw_read_coverage2 = th.parse_fast5s( - f5_dirs2, corrected_group, basecall_subgroups) + reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) + ctrl_reads_index = th.TomboReads(ctrl_fast5s_dirs, corr_grp, bc_subgrps) # calculate positions covered by at least one read in both sets - read_coverage1 = th.get_coverage(raw_read_coverage1) - read_coverage2 = th.get_coverage(raw_read_coverage2) - covered_poss = dict( - (chrm_strand, set( - np.where(read_coverage1[chrm_strand] > 0)[0]).intersection( - np.where(read_coverage2[chrm_strand] > 0)[0])) - for chrm_strand in set(read_coverage1).intersection( - read_coverage2)) + covered_poss = {} + for chrm_strand in set(reads_index.get_all_cs()).intersection( + ctrl_reads_index.get_all_cs()): + covered_poss[chrm_strand] = set( + np.where(reads_index.get_cs_coverage( + *chrm_strand) > 0)[0]).intersection( + np.where(ctrl_reads_index.get_cs_coverage( + *chrm_strand) > 0)[0]) # unique genomic regions filter - plot_intervals = get_unique_intervals(plot_intervals, covered_poss) + plot_intervals = th.get_unique_intervals(plot_intervals, covered_poss) # get region data if outputting R data structure if r_struct_fn is not None: - if VERBOSE: th._status_message('Getting sequences.') + if VERBOSE: th.status_message('Getting sequences.') # expand regions for getting sequence by N in case motif is # the exact range found expand_pos = 2 - seq_intervals = [ - int_i._replace( - start=int_i.start - expand_pos, - end=int_i.start + expand_pos + num_bases + (slide_span * 2)) - for int_i in plot_intervals] + seq_intervals = [] + for p_int in plot_intervals: + seq_intervals.append(p_int.copy().update( + start=p_int.start - expand_pos, + end=p_int.start + expand_pos + num_bases + (slide_span * 2))) if fasta_fn is None: # add region sequences to column names for saved dist matrix - reg_seqs = [reg_data.seq for reg_data in th.get_region_sequences( - seq_intervals, raw_read_coverage1, raw_read_coverage2)] + reg_seqs = [ + seq_int.copy().add_reads(reads_index).merge( + seq_int.copy().add_reads(reads_index)).add_seq().seq + for seq_int in seq_intervals] else: genome_index = th.Fasta(fasta_fn) - reg_seqs = [ - 
genome_index.get_seq(int_i.chrm, int_i.start, int_i.end) - for int_i in seq_intervals] + reg_seqs = [int_i.add_seq(genome_index).seq + for int_i in seq_intervals] - if VERBOSE: th._status_message('Getting region signal differences.') - signal_diffs = th.get_signal_differences( - raw_read_coverage1, raw_read_coverage2) + if VERBOSE: th.status_message('Getting region signal differences.') + signal_diffs = th.get_signal_differences(reads_index, ctrl_reads_index) slide_span_val = slide_span if slide_span else 0 - reg_sig_diffs = [signal_diffs[(int_i.chrm, int_i.strand)][ - int_i.start:int_i.start+num_bases+(slide_span_val*2)] - for int_i in plot_intervals] + reg_sig_diffs = [signal_diffs[(p_int.chrm, p_int.strand)][ + p_int.start:p_int.start+num_bases+(slide_span_val*2)] + for p_int in plot_intervals] - if VERBOSE: th._status_message('Getting distance between signals.') + if VERBOSE: th.status_message('Getting distance between signals.') manager = mp.Manager() index_q = manager.Queue() dists_q = manager.Queue() @@ -2019,14 +1979,15 @@ def cluster_most_signif( ncol=len(reg_sig_diffs), byrow=True) if r_struct_fn is not None: + # add one for 1-based coordinates reg_sig_diff_dists.colnames = r.StrVector( - ['::'.join((seq, int_i.chrm, int_i.strand, unicode(int_i.start))) - for seq, int_i in zip(reg_seqs, plot_intervals)]) + ['::'.join((seq, p_int.chrm, p_int.strand, unicode(p_int.start + 1))) + for seq, p_int in zip(reg_seqs, plot_intervals)]) r_struct_fn = r.StrVector([r_struct_fn,]) else: r_struct_fn = r.NA_Character - if VERBOSE: th._status_message('Plotting (and saving data).') + if VERBOSE: th.status_message('Plotting (and saving data).') r.r(resource_string(__name__, 'R_scripts/plotSigMDS.R').decode()) r.r('pdf("' + pdf_fn + '", height=7, width=7)') r.globalenv[str('plotSigMDS')](reg_sig_diff_dists, r_struct_fn) @@ -2039,12 +2000,12 @@ def cluster_most_signif( #### Test rpy2 install #### ########################### -def test_r_install(): +def test_r_imports(): # first check for simple rpy2 install try: import rpy2 except: - th._error_message_and_exit( + th.error_message_and_exit( 'Must have rpy2 installed in order to plot. Run ' + '`python -c "import rpy2"` to identify installation issues.') @@ -2054,7 +2015,7 @@ def test_r_install(): r importr except NameError: - th._error_message_and_exit( + th.error_message_and_exit( 'R and rpy2 must be linked during installation.\n\t\tRun ' + '`python -c "from rpy2 import robjects; from ' + 'rpy2.robjects.packages import importr` to identify ' + @@ -2066,7 +2027,7 @@ def test_r_install(): try: importr(str('ggplot2')) except: - th._error_message_and_exit( + th.error_message_and_exit( 'Must have R package ggplot2 installed in order to plot. 
' + 'Run `python -c "from rpy2.robjects.packages import importr; ' + 'importr(str(\'ggplot2\'));"` to identify installation issues.') @@ -2078,13 +2039,13 @@ def test_r_install(): #### Main plotting function #### ################################ -def _plot_main(args): +def plot_main(args): global VERBOSE VERBOSE = not args.quiet th.VERBOSE = VERBOSE ts.VERBOSE = VERBOSE - test_r_install() + test_r_imports() # roc plotting doesn't use read dirs try: @@ -2108,7 +2069,7 @@ def _plot_main(args): if 'num_reads' in args else None),] glocs_opt = [('genome_locs', args.genome_locations if 'genome_locations' in args else None),] - f5dirs2_opt = [('f5_dirs2', args.control_fast5_basedirs + ctrldirs_opt = [('ctrl_fast5s_dirs', args.control_fast5_basedirs if 'control_fast5_basedirs' in args else None),] rdata_opt = [('r_struct_fn', args.r_data_filename if 'r_data_filename' in args else None),] @@ -2127,42 +2088,40 @@ def _plot_main(args): if 'sequences_filename' in args else None),] statfn_opt = [('stats_fn', args.statistics_filename if 'statistics_filename' in args else None),] - covdamp_opt = [('cov_damp_counts', args.coverage_dampen_counts - if 'coverage_dampen_counts' in args else None),] if args.action_command == 'max_coverage': - kwargs = dict(f5dirs2_opt + nreg_opt + nbase_opt + genome_opts + + kwargs = dict(ctrldirs_opt + nreg_opt + nbase_opt + genome_opts + tbmod_opt + atbmod_opt + dtbmod_opt + datbmod_opt) plot_max_coverage(*base_args, **kwargs) elif args.action_command == 'genome_locations': - kwargs = dict(f5dirs2_opt + nbase_opt + genome_opts + glocs_opt + + kwargs = dict(ctrldirs_opt + nbase_opt + genome_opts + glocs_opt + tbmod_opt + atbmod_opt + dtbmod_opt + datbmod_opt) plot_genome_locations(*base_args, **kwargs) elif args.action_command == 'motif_centered': - kwargs = dict(f5dirs2_opt + nreg_opt + nbase_opt + genome_opts + + kwargs = dict(ctrldirs_opt + nreg_opt + nbase_opt + genome_opts + fasta_opt + motif_opt + tbmod_opt + atbmod_opt + dtbmod_opt + datbmod_opt + [('deepest_coverage', args.deepest_coverage),]) plot_motif_centered(*base_args, **kwargs) elif args.action_command == 'max_difference': - kwargs = dict(f5dirs2_opt + nreg_opt + nbase_opt + genome_opts + + kwargs = dict(ctrldirs_opt + nreg_opt + nbase_opt + genome_opts + seqfn_opt) plot_max_diff(*base_args, **kwargs) elif args.action_command == 'most_significant': - kwargs = dict(f5dirs2_opt + nreg_opt + nbase_opt + genome_opts + + kwargs = dict(ctrldirs_opt + nreg_opt + nbase_opt + genome_opts + seqfn_opt + statfn_opt + tbmod_opt + atbmod_opt + - dtbmod_opt + datbmod_opt + covdamp_opt) + dtbmod_opt + datbmod_opt) plot_most_signif(*base_args, **kwargs) elif args.action_command == 'motif_with_stats': - kwargs = dict(f5dirs2_opt + nreg_opt + motif_opt + statfn_opt + + kwargs = dict(ctrldirs_opt + nreg_opt + motif_opt + statfn_opt + tbmod_opt + atbmod_opt + dtbmod_opt + datbmod_opt + - fasta_opt + covdamp_opt + + fasta_opt + [('overplot_thresh', args.overplot_threshold), ('context_width', args.num_context), ('num_stats', args.num_statistics)]) plot_motif_centered_signif(*base_args, **kwargs) elif args.action_command == 'cluster_most_significant': - kwargs = dict(f5dirs2_opt + nreg_opt + nbase_opt + + kwargs = dict(ctrldirs_opt + nreg_opt + nbase_opt + fasta_opt + statfn_opt + rdata_opt + [('num_processes', args.processes), ('slide_span', args.slide_span)]) @@ -2182,11 +2141,12 @@ def _plot_main(args): ('dont_plot', args.dont_plot)]) plot_kmer_dist(*base_args, **kwargs) elif args.action_command == 'roc': - kwargs = dict(fasta_opt 
+ covdamp_opt + + kwargs = dict(fasta_opt + [('pdf_fn', args.pdf_filename), ('motif_descs', args.motif_descriptions), ('stats_fns', args.statistics_filenames), - ('min_reads', args.minimum_test_reads)]) + ('stats_per_block', args.statistics_per_block), + ('total_stats_limit', args.total_statistics_limit)]) plot_roc(**kwargs) elif args.action_command == 'per_read_roc': kwargs = dict(fasta_opt + @@ -2197,12 +2157,12 @@ def _plot_main(args): ('total_stats_limit', args.total_statistics_limit)]) plot_per_read_roc(**kwargs) else: - th._error_message_and_exit('Invalid tombo sub-command entered. ' + + th.error_message_and_exit('Invalid tombo sub-command entered. ' + 'Should have been caught by argparse.') return if __name__ == '__main__': - raise NotImplementedError( - 'This is a module. See commands with `tombo -h`') + sys.stderr.write('This is a module. See commands with `tombo -h`') + sys.exit(1) diff --git a/tombo/_preprocess.py b/tombo/_preprocess.py new file mode 100644 index 0000000..1fca27e --- /dev/null +++ b/tombo/_preprocess.py @@ -0,0 +1,533 @@ +from __future__ import division, unicode_literals, absolute_import + +from builtins import int, range, dict, map, zip + +import io +import os +import re +import sys +import queue + +# Future warning from cython in h5py +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) +import h5py + +from tqdm import tqdm +from time import sleep +from itertools import islice +from multiprocessing import Process, Queue, Pipe + +if sys.version_info[0] > 2: + unicode = str + +from . import tombo_helper as th + + +VERBOSE = False + +_MAX_QUEUE_SIZE = 1000 +_ITER_QUEUE_LIMIT = 1000 +_PROC_UPDATE_INTERVAL = 100 + +_MAX_FASTQ_QUEUE_SIZE = 10000 +_SEQ_SUMMARY_FN_FIELD = 'filename' +_SEQ_SUMMARY_ID_FIELD = 'read_id' + +# warning messages for annotate with fastqs over multiple processes, +# requiring passing warning codes to only print warning once. 
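The warning-code comment above describes a pattern used throughout this module: each worker process remembers which warning codes it has already sent, pushes a code onto a shared queue at most once, and the parent prints each code a single time no matter how many workers hit it. A minimal sketch of that idea with illustrative names (not the module's API):

::

    import multiprocessing as mp

    def worker(warn_q):
        been_warned = {'io': False}
        for _ in range(5):
            # queue only the first occurrence seen in this process
            if not been_warned['io']:
                been_warned['io'] = True
                warn_q.put('io')

    if __name__ == '__main__':
        warn_q = mp.Queue()
        procs = [mp.Process(target=worker, args=(warn_q,)) for _ in range(2)]
        for p in procs: p.start()
        for p in procs: p.join()
        printed = set()
        while not warn_q.empty():   # safe enough after join in this sketch
            code = warn_q.get()
            if code not in printed:
                printed.add(code)
                print('WARNING:', code)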
+_WARN_ID_VAL = 'ids' +_WARN_IO_VAL = 'io' +_WARN_MISMATCH_VAL = 'mismatch' +_WARN_OVRWRT_VAL = 'overwrite' +_WARN_UNIQ_VAL = 'uniq' +_WARN_CODES = (_WARN_ID_VAL, _WARN_IO_VAL, _WARN_MISMATCH_VAL, _WARN_OVRWRT_VAL) +_WARN_CODES_PREP = (_WARN_OVRWRT_VAL, _WARN_UNIQ_VAL) +_WARN_PREFIX = '****** WARNING ****** ' + + +########################## +###### Annotate Raw ###### +########################## + +def _prep_fast5_for_fastq(fast5_data, bc_grp_name, bc_subgrp_name, overwrite): + read_id = th.get_raw_read_slot(fast5_data).attrs.get('read_id') + try: + read_id = read_id.decode() + except (AttributeError, TypeError): + pass + if read_id is None: + return + + # if Analyses group doesn't exist yet, create it + try: + analyses_grp = fast5_data['/Analyses'] + except: + analyses_grp = fast5_data.create_group('Analyses') + + # create Fastq slot, unless value exists and --overwrite is not set + try: + bc_grp = analyses_grp[bc_grp_name] + bc_subgrp = analyses_grp[bc_subgrp_name] + except: + try: + bc_grp = analyses_grp.create_group(bc_grp_name) + bc_subgrp = bc_grp.create_group(bc_subgrp_name) + except: + if overwrite: + del analyses_grp[bc_grp_name] + bc_grp = analyses_grp.create_group(bc_grp_name) + bc_subgrp = bc_grp.create_group(bc_subgrp_name) + else: + raise th.TomboError( + bc_grp_name + ' exists and --overwrite is not set.') + + return read_id + +def _annotate_with_fastqs_worker( + fastq_rec_q, fast5s_read_ids, fastq_slot, fq_slot_prepped, + prog_q, warn_q, bc_grp_name, bc_subgrp_name, overwrite): + been_warned = dict((warn_code, False) for warn_code in _WARN_CODES) + num_recs_proc = 0 + while True: + fastq_rec = fastq_rec_q.get() + if fastq_rec is None: + break + + # extract read_id from fastq (which should be the first text after + # the "@" record delimiter up to the first white space or underscore + read_id = fastq_rec[0].split()[0].split('_')[0][1:] + if read_id not in fast5s_read_ids: + if not been_warned[_WARN_ID_VAL]: + been_warned[_WARN_ID_VAL] = True + warn_q.put(_WARN_ID_VAL) + continue + + try: + with h5py.File(fast5s_read_ids[read_id], 'r+') as fast5_data: + if not fq_slot_prepped: + try: + file_parsed_id = _prep_fast5_for_fastq( + fast5_data, bc_grp_name, bc_subgrp_name, overwrite) + except th.TomboError: + if not been_warned[_WARN_OVRWRT_VAL]: + been_warned[_WARN_OVRWRT_VAL] = True + warn_q.put(_WARN_OVRWRT_VAL) + continue + if read_id != file_parsed_id: + if not been_warned[_WARN_MISMATCH_VAL]: + been_warned[_WARN_MISMATCH_VAL] = True + warn_q.put(_WARN_MISMATCH_VAL) + continue + bc_slot = fast5_data[fastq_slot] + # add sequence to fastq slot + bc_slot.create_dataset( + 'Fastq', data=''.join(fastq_rec), + dtype=h5py.special_dtype(vlen=unicode)) + + # progress q update + num_recs_proc += 1 + if num_recs_proc % _PROC_UPDATE_INTERVAL == 0: + prog_q.put(_PROC_UPDATE_INTERVAL) + except: + if not been_warned[_WARN_IO_VAL]: + been_warned[_WARN_IO_VAL] = True + warn_q.put(_WARN_IO_VAL) + continue + + # add last number of records reported from this process + prog_q.put(num_recs_proc % _PROC_UPDATE_INTERVAL) + + return + +def _feed_seq_records_worker(fastq_fns, fastq_rec_q, num_processes): + for fastq_fn in fastq_fns: + n_recs = 0 + with io.open(fastq_fn) as fastq_fp: + while True: + fastq_rec = list(islice(fastq_fp, 4)) + # if record contains fewer than 4 lines this indicates the + # EOF, so move to next file + if len(fastq_rec) != 4: break + # if sequence identifier line does not start with "@" or quality + # score line does not start with a "+" the file may be + # corrupted, so don't 
process any more records + if (re.match('@', fastq_rec[0]) is None or + re.match('\+', fastq_rec[2]) is None): + # TODO maybe send this as a warning code to avoid poorly + # formatted output + th.warning_message( + 'Successfully parsed ' + unicode(n_recs) + + ' FASTQ records from ' + fastq_fn + ' before ' + + 'encountering an invalid record. The rest of ' + + 'this file will not be processed.') + break + n_recs += 1 + fastq_rec_q.put(fastq_rec) + + # put none records to trigger annotation processes to exit + for _ in range(num_processes): + fastq_rec_q.put(None) + + return + +def _get_ann_queues(prog_q, warn_q, num_read_ids, wp_conn): + if VERBOSE: bar = tqdm(total=num_read_ids, smoothing=0) + been_warned = dict((warn_code, False) for warn_code in _WARN_CODES) + + def update_warn(warn_val): + if warn_val == _WARN_ID_VAL: + if VERBOSE and not been_warned[_WARN_ID_VAL]: + bar.write( + _WARN_PREFIX + 'Some FASTQ records contain read ' + + 'identifiers not found in any FAST5 files or ' + + 'sequencing summary files.', + file=sys.stderr) + been_warned[_WARN_ID_VAL] = True + elif warn_val == _WARN_IO_VAL: + if VERBOSE and not been_warned[_WARN_IO_VAL]: + bar.write( + _WARN_PREFIX + 'Some read files could not be accessed.', + file=sys.stderr) + been_warned[_WARN_IO_VAL] = True + elif warn_val == _WARN_MISMATCH_VAL: + if VERBOSE and not been_warned[_WARN_MISMATCH_VAL]: + bar.write( + _WARN_PREFIX + 'Read ID(s) found in sequencing summary ' + + 'and FAST5 file are discordant. Skipping such reads.', + file=sys.stderr) + been_warned[_WARN_MISMATCH_VAL] = True + elif warn_val == _WARN_OVRWRT_VAL: + if VERBOSE and not been_warned[_WARN_OVRWRT_VAL]: + bar.write( + _WARN_PREFIX + 'Basecalls exist in specified slot for ' + + 'some reads. Set --overwrite option to overwrite these ' + + 'basecalls.', file=sys.stderr) + been_warned[_WARN_OVRWRT_VAL] = True + else: + if VERBOSE: bar.write( + _WARN_PREFIX + 'Invalid warning code encountered.', + file=sys.stderr) + + return + + + total_added_seqs = 0 + while True: + try: + iter_added = prog_q.get(block=False) + total_added_seqs += iter_added + if VERBOSE: bar.update(iter_added) + except queue.Empty: + try: + warn_val = warn_q.get(block=False) + update_warn(warn_val) + except queue.Empty: + sleep(0.1) + # check if main thread has finished with all fastq records + if wp_conn.poll(): + break + + # collect all remaining warn and progress values + while not prog_q.empty(): + iter_added = prog_q.get(block=False) + total_added_seqs += iter_added + if VERBOSE: bar.update(iter_added) + while not warn_q.empty(): + warn_val = warn_q.get(block=False) + update_warn(warn_val) + + if VERBOSE: + bar.close() + th.status_message('Added sequences to a total of ' + + str(total_added_seqs) + ' reads.') + if total_added_seqs < num_read_ids: + th.warning_message( + 'Not all read ids from FAST5s or sequencing summary files ' + + 'were found in FASTQs.\n\t\tThis can result from reads that ' + + 'failed basecalling or if full sets of FAST5s/sequence ' + + 'summaries are not processed with full sets of FASTQs.') + + return + +def _annotate_with_fastqs( + fastq_fns, fast5s_read_ids, fastq_slot, fq_slot_prepped, num_processes, + bc_grp_name, bc_subgrp_name, overwrite): + if VERBOSE: th.status_message('Annotating FAST5s with sequence from FASTQs.') + fastq_rec_q = Queue(maxsize=_MAX_FASTQ_QUEUE_SIZE) + prog_q = Queue() + warn_q = Queue() + + # open a single process to read fastq files and feed the fastq record queue + fq_feed_p = Process(target=_feed_seq_records_worker, 
args=(fastq_fns, fastq_rec_q, num_processes)) + fq_feed_p.daemon = True + fq_feed_p.start() + + # open fast5 annotation processes + ann_args = (fastq_rec_q, fast5s_read_ids, fastq_slot, fq_slot_prepped, + prog_q, warn_q, bc_grp_name, bc_subgrp_name, overwrite) + ann_ps = [] + for p_id in range(num_processes): + ann_p = Process(target=_annotate_with_fastqs_worker, args=ann_args) + ann_p.daemon = True + ann_p.start() + ann_ps.append(ann_p) + + main_wp_conn, wp_conn = Pipe() + warn_prog_p = Process(target=_get_ann_queues, + args=(prog_q, warn_q, len(fast5s_read_ids), wp_conn)) + warn_prog_p.daemon = True + warn_prog_p.start() + + fq_feed_p.join() + for ann_p in ann_ps: + ann_p.join() + # send signal to warn/progress queue that all other processes are complete + main_wp_conn.send(True) + warn_prog_p.join() + + return + + +########################## +#### Extract read_ids #### +########################## + +def _get_prep_queue(read_ids_q, prog_q, warn_q, gp_conn, num_fast5s): + """Process all records from all fast5 prep queues + """ + ovrwrt_mess = ( + _WARN_PREFIX + 'Basecalls exist in specified slot for some ' + + 'reads. Set --overwrite option to overwrite these basecalls.') + fast5s_read_ids = {} + # Warn about non-unique read_ids in the directory + been_warned = dict((warn_code, False) for warn_code in _WARN_CODES_PREP) + if VERBOSE: bar = tqdm(total=num_fast5s, smoothing=0) + + while True: + try: + read_id, fast5_fn = read_ids_q.get(block=False) + if read_id in fast5s_read_ids: + if VERBOSE and not been_warned[_WARN_UNIQ_VAL]: + bar.write( + _WARN_PREFIX + 'Multiple FAST5 files contain the ' + + 'same read ID. Ensure that FAST5 files are from a ' + + 'single run.', file=sys.stderr) + been_warned[_WARN_UNIQ_VAL] = True + continue + fast5s_read_ids[read_id] = fast5_fn + except queue.Empty: + try: + warn_val = warn_q.get(block=False) + if warn_val == _WARN_OVRWRT_VAL: + if VERBOSE and not been_warned[_WARN_OVRWRT_VAL]: + bar.write(ovrwrt_mess, file=sys.stderr) + been_warned[_WARN_OVRWRT_VAL] = True + else: + bar.write(_WARN_PREFIX + 'Invalid warning code encountered.', + file=sys.stderr) + except queue.Empty: + try: + if VERBOSE: bar.update(prog_q.get(block=False)) + except queue.Empty: + sleep(0.1) + # check if main thread has finished with all FAST5s + if gp_conn.poll(): + break + + while not read_ids_q.empty(): + read_id, fast5_fn = read_ids_q.get(block=False) + fast5s_read_ids[read_id] = fast5_fn + while not warn_q.empty(): + warn_val = warn_q.get(block=False) + if warn_val == _WARN_OVRWRT_VAL: + if VERBOSE and not been_warned[_WARN_OVRWRT_VAL]: + bar.write(ovrwrt_mess, file=sys.stderr) + been_warned[_WARN_OVRWRT_VAL] = True + else: + bar.write(_WARN_PREFIX + 'Invalid warning code encountered.', + file=sys.stderr) + while not prog_q.empty(): + if VERBOSE: bar.update(prog_q.get(block=False)) + + if VERBOSE: bar.close() + gp_conn.send(fast5s_read_ids) + + return + +def _prep_fastq_slot_worker( + fast5_q, bc_grp, bc_subgrp, overwrite, read_ids_q, prog_q, warn_q): + num_files_proc = 0 + been_warned_overwrite = False + while True: + try: + fast5_fn = fast5_q.get(block=False) + except queue.Empty: + sleep(0.1) + continue + + if fast5_fn is None: + break + + num_files_proc += 1 + if num_files_proc % _PROC_UPDATE_INTERVAL == 0: + prog_q.put(_PROC_UPDATE_INTERVAL) + + try: + with h5py.File(fast5_fn) as fast5_data: + try: + read_id = _prep_fast5_for_fastq( + fast5_data, bc_grp, bc_subgrp, overwrite) + except th.TomboError: + # avoid the warn queue getting too large by not sending + # overwrite warnings for each 
read from each thread + if not been_warned_overwrite: + been_warned_overwrite = True + warn_q.put(_WARN_OVRWRT_VAL) + continue + except: + continue + if read_id is None: + continue + + read_ids_q.put((read_id, fast5_fn)) + + prog_q.put(num_files_proc % _PROC_UPDATE_INTERVAL) + + return + +def _fill_files_queue(fast5_q, fast5_fns, num_ps): + for fast5_fn in fast5_fns: + fast5_q.put(fast5_fn) + for _ in range(num_ps): + fast5_q.put(None) + + return + +def _get_read_ids_and_prep_fastq_slot( + fast5s_dir, bc_grp, bc_subgrp, overwrite, num_processes): + """Extract read id from /Raw group and prep fastq slots for annotation with + associated FASTQ files. + """ + if VERBOSE: th.status_message( + 'Preparing reads and extracting read identifiers.') + fast5_q = Queue(maxsize=_MAX_QUEUE_SIZE) + read_ids_q = Queue() + prog_q = Queue() + warn_q = Queue() + + fast5_fns = th.get_files_list(fast5s_dir) + files_p = Process(target=_fill_files_queue, + args=(fast5_q, fast5_fns, num_processes)) + files_p.daemon = True + files_p.start() + + prep_args = (fast5_q, bc_grp, bc_subgrp, overwrite, read_ids_q, + prog_q, warn_q) + prep_ps = [] + for p_id in range(num_processes): + prep_p = Process(target=_prep_fastq_slot_worker, args=prep_args) + prep_p.daemon = True + prep_p.start() + prep_ps.append(prep_p) + + main_gp_conn, gp_conn = Pipe() + get_prep_p = Process( + target=_get_prep_queue, + args=(read_ids_q, prog_q, warn_q, gp_conn, len(fast5_fns))) + get_prep_p.daemon = True + get_prep_p.start() + + # join all processes into the main thread + files_p.join() + for prep_p in prep_ps: + prep_p.join() + # send signal to get_prep queue that all other processes are complete + main_gp_conn.send(True) + fast5s_read_ids = main_gp_conn.recv() + + return fast5s_read_ids + +def _parse_sequencing_summary_files(fast5s_dir, seq_summary_fns): + if VERBOSE: th.status_message('Getting read filenames.') + full_fast5_fns = {} + # walk through directory structure searching for fast5 files + for root, _, fns in os.walk(fast5s_dir): + for fn in fns: + if not fn.endswith('.fast5'): continue + full_fast5_fns[fn] = os.path.join(root, fn) + + if VERBOSE: th.status_message('Parsing sequencing summary files.') + fast5s_read_ids = {} + been_warned = False + for seq_summary_fn in seq_summary_fns: + with open(seq_summary_fn) as fp: + try: + header_fields = fp.readline().split() + fn_field = next(i for i, h_field in enumerate(header_fields) + if re.match(_SEQ_SUMMARY_FN_FIELD, h_field)) + id_field = next(i for i, h_field in enumerate(header_fields) + if re.match(_SEQ_SUMMARY_ID_FIELD, h_field)) + except: + th.warning_message( + 'Could not extract header information for sequencing ' + + 'summary file: ' + seq_summary_fn) + continue + try: + for line in fp: + rec_fields = line.split() + rec_short_fn = rec_fields[fn_field] + try: + rec_full_fn = full_fast5_fns[rec_short_fn] + except KeyError: + if not been_warned: + th.warning_message( + 'Some records from sequencing summary files ' + + 'do not appear to have a matching FAST5 file.') + been_warned = True + continue + # convert filename to full filename and link to read id + fast5s_read_ids[rec_fields[id_field]] = rec_full_fn + except: + th.warning_message( + 'Error parsing records for sequencing ' + + 'summary file: ' + seq_summary_fn) + + return fast5s_read_ids + + +################################## +###### Annotate FAST5s Main ###### +################################## + +def annotate_reads_with_fastq_main(args): + global VERBOSE + VERBOSE = not args.quiet + th.VERBOSE = VERBOSE + + fast5s_basedir = 
( + args.fast5_basedir if args.fast5_basedir.endswith('/') else + args.fast5_basedir + '/') + if args.sequencing_summary_filenames: + fast5s_read_ids = _parse_sequencing_summary_files( + fast5s_basedir, args.sequencing_summary_filenames) + fq_slot_prepped = False + else: + fast5s_read_ids = _get_read_ids_and_prep_fastq_slot( + fast5s_basedir, args.basecall_group, args.basecall_subgroup, + args.overwrite, args.processes) + fq_slot_prepped = True + fastq_slot = '/'.join(('/Analyses', args.basecall_group, + args.basecall_subgroup)) + _annotate_with_fastqs( + args.fastq_filenames, fast5s_read_ids, fastq_slot, fq_slot_prepped, + args.processes, args.basecall_group, args.basecall_subgroup, + args.overwrite) + + return + + +if __name__ == '__main__': + sys.stderr.write('This is a module. See commands with `tombo -h`') + sys.exit(1) diff --git a/tombo/_text_output_commands.py b/tombo/_text_output_commands.py index 987e12b..881b1af 100644 --- a/tombo/_text_output_commands.py +++ b/tombo/_text_output_commands.py @@ -22,16 +22,41 @@ OUT_HEADER='track type={0} name="{1}_{2}_{3}{4}" ' + \ 'description="{1} {2} {3}{5}"\n' -OUT_TYPES = {'wig':'wiggle_0', 'bedgraph':'bedGraph'} -GROUP1_NAME='sample' -GROUP2_NAME='control' +BG_TYPE = 'bedgraph' +WIG_TYPE = 'wig' +OUT_TYPES = {WIG_TYPE:'wiggle_0', BG_TYPE:'bedGraph'} +GROUP_NAME='sample' +CTRL_NAME='control' + +COV_WIG_TYPE = 'coverage' + +# event table slot values +SIG_SLOT = 'norm_mean' +SD_SLOT = 'norm_stdev' +DWELL_SLOT = 'length' +SIG_WIG_TYPE = 'signal' +DIFF_WIG_TYPE = 'difference' +SD_WIG_TYPE = 'signal_sd' +DWELL_WIG_TYPE = 'dwell' + +# stat table slot values +POS_SLOT = 'pos' +FRAC_SLOT = 'frac' +DFRAC_SLOT = 'damp_frac' +VCOV_SLOT = 'valid_cov' +FRAC_WIG_TYPE = 'fraction' +DFRAC_WIG_TYPE = 'dampened_fraction' +VCOV_WIG_TYPE = 'valid_coverage' +FRAC_WIG_NAME = 'fraction_modified_reads' +DFRAC_WIG_NAME = 'dampened_fraction_modified_reads' +VCOV_WIG_NAME = 'valid_coverage' ######################## ###### WIG Output ###### ######################## -def open_browser_files(wig_base, group_text, type_name, out_type='wig'): +def open_browser_files(wig_base, group_text, type_name, out_type=WIG_TYPE): group_w_dot = '' if group_text == '' else '.' 
+ group_text group_w_us = '' if group_text == '' else '_' + group_text group_w_space = '' if group_text == '' else ' ' + group_text @@ -63,61 +88,66 @@ def _write_cs_int_data(wig_fp, chrm, cs_poss, cs_vals): return def write_frac_wigs(all_stats, wig_base, do_frac, do_damp, do_valid_cov): - if VERBOSE: th._status_message( - 'Parsing and outputting statistics wiggles.') + if VERBOSE: th.status_message('Parsing and outputting statistics wiggles.') if do_frac: plus_frac_fp, minus_frac_fp = open_browser_files( - wig_base, '', 'fraction_modified_reads') + wig_base, '', FRAC_WIG_NAME) if do_damp: plus_damp_fp, minus_damp_fp = open_browser_files( - wig_base, '', 'dampened_fraction_modified_reads') + wig_base, '', DFRAC_WIG_NAME) if do_valid_cov: plus_vcov_fp, minus_vcov_fp = open_browser_files( - wig_base, '', 'valid_coverage') + wig_base, '', VCOV_WIG_NAME) (curr_chrm, curr_strand, curr_poss, curr_fracs, curr_damp_fracs, curr_valid_cov) = (None, None, [], [], [], []) - all_stats.order_by_pos() - for chrm, strand, pos, frac, damp_frac, valid_cov in all_stats.iter_fracs(): + for chrm, strand, start, end, block_stats in all_stats: if chrm != curr_chrm or strand != curr_strand: if len(curr_poss) > 0: + curr_poss = np.concatenate(curr_poss) # write current chrm/strand data if do_frac: wig_fp = plus_frac_fp if curr_strand == '+' else minus_frac_fp - _write_cs_data(wig_fp, curr_chrm, curr_poss, curr_fracs) + _write_cs_data(wig_fp, curr_chrm, curr_poss, + np.concatenate(curr_fracs)) if do_damp: wig_fp = plus_damp_fp if curr_strand == '+' else minus_damp_fp - _write_cs_data(wig_fp, curr_chrm, curr_poss, curr_damp_fracs) + _write_cs_data(wig_fp, curr_chrm, curr_poss, + np.concatenate(curr_damp_fracs)) if do_valid_cov: wig_fp = plus_vcov_fp if curr_strand == '+' else minus_vcov_fp - _write_cs_int_data( - wig_fp, curr_chrm, curr_poss, curr_valid_cov) + _write_cs_int_data(wig_fp, curr_chrm, curr_poss, + np.concatenate(curr_valid_cov)) # set new chrm and strand and empty lists curr_chrm, curr_strand = chrm, strand curr_poss, curr_fracs, curr_damp_fracs, curr_valid_cov = ( [], [], [], []) - # store position statistics - curr_poss.append(pos) + # store block statistics + curr_poss.append(block_stats[POS_SLOT]) if do_frac: - curr_fracs.append(1 - frac) + curr_fracs.append(1 - block_stats[FRAC_SLOT]) if do_damp: - curr_damp_fracs.append(1 - damp_frac) + curr_damp_fracs.append(1 - block_stats[DFRAC_SLOT]) if do_valid_cov: - curr_valid_cov.append(valid_cov) + curr_valid_cov.append(block_stats[VCOV_SLOT]) # write last chrm/strand data if len(curr_poss) > 0: + curr_poss = np.concatenate(curr_poss) if do_frac: wig_fp = plus_frac_fp if curr_strand == '+' else minus_frac_fp - _write_cs_data(wig_fp, curr_chrm, curr_poss, curr_fracs) + _write_cs_data(wig_fp, curr_chrm, curr_poss, + np.concatenate(curr_fracs)) if do_damp: wig_fp = plus_damp_fp if curr_strand == '+' else minus_damp_fp - _write_cs_data(wig_fp, curr_chrm, curr_poss, curr_damp_fracs) + _write_cs_data(wig_fp, curr_chrm, curr_poss, + np.concatenate(curr_damp_fracs)) if do_valid_cov: wig_fp = plus_vcov_fp if curr_strand == '+' else minus_vcov_fp - _write_cs_int_data(wig_fp, curr_chrm, curr_poss, curr_valid_cov) + _write_cs_int_data(wig_fp, curr_chrm, curr_poss, + np.concatenate(curr_valid_cov)) if do_frac: plus_frac_fp.close() @@ -136,31 +166,13 @@ def filter_cs_nans(cs_vals): valid_vals = cs_vals[valid_poss] return valid_poss, valid_vals -def write_length_wig( - raw_read_coverage, chrm_sizes, wig_base, group_name): - if VERBOSE: th._status_message('Parsing and 
outputting ' + group_name + - ' dwell times.') - plus_dwell_fp, minus_dwell_fp = open_browser_files( - wig_base, group_name, 'dwell') +def write_slot_mean_wig( + reads_index, chrm_sizes, wig_base, group_name, wig_type, slot_name): + if VERBOSE: th.status_message( + 'Parsing and outputting ' + group_name + ' ' + wig_type + '.') + plus_sd_fp, minus_sd_fp = open_browser_files(wig_base, group_name, wig_type) for chrm, strand, cs_vals in th.iter_mean_slot_values( - raw_read_coverage, chrm_sizes, 'length'): - dwell_fp = plus_dwell_fp if strand == '+' else minus_dwell_fp - cs_poss, cs_vals = filter_cs_nans(cs_vals) - _write_cs_data(dwell_fp, chrm, cs_poss, cs_vals) - - plus_dwell_fp.close() - minus_dwell_fp.close() - - return - -def write_signal_sd_wig( - raw_read_coverage, chrm_sizes, wig_base, group_name): - if VERBOSE: th._status_message('Parsing and outputting ' + group_name + - ' signal SDs.') - plus_sd_fp, minus_sd_fp = open_browser_files( - wig_base, group_name, 'signal_sd') - for chrm, strand, cs_vals in th.iter_mean_slot_values( - raw_read_coverage, chrm_sizes, 'norm_stdev'): + reads_index, chrm_sizes, slot_name): sd_fp = plus_sd_fp if strand == '+' else minus_sd_fp cs_poss, cs_vals = filter_cs_nans(cs_vals) _write_cs_data(sd_fp, chrm, cs_poss, cs_vals) @@ -171,25 +183,25 @@ def write_signal_sd_wig( return def write_signal_and_diff_wigs( - raw_read_coverage1, raw_read_coverage2, chrm_sizes, - wig_base, group1_name, write_sig, write_diff): - if VERBOSE: th._status_message( + reads_index, ctrl_reads_index, chrm_sizes, + wig_base, group_name, write_sig, write_diff): + if VERBOSE: th.status_message( 'Parsing and outputting signal means and differences.') # open all file pointers if write_sig: plus_sig1_fp, minus_sig1_fp = open_browser_files( - wig_base, group1_name, 'signal') - if raw_read_coverage2 is not None: + wig_base, group_name, SIG_WIG_TYPE) + if ctrl_reads_index is not None: plus_sig2_fp, minus_sig2_fp = open_browser_files( - wig_base, GROUP2_NAME, 'signal') + wig_base, CTRL_NAME, SIG_WIG_TYPE) if write_diff: plus_diff_fp, minus_diff_fp = open_browser_files( - wig_base, '', 'difference') + wig_base, '', DIFF_WIG_TYPE) # iterate over mean signal values for all chrm/strand combinations with # coverage in either sample. 
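# For illustration (hypothetical values), each iteration below yields one
# chromosome/strand worth of per-position mean slot values, e.g.:
#   ('chr1', '+', array([0.51, nan, -0.32, ...]), array([0.48, nan, -0.30, ...]))
# where NaN marks positions without coverage; such positions are dropped
# downstream via filter_cs_nans.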
None returned if one sample is not covered for chrm, strand, cs_sig_means1, cs_sig_means2 in th.iter_mean_slot_values( - raw_read_coverage1, chrm_sizes, 'norm_mean', raw_read_coverage2): + reads_index, chrm_sizes, SIG_SLOT, ctrl_reads_index): # compute valid positions since it will either be used here for signal # output or for diff below # note small wasted effort for diff only output when second sample @@ -219,13 +231,13 @@ def write_signal_and_diff_wigs( return -def write_cov_wig(raw_read_coverage, out_base, group_text): - if VERBOSE: th._status_message('Getting and writing ' + group_text + - ' coverage bedgraphs.') +def write_cov_wig(reads_index, out_base, group_text): + if VERBOSE: th.status_message('Getting and writing ' + group_text + + ' coverage bedgraphs.') plus_bg_fp, minus_bg_fp = open_browser_files( - out_base, group_text, 'coverage', 'bedgraph') - for chrm, strand, cs_cov, cs_cov_starts in th.get_coverage_regions( - raw_read_coverage): + out_base, group_text, COV_WIG_TYPE, BG_TYPE) + for (chrm, strand, cs_cov, + cs_cov_starts) in reads_index.iter_coverage_regions(): # extract only values from each region and convert to str cs_cov = np.char.mod('%d', cs_cov) cs_cov_starts = np.char.mod('%d', cs_cov_starts) @@ -242,62 +254,60 @@ def write_cov_wig(raw_read_coverage, out_base, group_text): return def write_all_browser_files( - f5_dirs1, f5_dirs2, corr_grp, bc_subgrps, - stats_fn, wig_base, wig_types, cov_damp_counts): - if f5_dirs1 is not None: - raw_read_coverage1 = th.parse_fast5s( - f5_dirs1, corr_grp, bc_subgrps, sample_name='sample') - if len(raw_read_coverage1) == 0: - th._error_message_and_exit( - 'No reads present in --fast5-basedirs.') - - group1_name = '' if f5_dirs2 is None else GROUP1_NAME - if f5_dirs2 is not None: - raw_read_coverage2 = th.parse_fast5s( - f5_dirs2, corr_grp, bc_subgrps, sample_name='control') - chrm_sizes = th.get_chrm_sizes( - raw_read_coverage1, raw_read_coverage2) - - if 'coverage' in wig_types: - write_cov_wig(raw_read_coverage2, wig_base, GROUP2_NAME) - if 'signal_sd' in wig_types: - write_signal_sd_wig( - raw_read_coverage2, chrm_sizes, wig_base, GROUP2_NAME) - if 'dwell' in wig_types: - write_length_wig(raw_read_coverage2, chrm_sizes, - wig_base, GROUP2_NAME) + fast5s_dirs, ctrl_fast5s_dirs, corr_grp, bc_subgrps, + stats_fn, wig_base, wig_types): + if fast5s_dirs is not None: + reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) + if reads_index.is_empty(): + th.error_message_and_exit('No reads present in --fast5-basedirs.') + + group_name = '' if ctrl_fast5s_dirs is None else GROUP_NAME + if ctrl_fast5s_dirs is not None: + ctrl_reads_index = th.TomboReads(ctrl_fast5s_dirs, corr_grp, bc_subgrps) + chrm_sizes = th.get_chrm_sizes(reads_index, ctrl_reads_index) + + if COV_WIG_TYPE in wig_types: + write_cov_wig(ctrl_reads_index, wig_base, CTRL_NAME) + if SD_WIG_TYPE in wig_types: + write_slot_mean_wig( + ctrl_reads_index, chrm_sizes, wig_base, CTRL_NAME, + SD_WIG_TYPE, SD_SLOT) + if DWELL_WIG_TYPE in wig_types: + write_slot_mean_wig( + ctrl_reads_index, chrm_sizes, wig_base, CTRL_NAME, + DWELL_WIG_TYPE, DWELL_SLOT) # need to do signal and difference call once either with or # w/o second set of files (unlike coverage, sds and length - if 'signal' in wig_types or 'difference' in wig_types: + if SIG_WIG_TYPE in wig_types or DIFF_WIG_TYPE in wig_types: write_signal_and_diff_wigs( - raw_read_coverage1, raw_read_coverage2, chrm_sizes, - wig_base, group1_name, 'signal' in wig_types, - 'difference' in wig_types) - elif f5_dirs1 is not None: - 
chrm_sizes = th.get_chrm_sizes(raw_read_coverage1) - if 'signal' in wig_types: + reads_index, ctrl_reads_index, chrm_sizes, + wig_base, group_name, SIG_WIG_TYPE in wig_types, + DIFF_WIG_TYPE in wig_types) + elif fast5s_dirs is not None: + chrm_sizes = th.get_chrm_sizes(reads_index) + if SIG_WIG_TYPE in wig_types: write_signal_and_diff_wigs( - raw_read_coverage1, None, chrm_sizes, wig_base, - group1_name, 'signal' in wig_types, False) - - if 'coverage' in wig_types: - write_cov_wig(raw_read_coverage1, wig_base, group1_name) - if 'signal_sd' in wig_types: - write_signal_sd_wig( - raw_read_coverage1, chrm_sizes, wig_base, group1_name) - if 'dwell' in wig_types: - write_length_wig(raw_read_coverage1, chrm_sizes, wig_base, group1_name) + reads_index, None, chrm_sizes, wig_base, + group_name, SIG_WIG_TYPE in wig_types, False) + + if COV_WIG_TYPE in wig_types: + write_cov_wig(reads_index, wig_base, group_name) + if SD_WIG_TYPE in wig_types: + write_slot_mean_wig( + reads_index, chrm_sizes, wig_base, group_name, SD_WIG_TYPE, SD_SLOT) + if DWELL_WIG_TYPE in wig_types: + write_slot_mean_wig( + reads_index, chrm_sizes, wig_base, group_name, + DWELL_WIG_TYPE, DWELL_SLOT) if any(wig_type in wig_types for wig_type in ( - 'fraction', 'dampened_fraction', 'valid_coverge')): - if VERBOSE: th._status_message('Loading statistics from file.') + FRAC_WIG_TYPE, DFRAC_WIG_TYPE, VCOV_WIG_TYPE)): + if VERBOSE: th.status_message('Loading statistics from file.') all_stats = ts.TomboStats(stats_fn) - if 'dampened_fraction' in wig_types: - all_stats.calc_damp_fraction(cov_damp_counts) write_frac_wigs(all_stats, wig_base, - 'fraction' in wig_types, - 'dampened_fraction' in wig_types, - 'valid_coverage' in wig_types) + FRAC_WIG_TYPE in wig_types, + DFRAC_WIG_TYPE in wig_types, + VCOV_WIG_TYPE in wig_types) return @@ -307,33 +317,28 @@ def write_all_browser_files( ########################## def write_most_signif( - f5_dirs, fasta_fn, num_regions, corr_grp, bc_subgrps, seqs_fn, - num_bases, stats_fn, cov_damp_counts): - if VERBOSE: th._status_message('Loading statistics from file.') + fast5s_dirs, fasta_fn, num_regions, corr_grp, bc_subgrps, seqs_fn, + num_bases, stats_fn): + if VERBOSE: th.status_message('Loading statistics from file.') plot_intervals = ts.TomboStats(stats_fn).get_most_signif_regions( - num_bases, num_regions, cov_damp_counts=cov_damp_counts) + num_bases, num_regions, prepend_loc_to_text=True) # get each regions sequence either from reads or fasta index if fasta_fn is None: - raw_read_coverage = th.parse_fast5s(f5_dirs, corr_grp, bc_subgrps) - all_reg_data = th.get_region_sequences( - plot_intervals, raw_read_coverage) + reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps) + for p_int in plot_intervals: + p_int.add_reads(reads_index).add_seq() else: genome_index = th.Fasta(fasta_fn) - all_reg_data = [ - int_i._replace( - seq=genome_index.get_seq(int_i.chrm, int_i.start, int_i.end)) - for int_i in plot_intervals if int_i.chrm in genome_index] + for p_int in plot_intervals: + p_int.add_seq(genome_index) - if VERBOSE: th._status_message('Outputting region seqeuences.') + if VERBOSE: th.status_message('Outputting region sequences.') with io.open(seqs_fn, 'wt') as seqs_fp: - for int_i in all_reg_data: - reg_seq = int_i.seq - if int_i.strand == '-': - reg_seq = th.rev_comp(reg_seq) - seqs_fp.write('>{0}:{1:d}:{2} {3}\n{4}\n'.format( - int_i.chrm, int(int_i.start + (num_bases // 2)), - int_i.strand, int_i.reg_text, ''.join(reg_seq))) + for p_int in plot_intervals: + reg_seq = (p_int.seq if p_int.strand 
== '+' else + th.rev_comp(p_int.seq)) + seqs_fp.write('>{0}\n{1}\n'.format(p_int.reg_text, ''.join(reg_seq))) return @@ -349,39 +354,33 @@ def _browser_files_main(args): ts.VERBOSE = VERBOSE if (any(data_type in args.file_types - for data_type in ['signal', 'difference', 'coverage', - 'signal_sd', 'dwell']) and + for data_type in [SIG_WIG_TYPE, DIFF_WIG_TYPE, COV_WIG_TYPE, + SD_WIG_TYPE, DWELL_WIG_TYPE]) and args.fast5_basedirs is None): - th._error_message_and_exit( + th.error_message_and_exit( 'Must provide a fast5 basedir to output signal, difference, ' + 'coverage, signal_sd and/or length browser files.') if (any(wig_type in args.file_types for wig_type in ( - 'fraction', 'dampened_fraction', 'valid_coverage')) and + FRAC_WIG_TYPE, DFRAC_WIG_TYPE, VCOV_WIG_TYPE)) and args.statistics_filename is None): - th._error_message_and_exit( + th.error_message_and_exit( 'Must provide a statistics filename to output ' + 'fraction or valid coverage browser files.') - if ('difference' in args.file_types and + if (DIFF_WIG_TYPE in args.file_types and args.control_fast5_basedirs is None): - th._error_message_and_exit( + th.error_message_and_exit( 'Must provide two sets of FAST5s ' + \ 'to output difference wiggle files.') if (args.control_fast5_basedirs is not None and args.fast5_basedirs is None): - th._error_message_and_exit( + th.error_message_and_exit( 'Cannot provide a control FAST5 set of directories ' + 'without a sample set of FAST5 directories.') - if (args.coverage_dampen_counts is None and - 'dampened_fraction' in args.file_types): - th._error_message_and_exit( - 'Cannot compute dampened fractions without ' + - '--coverage-dampened-counts values.') write_all_browser_files( args.fast5_basedirs, args.control_fast5_basedirs, args.corrected_group, args.basecall_subgroups, args.statistics_filename, - args.browser_file_basename, args.file_types, - args.coverage_dampen_counts) + args.browser_file_basename, args.file_types) return @@ -392,17 +391,17 @@ def _write_signif_diff_main(args): ts.VERBOSE = VERBOSE if args.fast5_basedirs is None and args.genome_fasta is None: - th._error_message_and_exit( + th.error_message_and_exit( 'Must provide either FAST5 directory(ies) or a fasta file.') write_most_signif( args.fast5_basedirs, args.genome_fasta, args.num_regions, args.corrected_group, args.basecall_subgroups, args.sequences_filename, - args.num_bases, args.statistics_filename, args.coverage_dampen_counts) + args.num_bases, args.statistics_filename) return if __name__ == '__main__': - raise NotImplementedError( - 'This is a module. See commands with `tombo -h`') + sys.stderr.write('This is a module. 
See commands with `tombo -h`') + sys.exit(1) diff --git a/tombo/_version.py b/tombo/_version.py index bb7fa68..62fdcfc 100644 --- a/tombo/_version.py +++ b/tombo/_version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -TOMBO_VERSION = '1.3' +TOMBO_VERSION = '1.4' diff --git a/tombo/dynamic_programming.py b/tombo/dynamic_programming.py deleted file mode 100644 index 80523ba..0000000 --- a/tombo/dynamic_programming.py +++ /dev/null @@ -1,71 +0,0 @@ -from __future__ import unicode_literals, absolute_import - -from builtins import int, range, dict - -import numpy as np -np.seterr(all='raise') - -from .c_dynamic_programming import c_base_forward_pass, c_base_traceback - -def forward_pass(reg_z_scores, min_obs_per_base): - # dynamic programming algorithm to find modeled signal to base assignment - - # fill banded path with cumulative probabilties from the previous signal - # either in the current base or the previous base (left or diagonal left - # from associated plotting) - - # get the first row data - prev_b_data, (prev_b_start, prev_b_end) = reg_z_scores[0] - prev_b_fwd_data = np.cumsum(prev_b_data) - # store number of observations since last diagonal at each position - # - forces forward pass to allow legal traceback paths while - # enforcing the minimum observations per base threshold - # - should also help from optimization pushing poor fitting bases - # to assign only an observation or two - # - will also use this data to traceback all reasonable paths - prev_b_last_diag = np.ones(prev_b_end - prev_b_start, - dtype=np.int64) * min_obs_per_base - # first row is just a cumsum since there is no previous row - reg_fwd_scores = [(prev_b_fwd_data, prev_b_last_diag, - (prev_b_start, prev_b_end))] - - for b_data, (b_start, b_end) in reg_z_scores[1:]: - b_fwd_data, prev_b_last_diag = c_base_forward_pass( - b_data, b_start, b_end, - prev_b_data, prev_b_start, prev_b_end, - prev_b_fwd_data, prev_b_last_diag, min_obs_per_base) - - # consider storing data to form traceback in one go at the - # end of this loop - reg_fwd_scores.append(( - b_fwd_data, prev_b_last_diag, (b_start, b_end))) - prev_b_data, prev_b_fwd_data, prev_b_start, prev_b_end = ( - b_data, b_fwd_data, b_start, b_end) - - return reg_fwd_scores - -def traceback(reg_fwd_scores, min_obs_per_base): - # traceback along maximally likely path - - # initilize array to store new segments - new_segs = np.empty(len(reg_fwd_scores) - 1, dtype=np.int64) - # get first two bases of data for lookups - curr_base_sig = 1 - curr_b_data, _, (curr_start, curr_end) = reg_fwd_scores[-1] - next_b_data, _, (next_start, next_end) = reg_fwd_scores[-2] - new_segs[-1] = c_base_traceback( - curr_b_data, curr_start, next_b_data, next_start, next_end, - curr_end - 1, min_obs_per_base) - for base_pos in range(len(reg_fwd_scores) - 3, -1, -1): - curr_b_data, curr_start = next_b_data, next_start - next_b_data, _, (next_start, next_end) = reg_fwd_scores[base_pos] - new_segs[base_pos] = c_base_traceback( - curr_b_data, curr_start, next_b_data, next_start, next_end, - new_segs[base_pos+1] - 1, min_obs_per_base) - - return new_segs - - -if __name__ == '__main__': - raise NotImplementedError( - 'This is a module. 
See commands with `tombo -h`') diff --git a/tombo/resquiggle.py b/tombo/resquiggle.py index edf2315..7bfb473 100644 --- a/tombo/resquiggle.py +++ b/tombo/resquiggle.py @@ -6,10 +6,20 @@ import io import re import sys -import mappy import queue +import traceback import threading +# pip allows tombo install without correct version of mappy, so check here +try: + import mappy + if sys.version_info[0] > 2: + mappy.Aligner(os.devnull).seq('') + else: + mappy.Aligner(os.devnull).seq(b'') +except AttributeError: + th.error_message_and_exit('Tombo requires mappy version >= 2.10.') + # Future warning from cython in h5py import warnings warnings.simplefilter(action='ignore', category=FutureWarning) @@ -20,9 +30,12 @@ import multiprocessing as mp from tqdm import tqdm +from tqdm._utils import _term_move_up + from time import sleep from operator import itemgetter from collections import defaultdict +from pkg_resources import resource_string if sys.version_info[0] > 2: unicode = str @@ -32,191 +45,321 @@ from . import tombo_helper as th from ._default_parameters import ( - SEG_PARAMS_TABLE, ALGN_PARAMS_TABLE, EXTRA_SIG_FACTOR, MASK_FILL_Z_SCORE, - MASK_BASES, START_BANDWIDTH, START_SEQ_WINDOW, BAND_BOUNDARY_THRESH, - DEL_FIX_WINDOW, MAX_DEL_FIX_WINDOW, MIN_EVENT_TO_SEQ_RATIO, MAX_RAW_CPTS, - PHRED_BASE, SHIFT_CHANGE_THRESH, SCALE_CHANGE_THRESH, SIG_MATCH_THRESH) + EXTRA_SIG_FACTOR, MASK_FILL_Z_SCORE, + MASK_BASES, DEL_FIX_WINDOW, MAX_DEL_FIX_WINDOW, + MIN_EVENT_TO_SEQ_RATIO, MAX_RAW_CPTS, SHIFT_CHANGE_THRESH, + SCALE_CHANGE_THRESH, SIG_MATCH_THRESH, DNA_SAMP_TYPE, RNA_SAMP_TYPE, + USE_RNA_EVENT_SCALE, RNA_SCALE_NUM_EVENTS, RNA_SCALE_MAX_FRAC_EVENTS, + START_CLIP_PARAMS, STALL_PARAMS, COLLAPSE_RNA_STALLS, COLLAPSE_DNA_STALLS) +START_CLIP_PARAMS = th.startClipParams(*START_CLIP_PARAMS) +DEFAULT_STALL_PARAMS = th.stallParams(**STALL_PARAMS) -from .dynamic_programming import traceback, forward_pass -from .c_helper import ( - c_new_means, c_valid_cpts_w_cap, c_valid_cpts_w_cap_t_test) -from .c_dynamic_programming import ( - c_reg_z_scores, c_banded_forward_pass, c_banded_traceback, - c_base_z_scores, c_adaptive_banded_forward_pass) +from ._c_dynamic_programming import ( + c_reg_z_scores, c_banded_forward_pass, c_base_z_scores, + c_base_forward_pass, c_base_traceback) -VERBOSE = False -PROC_UPDATE_INTERVAL = 100 + +# list of classes/functions to include in API +__all__ = [ + 'get_read_seq', 'map_read', 'resquiggle_read', + 'segment_signal', 'find_adaptive_base_assignment', + 'resolve_skipped_bases_with_raw', 'find_seq_start_in_events', + 'find_static_base_assignment'] + + +VERBOSE = True _PROFILE_RSQGL = False -_DEBUG_FIT = False -_DEBUG_FULL = False -_DEBUG_MIDDLE = False +# use (mapping) clipped bases at the start of read to identify start position +USE_START_CLIP_BASES = False + +# experimental RNA adapter trimming +TRIM_RNA_ADAPTER = False + + +# text output debugging _DEBUG_PARAMS = False -_DRY_RUN = any((_DEBUG_PARAMS, _DEBUG_FIT, _DEBUG_FULL, _DEBUG_MIDDLE)) -_NUM_DEBUG_ENDS = 250 +_DEBUG_BANDWIDTH = False +_DEBUG_START_BANDWIDTH = False + +# plot output debugging +_DEBUG_DP_ENDS = False +_DEBUG_DP_START = False +_DEBUG_CLIP_START = False +# fit debug plot requires r cowplot package to be installed +_DEBUG_FIT = False +_DEBUG_START_CLIP_FIT = False -############################################### -########## Read Segmentation Scoring ########## -############################################### +# don't plot more than one debug type at a time +assert sum(( + _DEBUG_DP_ENDS, _DEBUG_FIT, 
_DEBUG_START_CLIP_FIT, + _DEBUG_DP_START, _DEBUG_CLIP_START)) <= 1 +_DEBUG_PLOTTING = any(( + _DEBUG_FIT, _DEBUG_START_CLIP_FIT, _DEBUG_DP_ENDS, _DEBUG_DP_START, + _DEBUG_CLIP_START)) +_DRY_RUN = any(( + _DEBUG_PARAMS, _DEBUG_BANDWIDTH, _DEBUG_START_BANDWIDTH, _DEBUG_PLOTTING)) -def get_read_seg_score(r_means, r_ref_means, r_ref_sds): - return np.mean([ - np.abs((b_m - b_ref_m) / b_ref_s) - for b_m, b_ref_m, b_ref_s in zip(r_means, r_ref_means, r_ref_sds)]) +_UNEXPECTED_ERROR_FN = 'unexpected_tombo_errors.{}.err' +_MAX_NUM_UNEXP_ERRORS = 50 +_MAX_QUEUE_SIZE = 1000 ################################## ########## Debug Output ########## ################################## -def _write_middle_debug(z_scores, fwd_pass, band_event_starts, - debug_fp, reg_id, debug_num_seq=_NUM_DEBUG_ENDS, - short=False): +def _write_params_debug( + norm_signal, segs, r_ref_means, r_ref_sds, rsqgl_params, read_id): + r_means = ts.compute_base_means(norm_signal, segs) + mean_half_z_score = ts.get_read_seg_score(r_means, r_ref_means, r_ref_sds) + sys.stdout.write( + '\t'.join(map(str, ( + rsqgl_params.running_stat_width, + rsqgl_params.min_obs_per_base, + rsqgl_params.mean_obs_per_event, + rsqgl_params.match_evalue, + rsqgl_params.skip_pen, + rsqgl_params.bandwidth, read_id, + mean_half_z_score))) + '\n') + + return + +def _debug_plot_dp( + z_scores, fwd_pass, band_event_starts, fwd_pass_move, top_max_pos, + reg_id='0', debug_num_seq=500, short=False): + reg_id = unicode(reg_id) fwd_pass = fwd_pass[1:] if fwd_pass.shape[0] < debug_num_seq: debug_num_seq = fwd_pass.shape[0] debug_end_start = len(band_event_starts) - debug_num_seq - debug_fp.write('\n'.join( - '\t'.join(map(str, (band_pos + band_event_starts[seq_pos], seq_pos, - score, unicode(reg_id) + 'z_begin'))) - for seq_pos, s_data in enumerate(z_scores[:debug_num_seq]) - for band_pos, score in enumerate(s_data)) + '\n') - debug_fp.write('\n'.join( - '\t'.join(map(str, (band_pos + band_event_starts[seq_pos], seq_pos, - score, unicode(reg_id) + 'fwd_begin'))) - for seq_pos, s_data in enumerate(fwd_pass[:debug_num_seq]) - for band_pos, score in enumerate(s_data)) + '\n') - if short: return - - debug_fp.write('\n'.join( - '\t'.join(map(str, ( - band_pos + band_event_starts[debug_end_start + seq_pos], seq_pos, - score, unicode(reg_id) + 'z_end'))) - for seq_pos, s_data in enumerate(z_scores[-debug_num_seq:]) - for band_pos, score in enumerate(s_data)) + '\n') - debug_fp.write('\n'.join( - '\t'.join(map(str, ( - band_pos + band_event_starts[debug_end_start + seq_pos], seq_pos, - score, unicode(reg_id) + 'fwd_end'))) - for seq_pos, s_data in enumerate(fwd_pass[-debug_num_seq:]) - for band_pos, score in enumerate(s_data)) + '\n') + + event_poss, seq_poss, scores, regs = [], [], [], [] + for seq_pos, (s_z_data, s_f_data) in enumerate(zip(z_scores, fwd_pass)): + b_e_start = band_event_starts[seq_pos] + for band_pos, score in enumerate(s_z_data): + event_poss.append(band_pos + b_e_start) + seq_poss.append(seq_pos) + scores.append(score) + regs.append(reg_id + '_z_begin') + for band_pos, score in enumerate(s_f_data): + event_poss.append(band_pos + b_e_start) + seq_poss.append(seq_pos) + scores.append(score) + regs.append(reg_id + '_fwd_begin') + if seq_pos >= debug_num_seq: + break + if not short: + for seq_pos, (s_z_data, s_f_data) in enumerate(zip( + z_scores[::-1], fwd_pass[::-1])): + end_seq_pos = debug_num_seq - seq_pos - 1 + b_e_start = band_event_starts[debug_end_start + end_seq_pos] + for band_pos, score in enumerate(s_z_data): + event_poss.append(band_pos + 
b_e_start) + seq_poss.append(end_seq_pos) + scores.append(score) + regs.append(reg_id + '_z_end') + for band_pos, score in enumerate(s_f_data): + event_poss.append(band_pos + b_e_start) + seq_poss.append(end_seq_pos) + scores.append(score) + regs.append(reg_id + '_fwd_end') + if seq_pos >= debug_num_seq: + break + + dpDat = r.DataFrame({ + 'EventPos':r.IntVector(event_poss), + 'SeqPos':r.IntVector(seq_poss), + 'Score':r.FloatVector(scores), + 'Region':r.StrVector(regs)}) + + event_poss, seq_poss, regs = [], [], [] + read_tb = th.banded_traceback(fwd_pass_move, band_event_starts, top_max_pos) + for seq_pos, event_pos in enumerate(read_tb[:debug_num_seq]): + event_poss.append(event_pos) + seq_poss.append(seq_pos) + regs.append(reg_id + '_fwd_begin') + if not short: + for seq_pos, event_pos in enumerate(read_tb[-debug_num_seq:]): + event_poss.append(event_pos) + seq_poss.append(seq_pos) + regs.append(reg_id + '_fwd_end') + + tbDat = r.DataFrame({ + 'EventPos':r.IntVector(event_poss), + 'SeqPos':r.IntVector(seq_poss), + 'Region':r.StrVector(regs)}) + + r.r(resource_string(__name__, 'R_scripts/debugDP.R').decode()) + r.globalenv[str('plotDP')](dpDat, tbDat) return -def _write_full_debug(fwd_pass_move, band_event_starts, top_max_pos, - z_scores, debug_fp, failed_fp, reg_id, final_score): - read_tb = c_banded_traceback(fwd_pass_move, band_event_starts, top_max_pos) +def _debug_fit( + fwd_pass_move, band_event_starts, top_max_pos, z_scores, reg_id, + final_score, bandwidth, event_means, r_ref_means, + running_window=501, static_bw=False): + read_tb = th.banded_traceback(fwd_pass_move, band_event_starts, top_max_pos) prev_event_pos = read_tb[0] - band_poss = [] - event_scores = [] + band_poss, event_scores, ref_means = [], [], [] for seq_pos, event_pos in enumerate(read_tb[1:]): seq_band_poss = [e_pos - band_event_starts[seq_pos] for e_pos in range(prev_event_pos, event_pos)] band_poss.extend(seq_band_poss) event_scores.extend([z_scores[seq_pos][b_pos] for b_pos in seq_band_poss]) + ref_means.extend([r_ref_means[seq_pos] for _ in seq_band_poss]) prev_event_pos = event_pos - debug_fp.write('\n'.join( - '\t'.join(map(str, (e_pos, b_pos, e_score, unicode(reg_id)))) - for e_pos, (b_pos, e_score) in enumerate(zip( - band_poss, event_scores))) + '\n') - fail_str = (('Failed ' if final_score < 0 else 'Pass ') + - unicode(final_score) + ' ' + unicode(final_score / len(read_tb))) - failed_fp.write(fail_str + '\t' + unicode(reg_id) + '\n') + if _DEBUG_BANDWIDTH or _DEBUG_START_BANDWIDTH: + half_bandwidth = bandwidth // 2 + min_bw_edge_buffer = ( + half_bandwidth - np.max(np.abs( + np.array(band_poss) - half_bandwidth)) if _DEBUG_BANDWIDTH else + max(band_poss)) + sys.stdout.write('{:d}\t{:d}\t{}\t{}\n'.format( + bandwidth, min_bw_edge_buffer, final_score / len(read_tb), reg_id)) + sys.stdout.flush() + if not (_DEBUG_FIT or _DEBUG_START_CLIP_FIT): + return - return + if len(event_scores) > running_window: + # make sure window is odd + if running_window % 2 != 1: + running_window += 1 + half_window = running_window // 2 + score_cumsum = np.concatenate([[0,], np.cumsum(event_scores)]) + event_scores = np.concatenate([ + np.repeat(np.NAN, half_window), + (score_cumsum[running_window:] - score_cumsum[:-running_window]) / + running_window, np.repeat(np.NAN, half_window)]) + + reg_name = (unicode(reg_id) + '__' + unicode(final_score) + '__' + + unicode(final_score / len(read_tb))) + + fitDat = r.DataFrame({ + 'EventPos':r.IntVector(range(len(band_poss))), + 'BandPos':r.IntVector(band_poss), 
'EventMean':r.FloatVector(event_means[read_tb[0]:read_tb[-1]]), + 'ModelMean':r.FloatVector(ref_means), + 'EventScore':r.FloatVector(event_scores), + 'Region':r.StrVector([reg_name,] * len(band_poss))}) + + r.r(resource_string(__name__, 'R_scripts/debugFit.R').decode()) + r.globalenv[str('plotFit')](fitDat, bandwidth) -def _write_tb_debug(fwd_pass_move, band_event_starts, top_max_pos, - debug_fp, reg_id, debug_num_seq=_NUM_DEBUG_ENDS): - read_tb = c_banded_traceback(fwd_pass_move, band_event_starts, top_max_pos) - debug_fp.write('\n'.join( - '\t'.join(map(str, (event_pos, seq_pos, - unicode(reg_id) + 'fwd_begin'))) - for seq_pos, event_pos in enumerate(read_tb[:debug_num_seq])) + '\n') - debug_fp.write('\n'.join( - '\t'.join(map(str, (event_pos, seq_pos, - unicode(reg_id) + 'fwd_end'))) - for seq_pos, event_pos in enumerate(read_tb[-debug_num_seq:])) + '\n') return -def _write_fit_debug( - norm_signal, segs, r_ref_means, r_ref_sds, genome_seq): - norm_means = c_new_means(norm_signal, segs) - with io.open('eventless_testing.model.txt', 'wt') as fp: - fp.write('Position\tMean\tSD\n' + '\n'.join( - '\t'.join(map(str, (pos, p_mean, p_std))) - for pos, (p_mean, p_std) in enumerate(zip( - r_ref_means, r_ref_sds))) + '\n') - with io.open('eventless_testing.seq.txt', 'wt') as fp: - fp.write('Base\tPosition\tSignalMean\n' + '\n'.join( - '\t'.join(map(str, (b, pos, p_mean))) for pos, (b, p_mean) in - enumerate(zip(genome_seq, norm_means))) + '\n') - Position, Signal = [], [] - for base_i, (b_start, b_end) in enumerate(zip(segs[:-1], segs[1:])): - Position.extend( - base_i + np.linspace(0, 1, b_end - b_start, endpoint=False)) - Signal.extend(norm_signal[b_start:b_end]) - with io.open('eventless_testing.signal.txt', 'wt') as fp: - fp.write('Position\tSignal\n' + '\n'.join( - '\t'.join(map(str, (pos, sig))) - for pos, sig in zip(Position, Signal)) + '\n') +def _open_debug_pdf(): + # import plotting modules + global r + from rpy2 import robjects as r + from rpy2.robjects.packages import importr + importr(str('ggplot2')) + + if _DEBUG_DP_ENDS: + r.r('pdf("debug_event_align.pdf", height=4.5, width=6)') + elif _DEBUG_CLIP_START: + r.r('pdf("debug_event_align.clip_start.pdf", height=4.5, width=10)') + elif _DEBUG_DP_START: + r.r('pdf("debug_event_align.start.pdf", height=4.5, width=10)') + elif _DEBUG_FIT: + importr(str('cowplot')) + r.r('pdf("debug_event_align.full_fit.pdf", width=15, height=5)') + elif _DEBUG_START_CLIP_FIT: + importr(str('cowplot')) + r.r('pdf("debug_event_align.start_clip_fit.pdf", width=15, height=5)') + else: + th.error_message_and_exit('Must specify which debug plot to open.') return -def _write_params_debug( - norm_signal, segs, r_ref_means, r_ref_sds, - running_stat_width, min_obs_per_base, mean_obs_per_event, - match_evalue, skip_pen, bandwidth, fast5_fn): - r_means = c_new_means(norm_signal, segs) - mean_half_z_score = get_read_seg_score(r_means, r_ref_means, r_ref_sds) - sys.stdout.write( - '\t'.join(map(str, ( - running_stat_width, min_obs_per_base, mean_obs_per_event, - match_evalue, skip_pen, bandwidth, fast5_fn, - mean_half_z_score))) + '\n') - +def _close_debug_pdf(): + r.r('dev.off()') return -def _open_debug_fps(): - score_fp = io.open('debug_event_align.txt', 'wt') - score_fp.write('EventPos\tSeqPos\tScore\tRegion\n') - tb_fp = io.open('debug_event_align.traceback.txt', 'wt') - tb_fp.write('EventPos\tSeqPos\tRegion\n') - full_fit_fp = io.open('debug_event_align.full_fit.txt', 'wt') - full_fit_fp.write('EventPos\tBandPos\tEventScore\tRegion\n') - full_failed_fp = 
io.open('debug_event_align.full_failed.txt', 'wt') - full_failed_fp.write('DidFail\tRegion\n') - debug_fps = [score_fp, tb_fp, full_fit_fp, full_failed_fp] - - return debug_fps - ############################################ ########## Raw Signal Re-squiggle ########## ############################################ -def get_model_fit_segs( - segs, norm_signal, r_ref_means, r_ref_sds, min_obs_per_base, - max_raw_cpts=None, del_fix_window=DEL_FIX_WINDOW, +def raw_forward_pass(reg_z_scores, min_obs_per_base): + # dynamic programming algorithm to find modeled signal to base assignment + + # fill banded path with cumulative probabilities from the previous signal + # either in the current base or the previous base (left or diagonal left + # from associated plotting) + + # get the first row data + prev_b_data, (prev_b_start, prev_b_end) = reg_z_scores[0] + prev_b_fwd_data = np.cumsum(prev_b_data) + # store number of observations since last diagonal at each position + # - forces forward pass to allow legal traceback paths while + # enforcing the minimum observations per base threshold + # - should also help from optimization pushing poor fitting bases + # to assign only an observation or two + # - will also use this data to traceback all reasonable paths + prev_b_last_diag = np.ones(prev_b_end - prev_b_start, + dtype=np.int64) * min_obs_per_base + # first row is just a cumsum since there is no previous row + reg_fwd_scores = [(prev_b_fwd_data, prev_b_last_diag, + (prev_b_start, prev_b_end))] + + for b_data, (b_start, b_end) in reg_z_scores[1:]: + b_fwd_data, prev_b_last_diag = c_base_forward_pass( + b_data, b_start, b_end, + prev_b_data, prev_b_start, prev_b_end, + prev_b_fwd_data, prev_b_last_diag, min_obs_per_base) + + # consider storing data to form traceback in one go at the + # end of this loop + reg_fwd_scores.append(( + b_fwd_data, prev_b_last_diag, (b_start, b_end))) + prev_b_data, prev_b_fwd_data, prev_b_start, prev_b_end = ( + b_data, b_fwd_data, b_start, b_end) + + return reg_fwd_scores + +def raw_traceback(reg_fwd_scores, min_obs_per_base): + # traceback along maximally likely path + + # initialize array to store new segments + new_segs = np.empty(len(reg_fwd_scores) - 1, dtype=np.int64) + # get first two bases of data for lookups + curr_base_sig = 1 + curr_b_data, _, (curr_start, curr_end) = reg_fwd_scores[-1] + next_b_data, _, (next_start, next_end) = reg_fwd_scores[-2] + new_segs[-1] = c_base_traceback( + curr_b_data, curr_start, next_b_data, next_start, next_end, + curr_end - 1, min_obs_per_base) + for base_pos in range(len(reg_fwd_scores) - 3, -1, -1): + curr_b_data, curr_start = next_b_data, next_start + next_b_data, _, (next_start, next_end) = reg_fwd_scores[base_pos] + new_segs[base_pos] = c_base_traceback( + curr_b_data, curr_start, next_b_data, next_start, next_end, + new_segs[base_pos+1] - 1, min_obs_per_base) + + return new_segs + +def resolve_skipped_bases_with_raw( + dp_res, norm_signal, rsqgl_params, + max_raw_cpts=MAX_RAW_CPTS, del_fix_window=DEL_FIX_WINDOW, max_del_fix_window=MAX_DEL_FIX_WINDOW, - extra_sig_factor=EXTRA_SIG_FACTOR, max_half_z_score=None): - """ - Find new segments at skipped bases during dynamic programming segmentation. 
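In both the old and new implementations, skipped bases are located the same way: a base assigned no raw signal shows up as a repeated start position, so ``np.where(np.diff(segs) == 0)[0]`` (see ``get_deletion_windows`` below) marks them. A minimal sketch with made-up segment values:

::

    import numpy as np

    # base start positions within the raw signal; equal neighbors mark
    # bases assigned no observations (skipped/deleted bases)
    segs = np.array([0, 10, 10, 10, 25, 40, 40, 55])
    print(np.where(np.diff(segs) == 0)[0])  # [1 2 5]

Windows around these positions are then expanded by ``del_fix_window`` bases, merged when they overlap, and re-segmented from the raw signal alone.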
- - :param segs: current Read segment locations - :param norm_signal: Normalized read siganl - :param r_ref_means: Read refererence means from genomic sequence - :param r_ref_sds: Read refererence standard deviations from genomic sequence - :param min_obs_per_base: Minimum raw observations to assign to each base - :param max_raw_cpts: Maximum new changepoints to find from raw signal - :param del_fix_window: initial bases to extend skipped base windows - :param max_del_fix_window: max bases to extend skipped base windows - :param extra_sig_factor: Amount of extra signal to require in order to - perform signal space re-squiggle - - :returns: New segments with skipped bases resolved + extra_sig_factor=EXTRA_SIG_FACTOR): + """Resolve skipped bases by applying a dynamic programming pass over raw signal z-scores. For details, see https://nanoporetech.github.io/tombo/resquiggle.html#resolve-skipped-bases + + Args: + dp_res (:class:`tombo_helper.dpResults`): dynamic programming results + norm_signal (`np.array::np.float64`): normalized raw signal + rsqgl_params (:class:`tombo.tombo_helper.resquiggleParams`): parameters for the re-squiggle algorithm + max_raw_cpts (int): maximum new changepoints to find from raw signal (optional) + del_fix_window (int): initial bases to extend skipped base windows (optional) + max_del_fix_window (int): max bases to extend skipped base windows (optional) + extra_sig_factor (float): amount of extra signal required to perform signal space re-squiggle (optional) + + Returns: + ``np.array::np.int64`` containing new deletion resolved base raw signal start positions """ def merge_del_windows(all_del_windows): merged_del_windows = [] @@ -231,11 +374,12 @@ def merge_del_windows(all_del_windows): def window_too_small(start, end): n_events = end - start - sig_start, sig_end = segs[start], segs[end] + sig_start, sig_end = dp_res.segs[start], dp_res.segs[end] sig_len = sig_end - sig_start # windows are expanded by one base and the extra signal factor # to allow some room to search for best path - return sig_len <= ((n_events + 1) * min_obs_per_base) * extra_sig_factor + return sig_len <= ((n_events + 1) * + rsqgl_params.min_obs_per_base) * extra_sig_factor def expand_small_windows(all_del_windows): expanded_del_windows = [] @@ -253,15 +397,15 @@ def trim_del_window_ends(all_del_windows): # potentially trim first and last windows if all_del_windows[0][0] < 0: all_del_windows[0] = (0, all_del_windows[0][1]) - if all_del_windows[-1][1] > len(segs) - 1: - all_del_windows[-1] = (all_del_windows[-1][0], len(segs) - 1) + if all_del_windows[-1][1] > len(dp_res.segs) - 1: + all_del_windows[-1] = (all_del_windows[-1][0], len(dp_res.segs) - 1) return all_del_windows def get_deletion_windows(): # get initial windows around deletions/skipped bases all_del_windows = [] - for del_pos in np.where(np.diff(segs) == 0)[0]: + for del_pos in np.where(np.diff(dp_res.segs) == 0)[0]: if (len(all_del_windows) > 0 and del_pos < all_del_windows[-1][1] + del_fix_window): all_del_windows[-1] = (all_del_windows[-1][0], @@ -286,24 +430,25 @@ def get_deletion_windows(): if windows_expanded and any( window_too_small(start, end) for start, end in all_del_windows): - raise NotImplementedError( + raise th.TomboError( 'Not enough raw signal around potential genomic deletion(s)') if max_raw_cpts is not None and max([ end - start for start, end in all_del_windows]) > max_raw_cpts: - raise NotImplementedError( + raise th.TomboError( 'Read contains too many potential genomic deletions') return all_del_windows all_del_windows = 
get_deletion_windows() + resolved_segs = dp_res.segs.copy() if all_del_windows is None: - return segs + return resolved_segs for start, end in all_del_windows: n_events = end - start - sig_start, sig_end = segs[start], segs[end] + sig_start, sig_end = dp_res.segs[start], dp_res.segs[end] sig_len = sig_end - sig_start # find signal space z-scores mapping without real banding by allowing @@ -311,48 +456,47 @@ def get_deletion_windows(): # windows to enforce min_obs_per_base) pseudo_starts = np.linspace(0, sig_len, n_events + 1, dtype=np.int64) reg_z_scores = c_reg_z_scores( - norm_signal[sig_start:sig_end], r_ref_means[start:end], - r_ref_sds[start:end], pseudo_starts, - 0, n_events, n_events, min_obs_per_base, - max_half_z_score=max_half_z_score) - reg_fwd_scores = forward_pass(reg_z_scores, min_obs_per_base) + norm_signal[sig_start:sig_end], dp_res.ref_means[start:end], + dp_res.ref_sds[start:end], pseudo_starts, + 0, n_events, n_events, rsqgl_params.min_obs_per_base, + max_half_z_score=rsqgl_params.max_half_z_score) + reg_fwd_scores = raw_forward_pass( + reg_z_scores, rsqgl_params.min_obs_per_base) # perform signal based scoring segmentation # - it is ~60X faster than base space - reg_segs = traceback(reg_fwd_scores, min_obs_per_base) + sig_start - assert reg_segs.shape[0] == end - start - 1 - segs[start+1:end] = reg_segs + reg_segs = raw_traceback( + reg_fwd_scores, rsqgl_params.min_obs_per_base) + sig_start + if reg_segs.shape[0] != end - start - 1: + raise th.TomboError('Invalid segmentation results.') + resolved_segs[start+1:end] = reg_segs - if np.diff(segs).min() < 1: - raise NotImplementedError('New segments include zero length events') - if segs[0] < 0: - raise NotImplementedError('New segments start with negative index') - if segs[-1] > norm_signal.shape[0]: - raise NotImplementedError('New segments end past raw signal values') + if np.diff(resolved_segs).min() < 1: + raise th.TomboError('New segments include zero length events') + if resolved_segs[0] < 0: + raise th.TomboError('New segments start with negative index') + if resolved_segs[-1] > norm_signal.shape[0]: + raise th.TomboError('New segments end past raw signal values') - return segs + return resolved_segs ##################################################### ########## Static Band Dynamic Programming ########## ##################################################### -def get_short_read_event_mapping( - event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen, z_shift, - reg_id=None, debug_fps=None, max_half_z_score=None): - """ - Perform banded dynamic programming sequence to event alignment - without masking - - :param event_means: Numpy array with read base means - :param r_ref_means: Numpy array with read reference means - :param r_ref_sds: Numpy array with read reference standard deviations - :param skip_pen: Penalty applied to skipped genomic bases - :param stay_pen: Penalty applied to stay states (should shift to 0 - expected value) - :param z_shift: Shift z-scores by this amount (includes matching - positive expected value) - - :returns: Event to sequence mapping for full length of short read +def find_static_base_assignment( + event_means, r_ref_means, r_ref_sds, rsqgl_params, + reg_id=None): + """Align expected (from genome sequence) signal levels to observed using a dynamic programming approach with static bandwidth start positions + + Args: + event_means (`np.array::np.float64`): read base means + r_ref_means (`np.array::np.float64`): expected base signal levels + r_ref_sds (`np.array::np.float64`): expected 
base level SDs + rsqgl_params (:class:`tombo.tombo_helper.resquiggleParams`): parameters for the re-squiggle algorithm + + Returns: + `np.array::np.int64` containing event to sequence mapping for full length of short read """ seq_len = r_ref_means.shape[0] events_len = event_means.shape[0] @@ -367,32 +511,31 @@ def get_short_read_event_mapping( shifted_z_scores = np.empty((band_event_starts.shape[0], bandwidth)) for seq_pos, event_pos in enumerate(band_event_starts): - if max_half_z_score is None: - shifted_z_scores[seq_pos,:] = z_shift - np.abs( + if rsqgl_params.max_half_z_score is None: + shifted_z_scores[seq_pos,:] = rsqgl_params.z_shift - np.abs( event_means[event_pos:event_pos + bandwidth] - r_ref_means[seq_pos]) / r_ref_sds[seq_pos] else: - shifted_z_scores[seq_pos,:] = z_shift - np.minimum( - max_half_z_score, np.abs( + shifted_z_scores[seq_pos,:] = rsqgl_params.z_shift - np.minimum( + rsqgl_params.max_half_z_score, np.abs( event_means[event_pos:event_pos + bandwidth] - r_ref_means[seq_pos]) / r_ref_sds[seq_pos]) fwd_pass, fwd_pass_move = c_banded_forward_pass( - shifted_z_scores, band_event_starts, skip_pen, stay_pen) + shifted_z_scores, band_event_starts, rsqgl_params.skip_pen, + rsqgl_params.stay_pen) # perform traceback top_max_pos = np.argmax(fwd_pass[-1,:]) - if _DEBUG_FULL: - _write_full_debug(fwd_pass_move, band_event_starts, top_max_pos, - shifted_z_scores, debug_fps[2], debug_fps[3], reg_id, - fwd_pass[-1,top_max_pos]) - if _DEBUG_MIDDLE: - _write_middle_debug(shifted_z_scores, fwd_pass, band_event_starts, - debug_fps[0], reg_id, short=True) - _write_tb_debug(fwd_pass_move, band_event_starts, top_max_pos, - debug_fps[1], reg_id) - - read_tb = c_banded_traceback(fwd_pass_move, band_event_starts, top_max_pos) + if _DEBUG_FIT: + _debug_fit(fwd_pass_move, band_event_starts, top_max_pos, + shifted_z_scores, reg_id, fwd_pass[-1,top_max_pos], + bandwidth, event_means, r_ref_means, static_bw=True) + if _DEBUG_DP_ENDS: + _debug_plot_dp(shifted_z_scores, fwd_pass, band_event_starts, + fwd_pass_move, top_max_pos, reg_id, short=True) + + read_tb = th.banded_traceback(fwd_pass_move, band_event_starts, top_max_pos) return read_tb @@ -401,48 +544,23 @@ def get_short_read_event_mapping( ####################################################### ########## Adaptive Band Dynamic Programming ########## ####################################################### -def get_masked_start_fwd_pass( +def _get_masked_start_fwd_pass( event_means, r_ref_means, r_ref_sds, mapped_start_offset, - skip_pen, stay_pen, z_shift, bandwidth, events_per_base, - mask_fill_z_score=MASK_FILL_Z_SCORE, - mask_bases=MASK_BASES, reg_id=None, debug_fps=None, - max_half_z_score=None): - """ - Perform banded dynamic programming sequence to event alignment forcing - the path to start and end at the previously discovered locations. - This is performed by masking the z-scores outside a "cone" extended - mask_bases from the beginning and end of the middle of the read. 
- - :param event_means: Numpy array with read base means - :param r_ref_means: Numpy array with read reference means - :param r_ref_sds: Numpy array with read reference standard deviations - :param mapped_start_offset: Previously identified start of genomic - sequence within events - :param skip_pen: Penalty applied to skipped genomic bases - :param stay_pen: Penalty applied to stay states (should shift to 0 - expected value) - :param z_shift: Shift z-scores by this amount (includes matching positive - expected value) - :param bandwidth: Bandwidth over which to search for sequence to - event mapping - :param events_per_base: Average events per base for the start mapping - - :returns: Event to sequence mapping for start of read including forward - pass scores, forward pass move - values, band starts within the events vector and z-scores - """ - assert event_means.shape[0] - mapped_start_offset >= bandwidth, ( - 'Read sequence to signal matching starts too far into events for ' + - 'full adaptive assignment.') + rsqgl_params, events_per_base, mask_fill_z_score=MASK_FILL_Z_SCORE, + mask_bases=MASK_BASES): + if event_means.shape[0] - mapped_start_offset < rsqgl_params.bandwidth: + raise th.TomboError( + 'Read sequence to signal matching starts too far into events for ' + + 'full adaptive assignment') # if max_half_z_score is none set it to valid float for cython # z-score computation - if max_half_z_score is None: + if rsqgl_params.max_half_z_score is None: do_winsorize_z = False - max_half_z_score = 0.0 + rsqgl_params = rsqgl_params._replace(max_half_z_score = 0.0) else: do_winsorize_z = True - half_bandwidth = bandwidth // 2 + half_bandwidth = rsqgl_params.bandwidth // 2 # check if the mapped start position is too close to the end of # the events array and extend the bandwidth window if so @@ -464,101 +582,108 @@ def get_masked_start_fwd_pass( # get masked z-scores at the beginning of the read mask_start_pos = np.linspace( mapped_start_offset + 1, - band_event_starts[mask_bases - 1] + bandwidth, + band_event_starts[mask_bases - 1] + rsqgl_params.bandwidth, mask_bases).astype(np.int64) def get_start_mask_z_score(seq_pos, event_pos): start_mask_len = max(mapped_start_offset - event_pos, 0) - end_mask_len = (0 if seq_pos >= mask_bases else - bandwidth - (mask_start_pos[seq_pos] - event_pos)) - event_vals = event_means[event_pos + start_mask_len: - event_pos + bandwidth - end_mask_len] + end_mask_len = ( + 0 if seq_pos >= mask_bases else + rsqgl_params.bandwidth - (mask_start_pos[seq_pos] - event_pos)) + # if the end mask does not clip back to the end of the events table + # then extend the end mask to do so + if (event_pos + rsqgl_params.bandwidth - end_mask_len > + event_means.shape[0]): + end_mask_len = (event_pos + rsqgl_params.bandwidth - + event_means.shape[0]) + event_vals = event_means[ + event_pos + start_mask_len: + event_pos + rsqgl_params.bandwidth - end_mask_len] b_z_scores = c_base_z_scores( event_vals, r_ref_means[seq_pos], r_ref_sds[seq_pos], - do_winsorize_z=do_winsorize_z, max_half_z_score=max_half_z_score) + do_winsorize_z=do_winsorize_z, + max_half_z_score=rsqgl_params.max_half_z_score) masked_z_scores = np.concatenate([ - [mask_fill_z_score] * start_mask_len, b_z_scores, - [mask_fill_z_score] * end_mask_len]) + [mask_fill_z_score - rsqgl_params.z_shift] * + start_mask_len, b_z_scores, + [mask_fill_z_score - rsqgl_params.z_shift] * end_mask_len]) + # This should have been handled above by checking the end_clip_len, + # but raise an error here in case + if 
masked_z_scores.shape[0] != rsqgl_params.bandwidth: + raise th.TomboError('Masked z-score contains too few events.') return masked_z_scores - shifted_z_scores = np.empty((band_event_starts.shape[0], bandwidth)) + shifted_z_scores = np.empty((band_event_starts.shape[0], + rsqgl_params.bandwidth)) for seq_pos, event_pos in enumerate(band_event_starts): shifted_z_scores[seq_pos,:] = get_start_mask_z_score(seq_pos, event_pos) - shifted_z_scores += z_shift + shifted_z_scores += rsqgl_params.z_shift fwd_pass, fwd_pass_move = c_banded_forward_pass( - shifted_z_scores, band_event_starts, skip_pen, stay_pen) + shifted_z_scores, band_event_starts, rsqgl_params.skip_pen, + rsqgl_params.stay_pen) return fwd_pass, fwd_pass_move, band_event_starts, shifted_z_scores
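# Editor's note: a minimal sketch (not part of the patch) of the shifted
# z-score used throughout these banded forward passes: the negative absolute
# z-score is shifted by ``z_shift`` (so that good matches score positively)
# and optionally winsorized at ``max_half_z_score``. All values below are
# hypothetical.
import numpy as np

def shifted_z(event_means, ref_mean, ref_sd, z_shift, max_half_z_score=None):
    # half z-score: absolute deviation from the expected level in SD units
    half_z = np.abs(event_means - ref_mean) / ref_sd
    if max_half_z_score is not None:
        # winsorize extreme deviations so single outliers do not dominate
        half_z = np.minimum(max_half_z_score, half_z)
    return z_shift - half_z

print(shifted_z(np.array([0.1, 0.5, 2.0]), 0.4, 0.2,
                z_shift=4.0, max_half_z_score=5.0))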
-def get_mapping_start( - event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen, z_shift, - seq_window, bandwidth, norm_signal, valid_cpts, - min_obs_per_base, reg_id=None, debug_fps=None, max_half_z_score=None): +def find_seq_start_in_events( + event_means, r_ref_means, r_ref_sds, rsqgl_params, + num_bases, num_events, seq_samp_type=None, reg_id=None): + """Identify most probable start of expected levels within observed events + + Args: + event_means (`np.array::np.float64`): event normalized raw signal means + r_ref_means (`np.array::np.float64`): expected base signal levels + r_ref_sds (`np.array::np.float64`): expected base level SDs + rsqgl_params (:class:`tombo.tombo_helper.resquiggleParams`): parameters for the re-squiggle algorithm + num_bases (int): number of bases to process + num_events (int): number of events to process + reg_id (str): debug + + Returns: + 1) event position (0-based) corresponding to the start of expected signal levels + 2) mean events per base identified from start of the read """ - Perform banded dynamic programming sequence to event alignment through - The beginning of an read to identify the start of genome sequence to - event matching - - :param event_means: Numpy array with read base means - :param r_ref_means: Numpy array with read reference means - :param r_ref_sds: Numpy array with read reference standard deviations - :param skip_pen: Penalty applied to skipped genomic bases - :param stay_pen: Penalty applied to stay states (should shift to 0 - expected value) - :param z_shift: Shift z-scores by this amount (includes matching positive - expected value) - :param seq_window: Number of genomic bases to search over for the start of - the read - :param bandwidth: Bandwidth over which to search for sequence to - event mapping - :param norm_signal: Normalized raw signal vector - :param valid_cpts: Segmentation positions within norm_signal - - :returns: Start position (0-based) of seqeunce to event alignment within - events and the mean events_per_base through the queried portion of read - """ - if event_means.shape[0] < bandwidth: - raise NotImplementedError('Read too short for start/end discovery') - if r_ref_means.shape[0] < seq_window: - raise NotImplementedError( + if event_means.shape[0] < num_events + num_bases: + raise th.TomboError('Read too short for start/end discovery') + if r_ref_means.shape[0] < num_bases: + raise th.TomboError( 'Genomic mapping too short for start/end discovery') # banded z-scores (moving up one event per base for start/end discovery - start_z_scores = np.empty((seq_window, bandwidth)) - for seq_event_pos in range(seq_window): - if max_half_z_score is None: - start_z_scores[seq_event_pos,:] = z_shift - np.abs( - event_means[seq_event_pos:seq_event_pos + bandwidth] + start_z_scores = np.empty((num_bases, num_events)) + for seq_event_pos in range(num_bases): + if rsqgl_params.max_half_z_score is None: + start_z_scores[seq_event_pos,:] = rsqgl_params.z_shift - np.abs( + event_means[seq_event_pos:seq_event_pos + num_events] - r_ref_means[seq_event_pos]) / r_ref_sds[seq_event_pos] else: - start_z_scores[seq_event_pos,:] = z_shift - np.minimum( - max_half_z_score, np.abs( - event_means[seq_event_pos:seq_event_pos + bandwidth] + start_z_scores[seq_event_pos,:] = rsqgl_params.z_shift - np.minimum( + rsqgl_params.max_half_z_score, np.abs( + event_means[seq_event_pos:seq_event_pos + num_events] - r_ref_means[seq_event_pos]) / r_ref_sds[seq_event_pos]) - start_band_event_starts = np.arange(seq_window, dtype=np.int64) + start_band_event_starts = np.arange(num_bases, dtype=np.int64) start_fwd_pass, start_fwd_pass_move = c_banded_forward_pass( - start_z_scores, start_band_event_starts, skip_pen, stay_pen) + start_z_scores, start_band_event_starts, rsqgl_params.skip_pen, + rsqgl_params.stay_pen) # find max along the top and right edges to start traceback top_max_pos = np.argmax(start_fwd_pass[-1,:]) + if _DEBUG_DP_START: + _debug_plot_dp( + start_z_scores, start_fwd_pass, start_band_event_starts, + start_fwd_pass_move, top_max_pos, reg_id=reg_id) + if _DEBUG_START_BANDWIDTH: + _debug_fit( + start_fwd_pass_move, start_band_event_starts, top_max_pos, + start_z_scores, reg_id, start_fwd_pass[-1, top_max_pos], + num_events, event_means, r_ref_means) # perform traceback - start_tb = c_banded_traceback( + start_tb = th.banded_traceback( start_fwd_pass_move, start_band_event_starts, top_max_pos) - - # check that read start mapping is valid to avoid wasted compute on - # adaptive dp - start_segs = valid_cpts[start_tb] - start_sig = norm_signal[start_segs[0]:start_segs[-1]] - start_segs = start_segs - start_segs[0] - start_segs = get_model_fit_segs( - start_segs, start_sig, r_ref_means[:seq_window], - r_ref_sds[:seq_window], min_obs_per_base, - max_half_z_score=max_half_z_score) - start_means = c_new_means(start_sig, start_segs) - #if get_read_seg_score(start_means, r_ref_means[:seq_window], - # r_ref_sds[:seq_window]) > sig_match_thresh: - # raise NotImplementedError( - # 'Poor raw to expected signal matching at read start') + if (seq_samp_type is not None and + ts.score_valid_bases(start_tb, event_means, r_ref_means, r_ref_sds) > + SIG_MATCH_THRESH[seq_samp_type.name]): + raise th.TomboError( + 'Poor raw to expected signal matching at the beginning of the read.') # compute the average events per base to use for the start forward pass events_per_base = (start_tb[-1] - start_tb[0]) / len(start_tb) @@ -566,291 +691,440 @@ def get_mapping_start( return start_loc, events_per_base -def find_adaptive_base_assignment( - norm_signal, running_stat_width, min_obs_per_base, num_events, std_ref, - genome_seq, genome_loc, skip_pen, stay_pen, z_shift, bandwidth, is_rna, - start_bandwidth=START_BANDWIDTH, start_seq_window=START_SEQ_WINDOW, - band_boundary_thresh=BAND_BOUNDARY_THRESH, reg_id=None, debug_fps=None, - max_half_z_score=None): - """ - Perform banded dynamic programming sequence to event alignment by first - identifying the start of the sequence to event matching and then - performing banded matching through the whole read - - :param norm_signal: Numpy array with normalized read signal - :param running_stat_width: Width of neighboring windows over which to - compute changepoint stats - :param min_obs_per_base: Minimum number of raw observations per base - :param num_events: Number of 
events to identify in this read - :param std_ref: A TomboModel object - :param genome_seq: Genomic sequence for this read - :param genome_loc: Mapped genomic location for this read - :param skip_pen: Penalty applied to skipped genomic bases - :param stay_pen: Penalty applied to stay states (should shift to 0 - expected value) - :param z_shift: Shift z-scores by this amount (includes matching positive - expected value) - :param bandwidth: Bandwidth over which to search for sequence to - event mapping - :param is_rna: Is this an RNA read - - :returns: Start of seqeunce to event alignment and the mean - events_per_base through the queried portion of a read - """ - # get events - if is_rna: - # RNA bases show consistent variable spread so use t-test segmentation - valid_cpts = c_valid_cpts_w_cap_t_test( - norm_signal, min_obs_per_base, running_stat_width, num_events) - else: - valid_cpts = c_valid_cpts_w_cap( - norm_signal, min_obs_per_base, running_stat_width, num_events) - #valid_cpts = ts.get_valid_cpts( - # norm_signal, running_stat_width, num_events) - valid_cpts.sort() - event_means = c_new_means(norm_signal, valid_cpts) - - dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 - r_ref_means, r_ref_sds, _, _ = ts.get_ref_from_seq(genome_seq, std_ref) - # trim genome seq to match model-able positions - genome_seq = genome_seq[std_ref.central_pos:-dnstrm_bases] - seq_len = len(genome_seq) - if genome_loc.Strand == '+': - genome_loc = genome_loc._replace( - Start=genome_loc.Start + std_ref.central_pos) - else: - genome_loc = genome_loc._replace(Start=genome_loc.Start + dnstrm_bases) - - # for short reads, just search the whole read with an appropriate bandwidth - if (event_means.shape[0] < start_bandwidth + start_seq_window or - seq_len < max(start_seq_window, bandwidth / 2)): - seq_events = get_short_read_event_mapping( - event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen, - z_shift, reg_id=reg_id, debug_fps=debug_fps, - max_half_z_score=max_half_z_score) - seq_segs = valid_cpts[seq_events] - read_start_rel_to_raw = seq_segs[0] - seq_segs = seq_segs - read_start_rel_to_raw - - return (seq_segs, r_ref_means, r_ref_sds, read_start_rel_to_raw, - genome_seq, genome_loc) - - # identify the start and end of the read within the signal using a larger - # bandwidth - mapped_start, events_per_base = get_mapping_start( - event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen, z_shift, - start_seq_window, start_bandwidth, norm_signal, valid_cpts, - min_obs_per_base, reg_id=reg_id, - debug_fps=debug_fps, max_half_z_score=max_half_z_score) +def _trim_traceback(read_tb, events_len): + start_trim_i = 0 + while read_tb[start_trim_i] < 0: + read_tb[start_trim_i] = 0 + start_trim_i += 1 + end_trim_i = 1 + while read_tb[-end_trim_i] > events_len: + read_tb[-end_trim_i] = events_len + end_trim_i += 1 - # get number of events to clip and how far into the events the - # discovered start is located - half_bandwidth = bandwidth // 2 - if mapped_start < half_bandwidth: - events_start_clip = 0 - mapped_start_offset = mapped_start - else: - events_start_clip = mapped_start - half_bandwidth - mapped_start_offset = half_bandwidth + return read_tb - # process long enough reads that start too far into read for normal - # adaptive processing just as with short reads - if (event_means.shape[0] - mapped_start_offset - - events_start_clip < bandwidth): - seq_events = get_short_read_event_mapping( - event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen, - z_shift, reg_id=reg_id, debug_fps=debug_fps, - 
max_half_z_score=max_half_z_score) - seq_segs = valid_cpts[seq_events] - read_start_rel_to_raw = seq_segs[0] - seq_segs = seq_segs - read_start_rel_to_raw - - return (seq_segs, r_ref_means, r_ref_sds, read_start_rel_to_raw, - genome_seq, genome_loc) +def find_seq_start_from_clip_basecalls( + event_means, rsqgl_params, start_clip_bases, genome_seq, std_ref, + num_genome_bases, reg_id=None): + """Perform dynamic programming over clipped basecalls and genome sequence to identify the start of the genomic mapping + + Args: + event_means (`np.array::np.float64`): normalized raw signal event means + rsqgl_params (:class:`tombo.tombo_helper.resquiggleParams`): parameters for the re-squiggle algorithm + start_clip_bases (str): read bases clipped from before ``genome_seq`` + genome_seq (str): genomic mapping sequence (including extra bases based on k-mer size) + std_ref (:class:`tombo.tombo_stats.TomboModel`): canonical model + num_genome_bases (int): genome sequence length used to identify start position (needed for traceback) + reg_id (str): debug + + Returns: + 1) event position (0-based) corresponding to the start of expected signal levels from genome_seq + 2) mean events per base identified from start of the read + """ + dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 + start_genome_seq = genome_seq[ + std_ref.central_pos:num_genome_bases + dnstrm_bases] + start_seq = start_clip_bases + start_genome_seq + r_ref_means, r_ref_sds, _, _ = ts.get_ref_from_seq(start_seq, std_ref) + seq_len = r_ref_means.shape[0] # now find full sequence to events path using a smaller bandwidth - event_means = event_means[events_start_clip:] - valid_cpts = valid_cpts[events_start_clip:] (start_fwd_pass, start_fwd_pass_move, - start_event_starts, start_z_scores) = get_masked_start_fwd_pass( - event_means, r_ref_means, r_ref_sds, - mapped_start_offset, skip_pen, stay_pen, z_shift, bandwidth, - events_per_base, reg_id=reg_id, debug_fps=debug_fps) + start_event_starts, start_z_scores) = _get_masked_start_fwd_pass( + event_means, r_ref_means, r_ref_sds, 0, + rsqgl_params, (rsqgl_params.bandwidth // 2) / float(MASK_BASES)) start_seq_len = start_event_starts.shape[0] - fwd_pass = np.empty((seq_len+1, bandwidth), dtype=np.float64) + fwd_pass = np.empty((seq_len+1, rsqgl_params.bandwidth), dtype=np.float64) fwd_pass[:start_seq_len+1] = start_fwd_pass - fwd_pass_move = np.empty((seq_len+1, bandwidth), dtype=np.int64) + fwd_pass_move = np.empty((seq_len+1, rsqgl_params.bandwidth), dtype=np.int64) fwd_pass_move[:start_seq_len+1] = start_fwd_pass_move band_event_starts = np.empty((seq_len,), dtype=np.int64) band_event_starts[:start_seq_len] = start_event_starts - #fwd_pass[start_seq_len+1:,:] = np.NAN - #fwd_pass_move[start_seq_len+1:,:] = np.NAN - #band_event_starts[start_seq_len:] = np.NAN - # if max_half_z_score is none set it to valid float for cython # z-score computation - if max_half_z_score is None: + if rsqgl_params.max_half_z_score is None: do_winsorize_z = False - max_half_z_score = 0.0 + rsqgl_params = rsqgl_params._replace(max_half_z_score = 0.0) else: do_winsorize_z = True - if _DEBUG_FULL or _DEBUG_MIDDLE: - rest_z_scores = c_adaptive_banded_forward_pass( + if _DEBUG_CLIP_START or _DEBUG_START_CLIP_FIT: + # save z-scores for debug plotting + rest_z_scores = th.adaptive_banded_forward_pass( fwd_pass, fwd_pass_move, band_event_starts, event_means, - r_ref_means, r_ref_sds, z_shift, skip_pen, stay_pen, - start_seq_len, MASK_FILL_Z_SCORE, do_winsorize_z, max_half_z_score, - return_z_scores=True) - 
shifted_z_scores = np.empty((seq_len, bandwidth), dtype=np.float64) + r_ref_means, r_ref_sds, + z_shift=rsqgl_params.z_shift, + skip_pen=rsqgl_params.skip_pen, stay_pen=rsqgl_params.stay_pen, + start_seq_pos=start_seq_len, mask_fill_z_score=MASK_FILL_Z_SCORE, + do_winsorize_z=do_winsorize_z, + max_half_z_score=rsqgl_params.max_half_z_score, return_z_scores=True) + shifted_z_scores = np.empty(( + seq_len, rsqgl_params.bandwidth), dtype=np.float64) shifted_z_scores[:start_seq_len] = start_z_scores shifted_z_scores[start_seq_len:] = rest_z_scores else: - c_adaptive_banded_forward_pass( + th.adaptive_banded_forward_pass( fwd_pass, fwd_pass_move, band_event_starts, event_means, - r_ref_means, r_ref_sds, z_shift, skip_pen, stay_pen, - start_seq_len, MASK_FILL_Z_SCORE, do_winsorize_z, max_half_z_score) + r_ref_means, r_ref_sds, + z_shift=rsqgl_params.z_shift, + skip_pen=rsqgl_params.skip_pen, stay_pen=rsqgl_params.stay_pen, + start_seq_pos=start_seq_len, mask_fill_z_score=MASK_FILL_Z_SCORE, + do_winsorize_z=do_winsorize_z, + max_half_z_score=rsqgl_params.max_half_z_score) top_max_pos = np.argmax(fwd_pass[-1,:]) - if _DEBUG_FULL: - _write_full_debug(fwd_pass_move, band_event_starts, top_max_pos, - shifted_z_scores, debug_fps[2], debug_fps[3], reg_id, - fwd_pass[-1,top_max_pos]) - if _DEBUG_MIDDLE: - _write_middle_debug(shifted_z_scores, fwd_pass, band_event_starts, - debug_fps[0], reg_id) - _write_tb_debug(fwd_pass_move, band_event_starts, top_max_pos, - debug_fps[1], reg_id) - - read_tb = c_banded_traceback( - fwd_pass_move, band_event_starts, top_max_pos, band_boundary_thresh) + if _DEBUG_CLIP_START: + _debug_plot_dp(shifted_z_scores, fwd_pass, band_event_starts, + fwd_pass_move, top_max_pos, short=True, reg_id=reg_id) + + if _DEBUG_START_CLIP_FIT: + _debug_fit(fwd_pass_move, band_event_starts, top_max_pos, + shifted_z_scores, reg_id, fwd_pass[-1,top_max_pos], + rsqgl_params.bandwidth, event_means, r_ref_means) + + read_tb = th.banded_traceback( + fwd_pass_move, band_event_starts, top_max_pos, + rsqgl_params.band_bound_thresh) + # trim invalid traceback positions + read_tb = _trim_traceback(read_tb, events_len=event_means.shape[0]) + + assert len(start_clip_bases) >= std_ref.central_pos, ( + 'Invalid start clip base processing.') + start_loc = read_tb[len(start_clip_bases) - std_ref.central_pos] + events_per_base = (read_tb[-1] - start_loc) / ( + len(start_genome_seq) - dnstrm_bases) - start_trim_i = 0 - while read_tb[start_trim_i] < 0: - read_tb[start_trim_i] = 0 - start_trim_i += 1 - end_trim_i = 1 - events_len = event_means.shape[0] - while read_tb[-end_trim_i] > events_len: - read_tb[-end_trim_i] = events_len - end_trim_i += 1 + return start_loc, events_per_base - seq_segs = valid_cpts[read_tb] +def get_rel_raw_coords(valid_cpts, seq_events): + """get raw coordinates relative to the start of the assigned signal + """ + seq_segs = valid_cpts[seq_events] read_start_rel_to_raw = seq_segs[0] seq_segs = seq_segs - read_start_rel_to_raw + return seq_segs, read_start_rel_to_raw + +def find_adaptive_base_assignment( + valid_cpts, event_means, rsqgl_params, std_ref, genome_seq, + start_clip_bases=None, start_clip_params=START_CLIP_PARAMS, + seq_samp_type=th.seqSampleType(DNA_SAMP_TYPE, False), reg_id=None): + """Align expected (from genome sequence) signal levels to observed using a dynamic programming approach with adaptive bandwidth start positions + + Args: + valid_cpts (`np.array::np.int64`): raw signal base start locations + event_means (`np.array::np.float64`): event normalized raw signal means 
+ rsqgl_params (:class:`tombo.tombo_helper.resquiggleParams`): parameters for the re-squiggle algorithm + std_ref (:class:`tombo.tombo_stats.TomboModel`): canonical model + genome_seq (str): genome sequence (from mapping) + start_clip_bases (str): mapping read clipped bases + start_clip_params (:class:`tombo.tombo_helper.startClipParams`): start clip basecall params + seq_samp_type (:class:`tombo.tombo_helper.seqSampleType`): sequencing sample type (default: DNA) + reg_id (str): debug + + Returns: + :class:`tombo.tombo_helper.dpResults` + """ + def get_short_read_results(r_ref_means, r_ref_sds, genome_seq): + seq_events = find_static_base_assignment( + event_means, r_ref_means, r_ref_sds, rsqgl_params, reg_id=reg_id) + seq_segs, read_start_rel_to_raw = get_rel_raw_coords( + valid_cpts, seq_events) + return th.dpResults( + read_start_rel_to_raw=read_start_rel_to_raw, segs=seq_segs, + ref_means=r_ref_means, ref_sds=r_ref_sds, genome_seq=genome_seq) + + def run_fwd_pass(): + # find full sequence to events path using a smaller bandwidth + (start_fwd_pass, start_fwd_pass_move, + start_event_starts, start_z_scores) = _get_masked_start_fwd_pass( + event_means[events_start_clip:], r_ref_means, r_ref_sds, + mapped_start_offset, rsqgl_params, events_per_base) + start_seq_len = start_event_starts.shape[0] + fwd_pass = np.empty((seq_len+1, rsqgl_params.bandwidth), dtype=np.float64) + fwd_pass[:start_seq_len+1] = start_fwd_pass + fwd_pass_move = np.empty((seq_len+1, rsqgl_params.bandwidth), + dtype=np.int64) + fwd_pass_move[:start_seq_len+1] = start_fwd_pass_move + band_event_starts = np.empty((seq_len,), dtype=np.int64) + band_event_starts[:start_seq_len] = start_event_starts + + # if max_half_z_score is none set it to valid float for cython + # z-score computation + if rsqgl_params.max_half_z_score is None: + do_winsorize_z = False + wa_rsqgl_params = rsqgl_params._replace(max_half_z_score = 0.0) + else: + do_winsorize_z = True + wa_rsqgl_params = rsqgl_params + + shifted_z_scores = None + if _DEBUG_FIT or _DEBUG_BANDWIDTH or _DEBUG_DP_ENDS: + # save z-scores for debug plotting + rest_z_scores = th.adaptive_banded_forward_pass( + fwd_pass, fwd_pass_move, band_event_starts, + event_means[events_start_clip:], r_ref_means, r_ref_sds, + wa_rsqgl_params.z_shift, wa_rsqgl_params.skip_pen, + wa_rsqgl_params.stay_pen, start_seq_len, MASK_FILL_Z_SCORE, + do_winsorize_z, wa_rsqgl_params.max_half_z_score, + return_z_scores=True) + shifted_z_scores = np.empty(( + seq_len, wa_rsqgl_params.bandwidth), dtype=np.float64) + shifted_z_scores[:start_seq_len] = start_z_scores + shifted_z_scores[start_seq_len:] = rest_z_scores + else: + th.adaptive_banded_forward_pass( + fwd_pass, fwd_pass_move, band_event_starts, + event_means[events_start_clip:], + r_ref_means, r_ref_sds, wa_rsqgl_params.z_shift, + wa_rsqgl_params.skip_pen, wa_rsqgl_params.stay_pen, start_seq_len, + MASK_FILL_Z_SCORE, do_winsorize_z, + wa_rsqgl_params.max_half_z_score) + + return fwd_pass, fwd_pass_move, band_event_starts, shifted_z_scores + + def plot_debug(shifted_z_scores): + if _DEBUG_FIT or _DEBUG_BANDWIDTH: + _debug_fit( + fwd_pass_move, band_event_starts, top_max_pos, shifted_z_scores, + reg_id, fwd_pass[-1, top_max_pos], rsqgl_params.bandwidth, + event_means[events_start_clip:], r_ref_means) + if _DEBUG_DP_ENDS: + _debug_plot_dp(shifted_z_scores, fwd_pass, band_event_starts, + fwd_pass_move, top_max_pos, reg_id) + return + + + # if start clip bases are provided, run better start identification algorithm + if (start_clip_bases is not None and + 
len(genome_seq) > start_clip_params.num_genome_bases): + if len(start_clip_bases) < std_ref.central_pos: + mapped_start = len(start_clip_bases) * 2 + events_per_base = 2 + else: + clip_params = rsqgl_params._replace( + bandwidth=start_clip_params.bandwidth) + mapped_start, events_per_base = find_seq_start_from_clip_basecalls( + event_means, clip_params, start_clip_bases, genome_seq, std_ref, + start_clip_params.num_genome_bases, reg_id=reg_id) + + dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 + r_ref_means, r_ref_sds, _, _ = ts.get_ref_from_seq(genome_seq, std_ref) + # trim genome seq to match model-able positions + genome_seq = genome_seq[std_ref.central_pos:-dnstrm_bases] + seq_len = len(genome_seq) + if seq_len != r_ref_means.shape[0]: + raise th.TomboError('Discordant reference and sequence lengths.') + + # if read was too short for start clip map start discovery, but need r_ref* + if (start_clip_bases is not None and + seq_len <= start_clip_params.num_genome_bases): + return get_short_read_results(r_ref_means, r_ref_sds, genome_seq) + + if start_clip_bases is None: + # for short reads, just search the whole read with an appropriate + # bandwidth + if (event_means.shape[0] < rsqgl_params.start_bw + + rsqgl_params.start_n_bases or + seq_len < rsqgl_params.start_n_bases): + return get_short_read_results(r_ref_means, r_ref_sds, genome_seq) + try: + # identify the start of genomic sequence within raw signal + mapped_start, events_per_base = find_seq_start_in_events( + event_means, r_ref_means, r_ref_sds, rsqgl_params, + rsqgl_params.start_n_bases, rsqgl_params.start_bw, + seq_samp_type, reg_id=reg_id) + except th.TomboError: + if (event_means.shape[0] < rsqgl_params.start_save_bw + + rsqgl_params.start_n_bases): + return get_short_read_results(r_ref_means, r_ref_sds, genome_seq) + # if smaller (and faster) bandwidth did not find a sufficiently + # scoring raw signal mapping, try again with larger save_bandwidth + # and don't check score (by not passing seq_samp_type) + mapped_start, events_per_base = find_seq_start_in_events( + event_means, r_ref_means, r_ref_sds, rsqgl_params, + rsqgl_params.start_n_bases, rsqgl_params.start_save_bw, + reg_id=reg_id) + + if events_per_base == 0: + raise th.TomboError( + 'Very poor signal quality. 
Read likely includes open pore.') + + # get number of events to clip and how far into the events the + # discovered start is located + half_bandwidth = rsqgl_params.bandwidth // 2 + if mapped_start < half_bandwidth: + events_start_clip = 0 + mapped_start_offset = mapped_start + else: + events_start_clip = mapped_start - half_bandwidth + mapped_start_offset = half_bandwidth + + # process long enough reads that start too far into read for normal + # adaptive processing just as with short reads + if (int((half_bandwidth + 1) / events_per_base) >= r_ref_means.shape[0] or + (event_means.shape[0] - mapped_start_offset - + events_start_clip < rsqgl_params.bandwidth)): + return get_short_read_results(r_ref_means, r_ref_sds, genome_seq) + + fwd_pass, fwd_pass_move, band_event_starts, shifted_z_scores = run_fwd_pass() + + # find position of last base at the maximal score + top_max_pos = np.argmax(fwd_pass[-1,:]) - return (seq_segs, r_ref_means, r_ref_sds, read_start_rel_to_raw, - genome_seq, genome_loc) + # plot debugging if requested + plot_debug(shifted_z_scores) + + read_tb = th.banded_traceback( + fwd_pass_move, band_event_starts, top_max_pos, + rsqgl_params.band_bound_thresh) + # trim invalid traceback positions + read_tb = _trim_traceback( + read_tb, events_len=event_means.shape[0] - events_start_clip) + + # get segment positions within raw signal vector + seq_segs, read_start_rel_to_raw = get_rel_raw_coords( + valid_cpts[events_start_clip:], read_tb) + + return th.dpResults( + read_start_rel_to_raw=read_start_rel_to_raw, segs=seq_segs, + ref_means=r_ref_means, ref_sds=r_ref_sds, genome_seq=genome_seq) ###################################### ########## Re-squiggle Read ########## ###################################### -def resquiggle_read( - all_raw_signal, channel_info, genome_seq, genome_loc, - align_info, std_ref, outlier_thresh, corr_grp, - bio_samp_type, seg_params, sig_aln_params, - fast5_fn=None, max_raw_cpts=MAX_RAW_CPTS, - min_event_to_seq_ratio=MIN_EVENT_TO_SEQ_RATIO, skip_index=False, - reg_id=None, debug_fps=None, const_scale=None, skip_seq_scaling=False, - scale_values=None, use_save_bandwith=False): - """ - Perform banded dynamic programming sequence to event alignment for this read - - :param all_raw_signal: Vector containing raw (DAC) current signal values - :param channel_info: Channel info containing info for signal normalization - :param fast5_fn: Relative path to filename for index creation - :param genome_seq: Genomic sequence for this read - :param genome_loc: Mapped genomic location named tuple for this read - :param align_info: A alignInfo named tuple for this read - :param std_ref: A TomboModel object - :param outlier_thresh: Outlier threshold for raw signal normalization - :param bc_grp: The basecalled read group to analyze - :param corr_grp: The tombo corrected group to write results - :param bio_samp_type: Biological sample type (either 'DNA' or 'RNA' or - None to determine from read) - :param seg_params: 3 segmenation parameters (mean_obs_per_event, - running_stat_width and min_obs_per_base) - :param sig_aln_params: Signal align parameters (match_evalue, skip_pen - bandwidth, save_bandwidth, signal_matching_threshold and windsorizor - score) +def segment_signal( + map_res, num_events, rsqgl_params, outlier_thresh=None, const_scale=None): + """Normalize and segment raw signal as defined by `rsqgl_params` into `num_events`. 
+ + Args: + map_res (:class:`tombo.tombo_helper.resquiggleResults`): mapping results + num_events (int): number of events to process + rsqgl_params (:class:`tombo.tombo_helper.resquiggleParams`): parameters for the re-squiggle algorithm + outlier_thresh (float): winsorize signal greater than this value (optional) + + Returns: + 1) identified event positions (0-based) + 2) normalized raw signal + 3) scale values (:class:`tombo.tombo_helper.scaleValues`) """ - # flip raw signal for re-squiggling - is_rna = bio_samp_type == 'RNA' - if is_rna: - all_raw_signal = all_raw_signal[::-1] - - if sig_aln_params is None: - (match_evalue, skip_pen, bandwidth, save_bandwidth, - max_half_z_score) = ALGN_PARAMS_TABLE[bio_samp_type] - else: - # unpack signal alignment parameters - (match_evalue, skip_pen, bandwidth, save_bandwidth, - max_half_z_score) = sig_aln_params - bandwidth = int(bandwidth) - save_bandwidth = int(save_bandwidth) - - if use_save_bandwith: - bandwidth = save_bandwidth - z_shift, stay_pen = ts.get_dynamic_prog_params(match_evalue) - - if seg_params is None: - (running_stat_width, min_obs_per_base, - mean_obs_per_event) = SEG_PARAMS_TABLE[bio_samp_type] + if rsqgl_params.use_t_test_seg: + # RNA bases show consistent variable spread so use t-test segmentation + valid_cpts = th.valid_cpts_w_cap_t_test( + map_res.raw_signal.astype(np.float64), rsqgl_params.min_obs_per_base, + rsqgl_params.running_stat_width, num_events) + + # remove cpts within stall locations + if map_res.stall_ints is not None: + valid_cpts = ts.remove_stall_cpts(map_res.stall_ints, valid_cpts) + + if map_res.scale_values is not None: + norm_signal, new_scale_values = ts.normalize_raw_signal( + map_res.raw_signal, scale_values=map_res.scale_values) + elif const_scale is not None: + norm_signal, new_scale_values = ts.normalize_raw_signal( + map_res.raw_signal, norm_type='median_const_scale', + outlier_thresh=outlier_thresh, const_scale=const_scale) + else: + if USE_RNA_EVENT_SCALE: + scale_values = ts.get_scale_values_from_events( + map_res.raw_signal, valid_cpts, outlier_thresh, + num_events=RNA_SCALE_NUM_EVENTS, + max_frac_events=RNA_SCALE_MAX_FRAC_EVENTS) + else: + scale_values = None + norm_signal, new_scale_values = ts.normalize_raw_signal( + map_res.raw_signal, scale_values=scale_values) else: - (running_stat_width, min_obs_per_base, mean_obs_per_event) = seg_params + # normalize signal + if map_res.scale_values is not None: + norm_signal, new_scale_values = ts.normalize_raw_signal( + map_res.raw_signal, scale_values=map_res.scale_values) + elif const_scale is not None: + norm_signal, new_scale_values = ts.normalize_raw_signal( + map_res.raw_signal, norm_type='median_const_scale', + outlier_thresh=outlier_thresh, const_scale=const_scale) + else: + norm_signal, new_scale_values = ts.normalize_raw_signal( + map_res.raw_signal, norm_type='median', + outlier_thresh=outlier_thresh) + + valid_cpts = th.valid_cpts_w_cap( + norm_signal, rsqgl_params.min_obs_per_base, + rsqgl_params.running_stat_width, num_events) + # remove cpts within stall locations + if map_res.stall_ints is not None: + valid_cpts = ts.remove_stall_cpts(map_res.stall_ints, valid_cpts) + + return valid_cpts, norm_signal, new_scale_values +
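# Editor's note: a minimal sketch (not part of the patch) of the
# ``norm_type='median'`` normalization used by default above: signal is
# shifted by its median and scaled by a robust spread estimate (shown here as
# the median absolute deviation). This illustrates the concept only; tombo's
# exact scale estimation in ``ts.normalize_raw_signal`` may differ.
import numpy as np

def median_normalize(raw_signal):
    shift = np.median(raw_signal)
    scale = np.median(np.abs(raw_signal - shift))  # assumed MAD-style scale
    return (raw_signal - shift) / scale

norm_sig = median_normalize(np.random.normal(90, 10, 1000))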
+def resquiggle_read( + map_res, std_ref, rsqgl_params, outlier_thresh=None, + all_raw_signal=None, max_raw_cpts=MAX_RAW_CPTS, + min_event_to_seq_ratio=MIN_EVENT_TO_SEQ_RATIO, const_scale=None, + skip_seq_scaling=False, + seq_samp_type=th.seqSampleType(DNA_SAMP_TYPE, False)): + """Identify raw signal to genome sequence assignment, using adaptive banded dynamic programming + + Args: + map_res (:class:`tombo.tombo_helper.resquiggleResults`): mapping results + std_ref (:class:`tombo.tombo_stats.TomboModel`): canonical base model + rsqgl_params (:class:`tombo.tombo_helper.resquiggleParams`): parameters for the re-squiggle algorithm + outlier_thresh (float): winsorize signal greater than this value (optional) + all_raw_signal (`np.array::np.int64`): raw data acquisition (DAC) current signal values (optional; default use value in map_res) + max_raw_cpts (int): read will fail if more than `max_raw_cpts` must be found to produce valid re-squiggle results (optional) + min_event_to_seq_ratio (float): minimum event to sequence ratio (optional) + const_scale (float): constant scale value (optional; may be deprecated) + skip_seq_scaling (bool): skip sequence-based scaling step + seq_samp_type (:class:`tombo.tombo_helper.seqSampleType`): sequencing sample type (default: DNA) + + Returns: + :class:`tombo.tombo_helper.resquiggleResults` containing raw signal to genome sequence alignment """ + if all_raw_signal is not None: + map_res = map_res._replace(raw_signal = all_raw_signal) + if map_res.raw_signal is None: + raise th.TomboError( + 'Must have raw signal in order to complete re-squiggle algorithm') # compute number of events to find # ensure at least a minimal number of events per mapped sequence are found - num_events = max(all_raw_signal.shape[0] // mean_obs_per_event, - int(len(genome_seq) * min_event_to_seq_ratio)) + num_mapped_bases = len(map_res.genome_seq) - std_ref.kmer_width + 1 + num_events = ts.compute_num_events( + map_res.raw_signal.shape[0], num_mapped_bases, + rsqgl_params.mean_obs_per_event, min_event_to_seq_ratio) # ensure that there isn't *far* too much signal for the mapped sequence # i.e. one adaptive bandwidth per base is too much to find a good mapping - if num_events / bandwidth > len(genome_seq): - raise NotImplementedError('Too much raw signal for mapped sequence') - - # normalize signal - # note that channel_info is only used for pA normalization, which is not - # available here. This option is retained here in case some channel - # info should become useful in the future. The primary target for this is - # the before median parameter. 
- if scale_values is not None: - norm_signal, scale_values = ts.normalize_raw_signal( - all_raw_signal, 0, all_raw_signal.shape[0], - scale_values=scale_values) - elif const_scale is not None: - norm_signal, scale_values = ts.normalize_raw_signal( - all_raw_signal, 0, all_raw_signal.shape[0], - 'median_const_scale', channel_info, outlier_thresh, - const_scale=const_scale) - else: - norm_signal, scale_values = ts.normalize_raw_signal( - all_raw_signal, 0, all_raw_signal.shape[0], - 'median', channel_info, outlier_thresh) - - (segs, r_ref_means, r_ref_sds, read_start_rel_to_raw, - genome_seq, genome_loc) = find_adaptive_base_assignment( - norm_signal, running_stat_width, min_obs_per_base, num_events, std_ref, - genome_seq, genome_loc, skip_pen, stay_pen, z_shift, bandwidth, is_rna, - reg_id=reg_id, debug_fps=debug_fps, max_half_z_score=max_half_z_score) - norm_signal = norm_signal[read_start_rel_to_raw: - read_start_rel_to_raw + segs[-1]] + if num_events / rsqgl_params.bandwidth > num_mapped_bases: + raise th.TomboError('Too much raw signal for mapped sequence') + + valid_cpts, norm_signal, new_scale_values = segment_signal( + map_res, num_events, rsqgl_params, outlier_thresh, const_scale) + event_means = ts.compute_base_means(norm_signal, valid_cpts) + + dp_res = find_adaptive_base_assignment( + valid_cpts, event_means, rsqgl_params, std_ref, map_res.genome_seq, + start_clip_bases=map_res.start_clip_bases, + seq_samp_type=seq_samp_type, reg_id=map_res.align_info.ID) + # clip raw signal to only part mapping to genome seq + norm_signal = norm_signal[dp_res.read_start_rel_to_raw: + dp_res.read_start_rel_to_raw + dp_res.segs[-1]] # identify all stretches of genomic deletions within del_fix_window # to be fixed. - segs = get_model_fit_segs( - segs, norm_signal, r_ref_means, r_ref_sds, - min_obs_per_base, max_raw_cpts, max_half_z_score=max_half_z_score) + segs = resolve_skipped_bases_with_raw( + dp_res, norm_signal, rsqgl_params, max_raw_cpts) if skip_seq_scaling: norm_params_changed = False else: (shift, scale, shift_corr_factor, scale_corr_factor) = ts.calc_kmer_fitted_shift_scale( - scale_values.shift, scale_values.scale, - c_new_means(norm_signal, segs), r_ref_means, method='theil_sen') - scale_values = th.scaleValues( - shift, scale, scale_values.lower_lim, scale_values.upper_lim) + new_scale_values.shift, new_scale_values.scale, + ts.compute_base_means(norm_signal, segs), dp_res.ref_means, + method='theil_sen') + new_scale_values = new_scale_values._replace( + shift=shift, scale=scale, outlier_thresh=outlier_thresh) # re-normalize signal with new fitted parameters norm_signal = (norm_signal - shift_corr_factor) / scale_corr_factor # determine if normalization parameters changed enough to warrant @@ -859,41 +1133,51 @@ def resquiggle_read( np.abs(shift_corr_factor) > SHIFT_CHANGE_THRESH or np.abs(scale_corr_factor - 1) > SCALE_CHANGE_THRESH) - sig_match_score = get_read_seg_score(c_new_means(norm_signal, segs), - r_ref_means, r_ref_sds) - if segs.shape[0] != len(genome_seq) + 1: - raise ValueError('Aligned sequence does not match number ' + - 'of segments produced') + sig_match_score = ts.get_read_seg_score( + ts.compute_base_means(norm_signal, segs), + dp_res.ref_means, dp_res.ref_sds) + if segs.shape[0] != len(dp_res.genome_seq) + 1: + raise th.TomboError('Aligned sequence does not match number ' + + 'of segments produced') # Output for testing/visualization of re-squiggle if _DEBUG_PARAMS: _write_params_debug( - norm_signal, segs, r_ref_means, r_ref_sds, - running_stat_width, 
min_obs_per_base, mean_obs_per_event, - match_evalue, skip_pen, bandwidth, fast5_fn) - if _DEBUG_FIT: - _write_fit_debug( - norm_signal, segs, r_ref_means, r_ref_sds, genome_seq) + norm_signal, segs, dp_res.ref_means, dp_res.ref_sds, + rsqgl_params, map_res.align_info.ID) - return (genome_loc, read_start_rel_to_raw, segs, genome_seq, norm_signal, - scale_values, corr_grp, align_info, is_rna, sig_match_score, - norm_params_changed) + return map_res._replace( + read_start_rel_to_raw=dp_res.read_start_rel_to_raw, segs=segs, + genome_seq=dp_res.genome_seq, raw_signal=norm_signal, + scale_values=new_scale_values, sig_match_score=sig_match_score, + norm_params_changed=norm_params_changed) ####################################### ########## Genomic Alignment ########## ####################################### -def get_read_seq(fast5_data, bc_grp, bc_subgrp, bio_samp_type, q_score_thresh): - """ - Extract the read sequence from the Fastq slot providing useful error - messages +def get_read_seq( + fast5_data, bc_grp='Basecall_1D_000', bc_subgrp='BaseCalled_template', + seq_samp_type=th.seqSampleType(DNA_SAMP_TYPE, False), q_score_thresh=0): + """Extract read sequence from the Fastq slot providing useful error messages + + Args: + fast5_data (:class:`tombo.tombo_helper.readData`): read information + bc_grp (str): group location containing read information (optional; default: 'Basecall_1D_000') + bc_subgrp (str): sub-group location containing read information (optional; default: 'BaseCalled_template') + seq_samp_type (:class:`tombo.tombo_helper.seqSampleType`): sequencing sample type (default: DNA) + q_score_thresh (float): basecalling mean q-score threshold (optional; default: 0/no filtering) + + Returns: + :class:`tombo.tombo_helper.sequenceData` """ try: fastq_raw_value = fast5_data[ - '/Analyses/' + bc_grp + '/' + bc_subgrp + '/Fastq'].value - except: - raise NotImplementedError('Fastq slot not present in --basecall-group') + '/Analyses/' + bc_grp + '/' + bc_subgrp + '/Fastq'][()] + except KeyError: + raise th.TomboError('Fastq slot not present in --basecall-group') # depending on how fastq data was stored it may already be encoded # as unicode, so this would fail. 
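# Editor's note: a minimal sketch (not part of the patch) of the mean q-score
# computation that ``th.get_mean_q_score`` replaces in the hunk below. FASTQ
# quality characters are PHRED+33 encoded, so the mean q-score is the mean of
# each quality character's ordinal value minus the PHRED offset (33), exactly
# as the removed Python 3 branch computed it.
import numpy as np

PHRED_BASE = 33

def mean_q_score(read_q):
    # bytes iteration yields integer ordinals in Python 3
    return np.mean([q_val - PHRED_BASE for q_val in read_q.encode('ASCII')])

assert abs(mean_q_score('III') - 40.0) < 1e-6  # 'I' encodes q-score 40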
@@ -906,45 +1190,56 @@ def get_read_seq(fast5_data, bc_grp, bc_subgrp, bio_samp_type, q_score_thresh): read_seq, read_q = s_fastq[1], s_fastq[3] # compute read q-score - if sys.version_info[0] > 2: - mean_q_score = np.mean([q_val - PHRED_BASE - for q_val in read_q.encode('ASCII')]) - else: - mean_q_score = np.mean([ord(q_val) - PHRED_BASE - for q_val in read_q.encode('ASCII')]) + mean_q_score = th.get_mean_q_score(read_q) if q_score_thresh is not None and mean_q_score < q_score_thresh: - raise NotImplementedError('Read filtered by q-score.') + raise th.TomboError('Read filtered by q-score.') read_data = th.get_raw_read_slot(fast5_data) # looks like read_id attribute has been removed in some files and attribute # is not really necessary for tombo try: - read_id = read_data.attrs['read_id'] - except: + read_id = read_data.attrs['read_id'] + except KeyError: try: - read_id = unicode(read_data.attrs['read_num']) - except: + read_id = unicode(read_data.attrs['read_num']) + except KeyError: read_id = unicode(np.random.randint(1000000000)) - try: - if bio_samp_type is None: - bio_samp_type = 'RNA' if th.is_read_rna(fast5_data) else 'DNA' - except: - raise NotImplementedError('Cannot determine whether read is DNA or RNA') - if bio_samp_type == 'RNA': + # only really here for the API + if seq_samp_type is None: + seq_samp_type = th.get_seq_sample_type(fast5_data) + if seq_samp_type.name == RNA_SAMP_TYPE: read_seq = th.rev_transcribe(read_seq) - return read_seq, read_id, bio_samp_type, mean_q_score - -def map_read(fast5_data, bc_grp, bc_subgrp, corr_grp, aligner, bio_samp_type, - map_thr_buf, q_score_thresh): - read_seq, read_id, bio_samp_type, mean_q_score = get_read_seq( - fast5_data, bc_grp, bc_subgrp, bio_samp_type, q_score_thresh) + return th.sequenceData(seq=read_seq, id=read_id, mean_q_score=mean_q_score) +
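# Editor's note: a minimal sketch (not part of the patch) of the mappy calls
# that ``map_read`` (below) builds on: construct an Aligner, map a basecalled
# sequence with a re-usable thread buffer, and read alignment attributes. The
# reference path and query sequence here are hypothetical.
import mappy

aligner = mappy.Aligner('reference.fasta', preset='map-ont')
buf = mappy.ThreadBuffer()  # per-thread buffer re-used across map() calls
try:
    aln = next(aligner.map('ACGTACGTACGT', buf=buf))
    print(aln.ctg, aln.r_st, aln.r_en, aln.strand, aln.mlen)
except StopIteration:
    print('Alignment not produced')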
+def map_read( + fast5_data, aligner, std_ref, + seq_samp_type=th.seqSampleType(DNA_SAMP_TYPE, False), + bc_grp='Basecall_1D_000', bc_subgrp='BaseCalled_template', + map_thr_buf=None, q_score_thresh=0): + """Map basecalled read sequence to the reference sequence, providing useful error messages + + Args: + fast5_data (:class:`tombo.tombo_helper.readData`): read information + aligner (mappy.Aligner): aligner object + std_ref (:class:`tombo.tombo_stats.TomboModel`): canonical model (in order to extract extended genomic sequence) + seq_samp_type (:class:`tombo.tombo_helper.seqSampleType`): sequencing sample type (default: DNA) + bc_grp (str): group location containing read information (optional; default: 'Basecall_1D_000') + bc_subgrp (str): sub-group location containing read information (optional; default: 'BaseCalled_template') + map_thr_buf (mappy.ThreadBuffer): mappy thread buffer object (optional; default: None) + q_score_thresh (float): basecalling mean q-score threshold (optional; default: 0/no filtering) + + Returns: + :class:`tombo.tombo_helper.resquiggleResults` containing valid mapping values """ + seq_data = get_read_seq( + fast5_data, bc_grp, bc_subgrp, seq_samp_type, q_score_thresh) try: - alignment = next(aligner.map(str(read_seq), buf=map_thr_buf)) + alignment = next(aligner.map(str(seq_data.seq), buf=map_thr_buf)) except StopIteration: - raise NotImplementedError('Alignment not produced') + raise th.TomboError('Alignment not produced') chrm = alignment.ctg # subtract one to put into 0-based index @@ -952,7 +1247,7 @@ def map_read(fast5_data, bc_grp, bc_subgrp, corr_grp, aligner, bio_samp_type, ref_end = alignment.r_en strand = '+' if alignment.strand == 1 else '-' num_match = alignment.mlen - num_ins, num_del, num_aligned = 0, 0 ,0 + num_ins, num_del, num_aligned = 0, 0, 0 for op_len, op in alignment.cigar: if op == 1: num_ins += op_len elif op in (2,3): num_del += op_len @@ -961,112 +1256,148 @@ else: # soft and hard clipping are not reported in the # mappy cigar - raise NotImplementedError('Invalid cigar operation') + raise th.TomboError('Invalid cigar operation') + + # store number of clipped bases relative to read sequence if strand == '+': - start_clipped_bases = alignment.q_st - end_clipped_bases = len(read_seq) - alignment.q_en + num_start_clipped_bases = alignment.q_st + num_end_clipped_bases = len(seq_data.seq) - alignment.q_en else: - start_clipped_bases = len(read_seq) - alignment.q_en - end_clipped_bases = alignment.q_st + num_start_clipped_bases = len(seq_data.seq) - alignment.q_en + num_end_clipped_bases = alignment.q_st + + align_info = th.alignInfo( + seq_data.id.decode(), bc_subgrp, num_start_clipped_bases, + num_end_clipped_bases, num_ins, num_del, num_match, + num_aligned - num_match) # extract genome sequence from mappy aligner - genome_seq = aligner.seq(chrm, ref_start, ref_end) + # expand sequence to get model levels for all sites (need to handle new + # sequence coordinates downstream) + dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 + if ((seq_samp_type.name == RNA_SAMP_TYPE and strand == '+') or + (seq_samp_type.name == DNA_SAMP_TYPE and strand == '-' and + USE_START_CLIP_BASES) or + (seq_samp_type.name == DNA_SAMP_TYPE and strand == '+' and + not USE_START_CLIP_BASES)): + if ref_start < std_ref.central_pos: + ref_start = std_ref.central_pos + ref_seq_start = ref_start - std_ref.central_pos + ref_seq_end = ref_end + dnstrm_bases else: + if ref_start < dnstrm_bases: + ref_start = dnstrm_bases + ref_seq_start = ref_start - dnstrm_bases + ref_seq_end = ref_end + std_ref.central_pos + genome_seq = aligner.seq(chrm, ref_seq_start, ref_seq_end) + if genome_seq is None or genome_seq == '': + raise th.TomboError('Invalid mapping location') + if sys.version_info[0] < 3: genome_seq = genome_seq.decode() if strand == '-': genome_seq = th.rev_comp(genome_seq) - assert len(genome_seq) == ref_end - ref_start, ( - 'Discordant mapped position and sequence') - align_info = th.alignInfo( - read_id, bc_subgrp, start_clipped_bases, end_clipped_bases, - num_ins, num_del, num_match, num_aligned - num_match) - genome_loc = th.genomeLoc(ref_start, strand, chrm) - - return genome_seq, genome_loc, align_info, bio_samp_type, mean_q_score + # discordant mapping to sequence extraction is due to reads mapping up to the + # end of a sequence record (and don't need to carry around record lens), + # so don't error on these discordant lengths here + #if len(genome_seq) != ref_end - ref_start + std_ref.kmer_width - 1: + # raise th.TomboError('Discordant mapped position and sequence') + genome_loc = th.genomeLocation(ref_start, strand, chrm) + + # store sequence at the end of the read without an adapter + # for simpler read start identification (start of RNA genomic sequence + # end of DNA genomic sequence) + start_clip_bases = None + if USE_START_CLIP_BASES: + start_clip_bases = seq_data.seq[alignment.q_en:][::-1] + + return th.resquiggleResults( + align_info=align_info, genome_loc=genome_loc, genome_seq=genome_seq, + mean_q_score=seq_data.mean_q_score, start_clip_bases=start_clip_bases) def _io_and_map_read( fast5_data, failed_reads_q, bc_subgrps, bc_grp, corr_grp, aligner, - 
bio_samp_type, map_thr_buf, fast5_fn, num_processed, map_conn, + seq_samp_type, map_thr_buf, fast5_fn, num_processed, map_conn, outlier_thresh, compute_sd, obs_filter, index_q, q_score_thresh, - sig_match_thresh): + sig_match_thresh, std_ref): try: # extract channel and raw data for this read channel_info = th.get_channel_info(fast5_data) - all_raw_signal = th.get_raw_read_slot(fast5_data)['Signal'].value - except: - failed_reads_q.put( - ('Channel or raw signal information not found in FAST5 file', - fast5_fn)) - return + except th.TomboError: + # channel info is not needed currently, so just pass + channel_info = None + all_raw_signal = th.get_raw_read_slot(fast5_data)['Signal'][:] for bc_subgrp in bc_subgrps: try: - # TODO warn if reads appear to switch bio sample type - (genome_seq, genome_loc, align_info, bio_samp_type, - mean_q_score) = map_read( - fast5_data, bc_grp, bc_subgrp, corr_grp, aligner, - bio_samp_type, map_thr_buf, q_score_thresh) - if th.invalid_seq(genome_seq): - raise NotImplementedError( + map_res = map_read( + fast5_data, aligner, std_ref, seq_samp_type, + bc_grp, bc_subgrp, map_thr_buf, q_score_thresh) + if th.invalid_seq(map_res.genome_seq): + raise th.TomboError( 'Reference mapping contains non-canonical bases ' + '(transcriptome reference cannot contain U bases)') - map_conn.send([ - all_raw_signal, channel_info, fast5_fn, genome_seq, - genome_loc, align_info, bio_samp_type, num_processed]) + map_res = map_res._replace( + raw_signal=all_raw_signal, channel_info=channel_info) + + # send mapping data to _resquiggle_worker process + map_conn.send([map_res, fast5_fn]) # wait until re-squiggle returns - read_failed, rsqgl_data = map_conn.recv() + read_failed, rsqgl_res = map_conn.recv() + if read_failed: failed_reads_q.put(( - rsqgl_data[0], bc_subgrp + ':::' + rsqgl_data[1])) + rsqgl_res[0], bc_subgrp + ':::' + rsqgl_res[1], + rsqgl_res[2])) continue - # unpack data needed to write new event data - # this is the return data from resquiggle_read - (genome_loc, read_start_rel_to_raw, segs, genome_seq, - norm_signal, scale_values, corr_grp, align_info, - is_rna, sig_match_score) = rsqgl_data if not _DRY_RUN: # write re-squiggle event assignment to the read FAST5 file th.write_new_fast5_group( - fast5_data, genome_loc, read_start_rel_to_raw, segs, - genome_seq, norm_signal, scale_values, corr_grp, - align_info.Subgroup, 'median', outlier_thresh, - compute_sd, align_info=align_info, rna=is_rna, - sig_match_score=sig_match_score) + fast5_data, corr_grp, rsqgl_res, 'median', compute_sd, + rna=seq_samp_type.rev_sig) if index_q is not None: # check that read passes reversible filters is_filtered = False - if sig_match_score > sig_match_thresh: + if rsqgl_res.sig_match_score > sig_match_thresh: failed_reads_q.put(( 'Poor raw to expected signal matching ' + - '(revert with `tombo clear_filters`)', - bc_subgrp + ':::' + fast5_fn)) + '(revert with `tombo filter clear_filters`)', + bc_subgrp + ':::' + fast5_fn, True)) is_filtered = True elif obs_filter is not None: - base_lens = np.diff(segs) + base_lens = np.diff(rsqgl_res.segs) is_filtered = any(np.percentile(base_lens, pctl) > thresh for pctl, thresh in obs_filter) failed_reads_q.put(( 'Read filtered by observation per base ' + - 'thresholds (revert with `tombo clear_filters`)', - bc_subgrp + ':::' + fast5_fn)) + 'thresholds (revert with `tombo filter clear_filters`)', + bc_subgrp + ':::' + fast5_fn, True)) # prep and load data into index queue - index_q.put(th.prep_index_data( - fast5_fn, genome_loc, read_start_rel_to_raw, 
segs, - corr_grp, align_info.Subgroup, is_rna, is_filtered, - sig_match_score, mean_q_score)) - except Exception as e: - # uncomment to identify mysterious errors - #map_conn.send(None) - #raise + mapped_end = ( + rsqgl_res.genome_loc.Start + len(rsqgl_res.segs) - 1) + index_q.put(( + rsqgl_res.genome_loc.Chrom, + rsqgl_res.genome_loc.Strand, th.readData( + rsqgl_res.genome_loc.Start, mapped_end, is_filtered, + rsqgl_res.read_start_rel_to_raw, + rsqgl_res.genome_loc.Strand, fast5_fn, + corr_grp + '/' + bc_subgrp, seq_samp_type.rev_sig, + rsqgl_res.sig_match_score, rsqgl_res.mean_q_score, + rsqgl_res.align_info.ID))) + except th.TomboError as e: try: th.write_error_status( fast5_fn, corr_grp, bc_subgrp, unicode(e)) except: pass failed_reads_q.put(( - unicode(e), bc_subgrp + ':::' + fast5_fn)) + unicode(e), bc_subgrp + ':::' + fast5_fn, True)) + except Exception as e: + # is_tombo_error = False + failed_reads_q.put(( + traceback.format_exc(), bc_subgrp + ':::' + fast5_fn, False)) return @@ -1076,12 +1407,66 @@ def _io_and_map_read( ######################################### def _resquiggle_worker( - rsqgl_conns, std_ref, outlier_thresh, corr_grp, bio_samp_type, - seg_params, sig_aln_params, skip_index, const_scale, skip_seq_scaling, + rsqgl_conns, std_ref, outlier_thresh, corr_grp, seq_samp_type, + rsqgl_params, save_params, const_scale, skip_seq_scaling, max_scaling_iters): - debug_fps = None - if _DEBUG_MIDDLE or _DEBUG_FULL: - debug_fps = _open_debug_fps() + def run_rsqgl_iters(map_res, params, fast5_fn, all_raw_signal): + rsqgl_res = resquiggle_read( + map_res, std_ref, params, outlier_thresh, + const_scale=const_scale, skip_seq_scaling=skip_seq_scaling, + seq_samp_type=seq_samp_type) + n_iters = 1 + while n_iters < max_scaling_iters and rsqgl_res.norm_params_changed: + rsqgl_res = resquiggle_read( + map_res._replace(scale_values=rsqgl_res.scale_values), + std_ref, params, outlier_thresh, all_raw_signal=all_raw_signal, + seq_samp_type=seq_samp_type) + n_iters += 1 + return rsqgl_res + + def adjust_map_res(map_res): + if seq_samp_type.name == RNA_SAMP_TYPE: + if TRIM_RNA_ADAPTER: + # trim DNA adapter off of RNA signal + adapter_end = ts.trim_rna(map_res.raw_signal, rsqgl_params) + # trim off adapter + map_res = map_res._replace( + raw_signal=map_res.raw_signal[adapter_end:]) + + # flip raw signal for re-squiggling + map_res = map_res._replace(raw_signal=map_res.raw_signal[::-1]) + + elif seq_samp_type.name == DNA_SAMP_TYPE and USE_START_CLIP_BASES: + # flip raw signal, genome and start clip seqs for re-squiggling + map_res = map_res._replace( + raw_signal=map_res.raw_signal[::-1], + genome_seq=map_res.genome_seq[::-1]) + + if ((COLLAPSE_RNA_STALLS and seq_samp_type.name == RNA_SAMP_TYPE) or + (COLLAPSE_DNA_STALLS and seq_samp_type.name == DNA_SAMP_TYPE)): + map_res = map_res._replace( + stall_ints=ts.identify_stalls( + map_res.raw_signal, DEFAULT_STALL_PARAMS)) + + return map_res + + def adjust_rsqgl_res(rsqgl_res, all_raw_signal): + if seq_samp_type.name == DNA_SAMP_TYPE and USE_START_CLIP_BASES: + # flip raw signal and events back for storage in genome direction + rev_rsrtr = (all_raw_signal.shape[0] - + rsqgl_res.read_start_rel_to_raw - + rsqgl_res.segs[-1]) + rev_segs = -1 * (rsqgl_res.segs[::-1] - rsqgl_res.segs[-1]) + rsqgl_res = rsqgl_res._replace( + read_start_rel_to_raw=rev_rsrtr, segs=rev_segs, + genome_seq=rsqgl_res.genome_seq[::-1], + raw_signal=rsqgl_res.raw_signal[::-1]) + + return rsqgl_res + + + if _DEBUG_PLOTTING: + _open_debug_pdf() while len(rsqgl_conns) > 0: # get next 
active connection or wait for one to be ready @@ -1101,55 +1486,30 @@ def _resquiggle_worker( del rsqgl_conns[conn_num] continue - (all_raw_signal, channel_info, fast5_fn, genome_seq, genome_loc, - align_info, bio_samp_type, reg_id) = map_info - - rsqgl_data = resquiggle_read( - all_raw_signal, channel_info, genome_seq, genome_loc, - align_info, std_ref, outlier_thresh, corr_grp, bio_samp_type, - seg_params, sig_aln_params, fast5_fn=fast5_fn, - skip_index=skip_index, reg_id=reg_id, debug_fps=debug_fps, - const_scale=const_scale, skip_seq_scaling=skip_seq_scaling) - n_iters = 1 - while n_iters < max_scaling_iters and rsqgl_data[-1]: - rsqgl_data = resquiggle_read( - all_raw_signal, channel_info, genome_seq, genome_loc, - align_info, std_ref, outlier_thresh, corr_grp, bio_samp_type, - seg_params, sig_aln_params, fast5_fn=fast5_fn, - skip_index=skip_index, reg_id=reg_id, debug_fps=debug_fps, - skip_seq_scaling=skip_seq_scaling, scale_values=rsqgl_data[5]) - n_iters += 1 - except Exception as e: + map_res, fast5_fn = map_info + + map_res = adjust_map_res(map_res) + # save un-normalized signal for later iterations + all_raw_signal = map_res.raw_signal try: - rsqgl_data = resquiggle_read( - all_raw_signal, channel_info, genome_seq, genome_loc, - align_info, std_ref, outlier_thresh, corr_grp, bio_samp_type, - seg_params, sig_aln_params, fast5_fn=fast5_fn, - skip_index=skip_index, reg_id=reg_id, debug_fps=debug_fps, - const_scale=const_scale, skip_seq_scaling=skip_seq_scaling, - use_save_bandwith=True) - n_iters = 1 - while n_iters < max_scaling_iters and rsqgl_data[-1]: - rsqgl_data = resquiggle_read( - all_raw_signal, channel_info, genome_seq, genome_loc, - align_info, std_ref, outlier_thresh, corr_grp, - bio_samp_type, seg_params, sig_aln_params, - fast5_fn=fast5_fn, skip_index=skip_index, - reg_id=reg_id, debug_fps=debug_fps, - skip_seq_scaling=skip_seq_scaling, - scale_values=rsqgl_data[5], - use_save_bandwith=True) - n_iters += 1 - except Exception as e: - # uncomment to identify mysterious errors - # added connection closing to avoid deadlocks here - #for rsqgl_conn in rsqgl_conns: - # rsqgl_conn.send(None) - #raise - rsqgl_conn.send([True, [unicode(e), fast5_fn]]) - continue + rsqgl_res = run_rsqgl_iters( + map_res, rsqgl_params, fast5_fn, all_raw_signal) + # if the resquiggle read fails for any reason + except: + rsqgl_res = run_rsqgl_iters( + map_res, save_params, fast5_fn, all_raw_signal) + rsqgl_res = adjust_rsqgl_res(rsqgl_res, all_raw_signal) + except th.TomboError as e: + rsqgl_conn.send([True, [unicode(e), fast5_fn, True]]) + continue + except Exception as e: + rsqgl_conn.send([True, [traceback.format_exc(), fast5_fn, False]]) + continue + + rsqgl_conn.send([False, rsqgl_res]) - rsqgl_conn.send([False, rsqgl_data[:-1]]) + if _DEBUG_PLOTTING: + _close_debug_pdf() return @@ -1163,9 +1523,20 @@ def _resquiggle_worker(*args): def _io_and_mappy_thread_worker( fast5_q, progress_q, failed_reads_q, index_q, bc_grp, bc_subgrps, - corr_grp, aligner, outlier_thresh, compute_sd, sig_aln_params, - sig_match_thresh, obs_filter, bio_samp_type, overwrite, map_conn, - q_score_thresh): + corr_grp, aligner, outlier_thresh, compute_sd, sig_match_thresh, + obs_filter, seq_samp_type, overwrite, map_conn, q_score_thresh, std_ref): + # increase update interval as more reads are provided + proc_update_interval = 1 + def update_progress(num_processed, proc_update_interval): + if num_processed % proc_update_interval == 0: + progress_q.put(proc_update_interval) + # increase update interval as more reads are 
processed + if num_processed == 100: + proc_update_interval = 10 + if num_processed == 1000: + proc_update_interval = 100 + return proc_update_interval + # get mappy aligner thread buffer map_thr_buf = mappy.ThreadBuffer() @@ -1181,13 +1552,10 @@ def _io_and_mappy_thread_worker( # signal that all reads have been processed to child process map_conn.send(None) # update with all reads processed from this thread - progress_q.put(num_processed % PROC_UPDATE_INTERVAL) + progress_q.put(num_processed % proc_update_interval) break num_processed += 1 - if num_processed % PROC_UPDATE_INTERVAL == 0: - progress_q.put(PROC_UPDATE_INTERVAL) - if _DRY_RUN: prep_result = h5py.File(fast5_fn, 'r') else: @@ -1198,19 +1566,23 @@ def _io_and_mappy_thread_worker( fast5_data = prep_result else: failed_reads_q.put(prep_result) + proc_update_interval = update_progress( + num_processed, proc_update_interval) continue try: _io_and_map_read( fast5_data, failed_reads_q, bc_subgrps, bc_grp, corr_grp, - aligner, bio_samp_type, map_thr_buf, fast5_fn, + aligner, seq_samp_type, map_thr_buf, fast5_fn, num_processed, map_conn, outlier_thresh, compute_sd, - obs_filter, index_q, q_score_thresh, sig_match_thresh) + obs_filter, index_q, q_score_thresh, sig_match_thresh, std_ref) finally: try: fast5_data.close() except: - failed_reads_q.put(('Error closing fast5 file', fast5_fn)) + failed_reads_q.put(('Error closing fast5 file', fast5_fn, True)) + proc_update_interval = update_progress( + num_processed, proc_update_interval) return @@ -1219,60 +1591,137 @@ def _io_and_mappy_thread_worker( ########## Multi-process Handling ########## ############################################ -def _get_progress_queue(progress_q, prog_conn, max_value): - if VERBOSE: - th._status_message( - 'Re-squiggling reads (raw signal to genomic sequence alignment).') - bar = tqdm(total=max_value, smoothing=0) +def _get_progress_fail_queues( + progress_q, failed_reads_q, pf_conn, num_reads, failed_reads_fn, + num_update_errors=0): + def format_fail_summ(header, fail_summ=[], num_proc=0, num_errs=None): + summ_errs = sorted(fail_summ)[::-1] + if num_errs is not None: + summ_errs = summ_errs[:num_errs] + if len(summ_errs) < num_errs: + summ_errs.extend([ + (None, '') for _ in range(num_errs - len(summ_errs))]) + errs_str = '\n'.join( + "{:8.1f}% ({:>7} reads)".format(100 * n_fns / float(num_proc), + n_fns) + " : " + '{:<80}'.format(err) + if (n_fns is not None and num_proc > 0) else + ' -----' for n_fns, err in summ_errs) + return '\n'.join((header, errs_str)) - tot_num_rec_proc = 0 - while True: - try: - iter_val = progress_q.get(block=False) - tot_num_rec_proc += iter_val - if VERBOSE: bar.update(iter_val) - except queue.Empty: - if prog_conn.poll(): - break - sleep(0.1) - continue - if VERBOSE: bar.close() - prog_conn.send(tot_num_rec_proc) - - return - -def _get_failed_read_queue(failed_reads_q, failed_read_conn): + if VERBOSE: + th.status_message( + 'Re-squiggling reads (raw signal to genomic sequence alignment).') + if num_update_errors > 0: + # add lines for dynamic error messages + sys.stderr.write( + '\n'.join(['' for _ in range(num_update_errors + 2)])) + bar = tqdm(total=num_reads, smoothing=0) + if num_update_errors > 0: + prog_prefix = ''.join( + [_term_move_up(),] * (num_update_errors + 1)) + '\r' + bar_update_header = ( + str(num_update_errors) + ' most common unsuccessful ' + + 'read types (approx. 
%):')
+        # write failed read update header
+        bar.write(prog_prefix + format_fail_summ(
+            bar_update_header, num_errs=num_update_errors),
+            file=sys.stderr)
+
+    tot_num_rec_proc, last_prog_update = 0, 0
+    non_tombo_errors = []
     failed_reads = defaultdict(list)
     # continue to process the failed reads queue until the end signal
     # is sent via the failed_read_conn
     while True:
         try:
-            errorType, fn = failed_reads_q.get(block=False)
-            failed_reads[errorType].append(fn)
+            tot_num_rec_proc += progress_q.get(block=False)
         except queue.Empty:
-            if failed_read_conn.poll():
-                break
-            sleep(0.1)
-            continue
+            # only update once the progress queue has emptied to make fewer
+            # updates to the bar (can be annoying for cat'ing stderr)
+            if VERBOSE and tot_num_rec_proc > last_prog_update:
+                bar.update(tot_num_rec_proc - last_prog_update)
+                last_prog_update = tot_num_rec_proc
+            try:
+                errorType, fn, is_tombo_error = failed_reads_q.get(block=False)
+                if is_tombo_error:
+                    failed_reads[errorType].append(fn)
+                else:
+                    failed_reads['Unexpected error'].append(fn)
+                    if len(non_tombo_errors) < _MAX_NUM_UNEXP_ERRORS:
+                        non_tombo_errors.append(fn + '\n:::\n' + errorType)
+            except queue.Empty:
+                # check if the all-reads-processed signal was sent from the
+                # main thread
+                if pf_conn.poll():
+                    break
+                if VERBOSE and num_update_errors > 0:
+                    bar.write(prog_prefix + format_fail_summ(
+                        bar_update_header,
+                        [(len(fns), err) for err, fns in failed_reads.items()],
+                        tot_num_rec_proc, num_update_errors),
+                        file=sys.stderr)
+                sleep(0.5)
+                continue
+
+    # empty any entries left in queues after processes have finished
+    while not progress_q.empty():
+        tot_num_rec_proc += progress_q.get(block=False)
+    if VERBOSE and tot_num_rec_proc > last_prog_update:
+        bar.update(tot_num_rec_proc - last_prog_update)
 
-    # empty any entries left in queue after processes have finished
     while not failed_reads_q.empty():
-        errorType, fn = failed_reads_q.get(block=False)
-        failed_reads[errorType].append(fn)
+        errorType, fn, is_tombo_error = failed_reads_q.get(block=False)
+        if is_tombo_error:
+            failed_reads[errorType].append(fn)
+        else:
+            if len(non_tombo_errors) < _MAX_NUM_UNEXP_ERRORS:
+                non_tombo_errors.append(fn + '\n:::\n' + errorType)
+
+    if VERBOSE: bar.close()
+
+    # close out failed read printout
+    fail_summary = [(len(fns), err) for err, fns in failed_reads.items()]
+    if VERBOSE:
+        if len(non_tombo_errors) > 0:
+            # add random value to filename in case multiple runs are made
+            # from the same location
+            unex_err_fn = _UNEXPECTED_ERROR_FN.format(
+                np.random.randint(10000))
+            th.warning_message(
+                'Unexpected errors occurred. 
See full error stack traces ' +
+                'for first (up to) {0:d} errors in "{1}"'.format(
+                    _MAX_NUM_UNEXP_ERRORS, unex_err_fn))
+            with io.open(unex_err_fn, 'w') as fp:
+                fp.write('\n\n'.join(non_tombo_errors) + '\n')
+
+    if len(fail_summary) > 0:
+        total_num_failed = sum(map(itemgetter(0), fail_summary))
+        header = (
+            'Final unsuccessful reads summary ' +
+            '({:.1%} reads unsuccessfully processed; {} total reads):'.format(
+                float(total_num_failed) / num_reads, total_num_failed))
+        th.status_message(format_fail_summ(header, fail_summary, num_reads))
+    else:
+        th.status_message('All reads successfully re-squiggled!')
+
+    if failed_reads_fn is not None:
+        with io.open(failed_reads_fn, 'wt') as fp:
+            fp.write('\n'.join((
+                err + '\t' + ', '.join(fns)
+                for err, fns in failed_reads.items())) + '\n')
 
-    failed_read_conn.send(dict(failed_reads))
+    pf_conn.send(tot_num_rec_proc)
 
     return
 
-def _get_index_queue(index_q, index_conn):
-    all_index_data = []
+def _get_index_queue(index_q, index_conn, fast5s_dir, corr_grp):
+    # open TomboReads object for storing and eventually writing the index data
+    reads_index = th.TomboReads([fast5s_dir,], corr_grp, for_writing=True)
 
     # continue to process the index queue until the end signal
     # is sent via the index_conn
     while True:
         try:
-            r_index_data = index_q.get(block=False)
-            all_index_data.append(r_index_data)
+            reads_index.add_read_data(*index_q.get(block=False))
         except queue.Empty:
             if index_conn.poll():
                 break
@@ -1281,10 +1730,10 @@ def _get_index_queue(index_q, index_conn):
 
     # empty any entries left in queue after processes have finished
     while not index_q.empty():
-        r_index_data = index_q.get(block=False)
-        all_index_data.append(r_index_data)
+        reads_index.add_read_data(*index_q.get(block=False))
 
-    index_conn.send(all_index_data)
+    # write index out to file
+    reads_index.write_index_file()
 
     return
 
@@ -1298,16 +1747,15 @@ def _fill_files_queue(fast5_q, fast5_fns, num_threads):
 
 def resquiggle_all_reads(
         fast5_fns, aligner, bc_grp, bc_subgrps, corr_grp, std_ref,
-        bio_samp_type, outlier_thresh, overwrite, num_ps, threads_per_proc,
-        compute_sd, skip_index, sig_aln_params, sig_match_thresh, obs_filter,
-        const_scale, seg_params, q_score_thresh, skip_seq_scaling,
-        max_scaling_iters):
-    """
-    Perform genomic alignment and re-squiggle algorithm
+        seq_samp_type, outlier_thresh, overwrite, num_ps, threads_per_proc,
+        compute_sd, skip_index, rsqgl_params, save_params, sig_match_thresh,
+        obs_filter, const_scale, q_score_thresh, skip_seq_scaling,
+        max_scaling_iters, failed_reads_fn, fast5s_basedir, num_update_errors):
    """Perform genomic alignment and re-squiggle algorithm
     """
-    fast5_q = mp.Queue(maxsize=th.MAX_QUEUE_SIZE)
+    fast5_q = mp.Queue(maxsize=_MAX_QUEUE_SIZE)
     failed_reads_q = mp.Queue()
-    index_q = mp.Queue(maxsize=th.MAX_QUEUE_SIZE) if not skip_index else None
+    index_q = mp.Queue(maxsize=_MAX_QUEUE_SIZE) if not skip_index else None
     progress_q = mp.Queue()
 
     # open all multiprocessing pipes and queues before threading
@@ -1330,30 +1778,27 @@ def resquiggle_all_reads(
         proc_rsqgl_conns.append(rsqgl_conn)
     # open re-squiggle process to avoid intensive processing hitting the GIL
     rsqgl_args = (
-        proc_rsqgl_conns, std_ref, outlier_thresh, corr_grp, bio_samp_type,
-        seg_params, sig_aln_params, index_q is None, const_scale,
-        skip_seq_scaling, max_scaling_iters)
+        proc_rsqgl_conns, std_ref, outlier_thresh, corr_grp, seq_samp_type,
+        rsqgl_params, save_params, const_scale, skip_seq_scaling,
+        max_scaling_iters)
     rsqgl_process = mp.Process(target=_resquiggle_worker, 
args=rsqgl_args) rsqgl_process.daemon = True rsqgl_process.start() rsqgl_ps.append(rsqgl_process) - # start queue getter processes - main_prog_conn, prog_conn = mp.Pipe() - prog_p = mp.Process(target=_get_progress_queue, - args=(progress_q, prog_conn, len(fast5_fns))) - prog_p.daemon = True - prog_p.start() - # failed read queue getter - main_failed_read_conn, failed_read_conn = mp.Pipe() - failed_reads_p = mp.Process(target=_get_failed_read_queue, - args=(failed_reads_q, failed_read_conn)) - failed_reads_p.daemon = True - failed_reads_p.start() + # failed read and progress queues getter + main_pf_conn, pf_conn = mp.Pipe() + pf_p = mp.Process(target=_get_progress_fail_queues, + args=(progress_q, failed_reads_q, pf_conn, len(fast5_fns), + failed_reads_fn, num_update_errors)) + pf_p.daemon = True + pf_p.start() + # index queue getter if index_q is not None: main_index_conn, index_conn = mp.Pipe() - index_p = mp.Process(target=_get_index_queue, args=(index_q, index_conn)) + index_p = mp.Process(target=_get_index_queue, args=( + index_q, index_conn, fast5s_basedir, corr_grp)) index_p.daemon = True index_p.start() @@ -1362,8 +1807,8 @@ def resquiggle_all_reads( for map_conn in map_conns: map_args = (fast5_q, progress_q, failed_reads_q, index_q, bc_grp, bc_subgrps, corr_grp, aligner, outlier_thresh, compute_sd, - sig_aln_params, sig_match_thresh, obs_filter, bio_samp_type, - overwrite, map_conn, q_score_thresh) + sig_match_thresh, obs_filter, seq_samp_type, + overwrite, map_conn, q_score_thresh, std_ref) t = threading.Thread(target=_io_and_mappy_thread_worker, args=map_args) t.daemon = True @@ -1377,21 +1822,20 @@ def resquiggle_all_reads( for t in resquiggle_ts: t.join() - # in a very unlikely case the progress queue could die while the + # in a very unlikely case the progress/fail queue could die while the # main process remains active and thus we would have a deadlock here - if prog_p.is_alive(): + if pf_p.is_alive(): # send signal to getter queue to finish and return results - main_prog_conn.send(True) + main_pf_conn.send(True) # returns total number of processed reads if that is needed - main_prog_conn.recv() - main_failed_read_conn.send(True) - failed_reads = main_failed_read_conn.recv() - all_index_data = None + main_pf_conn.recv() + pf_p.join() + if index_q is not None: main_index_conn.send(True) - all_index_data = main_index_conn.recv() + index_p.join() - return failed_reads, all_index_data + return ################################### @@ -1399,48 +1843,54 @@ def resquiggle_all_reads( ################################### def _parse_files_and_lock_dirs(args): - if VERBOSE: th._status_message('Getting file list.') + if VERBOSE: th.status_message('Getting file list.') try: - if not os.path.isdir(args.fast5_basedir): - th._error_message_and_exit( + if not os.path.isdir(args.fast5s_basedir): + th.error_message_and_exit( 'Provided [fast5-basedir] is not a directory.') - fast5_basedir = ( - args.fast5_basedir if args.fast5_basedir.endswith('/') else - args.fast5_basedir + '/') - if args.skip_index: - index_fn = None - else: - index_fn = th.get_index_fn(fast5_basedir, args.corrected_group) - if os.path.exists(index_fn): os.remove(index_fn) + fast5s_basedir = ( + args.fast5s_basedir if args.fast5s_basedir.endswith('/') else + args.fast5s_basedir + '/') files, lock_fns = th.get_files_list_and_lock_dirs( - fast5_basedir, args.ignore_read_locks) + fast5s_basedir, args.ignore_read_locks) except OSError: - th._error_message_and_exit( - 'Reads base directory, a sub-directory ' + - 'or an old (hidden) 
index file does not appear to be ' +
+        th.error_message_and_exit(
+            'Reads base directory or a sub-directory does not appear to be ' +
             'accessible. Check directory permissions.')
 
     if len(files) < 1:
         th.clear_tombo_locks(lock_fns)
-        th._error_message_and_exit(
+        th.error_message_and_exit(
             'No files identified in the specified ' +
             'directory or within immediate subdirectories.')
 
     if not th.reads_contain_basecalls(
             files, args.basecall_group, num_reads=1000):
         th.clear_tombo_locks(lock_fns)
-        th._error_message_and_exit(
+        th.error_message_and_exit(
            'Reads do not appear to contain basecalls. Check --basecall-group ' +
             'option if basecalls are stored in non-standard location or ' +
             'use `tombo annotate_raw_with_fastqs` to add basecalls from ' +
             'FASTQ files to raw FAST5 files.')
 
-    return files, fast5_basedir, index_fn, lock_fns
+    return files, fast5s_basedir, lock_fns
 
 
 def _resquiggle_main(args):
+    """Main method for resquiggle
     """
-    Main method for resquiggle
-    """
+    if args.processes > 1 and _DEBUG_PLOTTING:
+        th.error_message_and_exit(
+            'Cannot run debug plotting with multiple processes.')
+    if _DEBUG_PLOTTING:
+        th.warning_message(
+            'Producing debug plotting output. Can be very slow and should ' +
+            'only be run on a small number of files.')
+    if _DRY_RUN:
+        th.warning_message(
+            'Producing debug output. Not saving re-squiggle results.')
+    if _DEBUG_BANDWIDTH or _DEBUG_START_BANDWIDTH:
+        sys.stdout.write(
+            'bandwidth\tmin_bw_edge_buffer\tmean_dp_score\tread_id\n')
+
     global VERBOSE
     VERBOSE = not args.quiet
     th.VERBOSE = VERBOSE
@@ -1452,42 +1902,47 @@ def _resquiggle_main(args):
         sys.exit()
 
     if args.basecall_group == args.corrected_group:
-        th._error_message_and_exit(
+        th.error_message_and_exit(
             '--basecall-group and --corrected-group must ' +
             'be different.')
 
     # check simple arguments for validity first
-    outlier_thresh = args.outlier_threshold if (
-        args.outlier_threshold > 0) else None
+    outlier_thresh = args.outlier_threshold
+    if outlier_thresh is not None and outlier_thresh <= 0:
+        outlier_thresh = None
     obs_filter = th.parse_obs_filter(args.obs_per_base_filter) \
                  if 'obs_per_base_filter' in args else None
 
-    if VERBOSE: th._status_message('Loading minimap2 reference.')
+    if VERBOSE: th.status_message('Loading minimap2 reference.')
     # to be enabled when mappy genome sequence extraction bug is fixed
-    aligner = mappy.Aligner(str(args.reference), preset=str('map-ont'))
+    aligner = mappy.Aligner(str(args.reference), preset=str('map-ont'), best_n=1)
     if not aligner:
-        th._error_message_and_exit(
+        th.error_message_and_exit(
             'Failed to load reference genome FASTA for mapping.')
 
     # get files as late as possible in startup since it takes the longest
    # and so other errors can't happen after locks are written
-    files, fast5_basedir, index_fn, lock_fns = _parse_files_and_lock_dirs(args)
+    files, fast5s_basedir, lock_fns = _parse_files_and_lock_dirs(args)
 
     try:
-        tb_model_fn = args.tombo_model_filename
-        bio_samp_type = args.bio_sample_type
-        if tb_model_fn is None:
-            tb_model_fn, bio_samp_type = ts.get_default_standard_ref_from_files(
-                files, bio_samp_type)
-        else:
-            bio_samp_type = 'RNA' if th.is_rna_from_files(files) else 'DNA'
+        seq_samp_type = None
+        if args.seq_sample_type is not None:
+            seq_samp_type = th.seqSampleType(RNA_SAMP_TYPE, True) \
+                if args.seq_sample_type == RNA_SAMP_TYPE else \
+                th.seqSampleType(DNA_SAMP_TYPE, False)
+        if args.tombo_model_filename is not None and seq_samp_type is None:
+            seq_samp_type = th.get_seq_sample_type(fast5_fns=files)
+        # parse tombo model
+        std_ref = ts.TomboModel(
            
ref_fn=args.tombo_model_filename, seq_samp_type=seq_samp_type, + fast5_fns=files) + seq_samp_type = std_ref.seq_samp_type + if seq_samp_type.name == DNA_SAMP_TYPE and USE_START_CLIP_BASES: + std_ref = std_ref.reverse_sequence_copy() + sig_match_thresh = args.signal_matching_score if sig_match_thresh is None: - sig_match_thresh = SIG_MATCH_THRESH[bio_samp_type] - if not os.path.exists(tb_model_fn): - th._error_message_and_exit('Invalid tombo model file provided.') - # parse tombo model - std_ref = ts.TomboModel(tb_model_fn) + sig_match_thresh = SIG_MATCH_THRESH[seq_samp_type.name] const_scale = None if args.fixed_scale is not None: @@ -1495,45 +1950,28 @@ def _resquiggle_main(args): elif args.fit_global_scale: const_scale = ts.estimate_global_scale(files) - failed_reads, all_index_data = resquiggle_all_reads( + rsqgl_params = ts.load_resquiggle_parameters( + seq_samp_type, args.signal_align_parameters, + args.segmentation_parameters) + save_params = ts.load_resquiggle_parameters( + seq_samp_type, args.signal_align_parameters, + args.segmentation_parameters, use_save_bandwidth=True) + + resquiggle_all_reads( files, aligner, args.basecall_group, args.basecall_subgroups, - args.corrected_group, std_ref, bio_samp_type, outlier_thresh, + args.corrected_group, std_ref, seq_samp_type, outlier_thresh, args.overwrite, args.processes, args.threads_per_process, args.include_event_stdev, args.skip_index, - args.signal_align_parameters, sig_match_thresh, - obs_filter, const_scale, args.segmentation_parameters, args.q_score, - args.skip_sequence_rescaling, args.max_scaling_iterations) + rsqgl_params, save_params, sig_match_thresh, + obs_filter, const_scale, args.q_score, + args.skip_sequence_rescaling, args.max_scaling_iterations, + args.failed_reads_filename, fast5s_basedir, + args.num_most_common_errors) finally: th.clear_tombo_locks(lock_fns) - if not args.skip_index: - th.write_index_file(all_index_data, index_fn, fast5_basedir) - fail_summary = [(err, len(fns)) for err, fns in failed_reads.items()] - if len(fail_summary) > 0: - total_num_failed = sum(map(itemgetter(1), fail_summary)) - th._status_message( - 'Failed reads summary (' + unicode(total_num_failed) + - ' total failed):\n' + '\n'.join( - "\t" + err + " :\t" + unicode(n_fns) - for err, n_fns in sorted(fail_summary))) - else: - if len(files) == len(all_index_data): - th._status_message('All reads successfully re-squiggled!') - else: - th._status_message('Tombo appears to have failed unexpectedly.') - if args.failed_reads_filename is not None: - with io.open(args.failed_reads_filename, 'wt') as fp: - fp.write('\n'.join(( - err + '\t' + ', '.join(fns) - for err, fns in failed_reads.items())) + '\n') - - return - -def _args_and_main(): - import _option_parsers - resquiggle_main( - _option_parsers.get_resquiggle_parser().parse_args()) return if __name__ == '__main__': - _args_and_main() + sys.stderr.write('This is a module. 
See commands with `tombo -h`') + sys.exit(1) diff --git a/tombo/tests/shell_tests.sh b/tombo/tests/shell_tests.sh index 6f71d60..aa19e5e 100755 --- a/tombo/tests/shell_tests.sh +++ b/tombo/tests/shell_tests.sh @@ -11,8 +11,8 @@ poreModel="r9_250bps.nucleotide.5mer.template.model" genomeFn="e_coli.K12.NEB5alpha.fasta" mmiFn="e_coli.K12.NEB5alpha.mmi" -genomeLocs='"CP017100.1:1505285" "CP017100.1:1504705"' -strandGenomeLocs='"CP017100.1:1505285:+" "CP017100.1:1504705:+"' +genomeLocs='"CP017100.1:1505285" "CP017100.1:2873680"' +strandGenomeLocs='"CP017100.1:1505285:+" "CP017100.1:2873680:-"' runHelps=false runResquiggle=true @@ -66,11 +66,11 @@ printf "\n\n********* Testing re-squiggle command **********\n" tombo resquiggle \ $natDir $genomeFn \ --failed-reads-filename testing.native.failed_read.txt \ - --processes 4 --overwrite --include-event-stdev + --processes 8 --overwrite --num-most-common-errors 5 tombo resquiggle \ $ampDir $genomeFn \ --failed-reads-filename testing.amplified.failed_read.txt \ - --processes 4 --overwrite --include-event-stdev + --processes 8 --overwrite --num-most-common-errors 5 printf "\n\n********* Testing FASTQ annotation and re-squiggle **********\n" tombo preprocess annotate_raw_with_fastqs --fast5-basedir $natFqDir \ @@ -78,66 +78,61 @@ tombo preprocess annotate_raw_with_fastqs --fast5-basedir $natFqDir \ tombo resquiggle \ $natFqDir $genomeFn \ --corrected-group FastqAnnotation \ - --failed-reads-filename testing.native.fastq_ann.failed_read.txt \ - --processes 4 --overwrite + --processes 8 --overwrite + +printf "\n\n********* Testing minimap2 index **********\n" +tombo resquiggle \ + $natDir $mmiFn \ + --corrected-group RawMinimapIndexCorrected \ + --processes 8 --overwrite printf "\n\n********* Testing re-squiggle command with filename **********\n" tombo resquiggle \ $natDir $genomeFn --tombo-model-filename $nrModFn \ --corrected-group RawWFilenameCorrected \ - --processes 4 --overwrite \ - --failed-reads-filename testing.native.fn_model.failed_read.txt + --processes 8 --overwrite tombo resquiggle \ $ampDir $genomeFn --tombo-model-filename $nrModFn \ --corrected-group RawWFilenameCorrected \ - --processes 4 --overwrite \ - --failed-reads-filename testing.amplified.fn_model.failed_read.txt + --processes 8 --overwrite printf "\n\n********* Testing event-based resquiggle **********\n" tombo build_model event_resquiggle \ $natDir $genomeFn --minimap2-executable ./minimap2 \ - --corrected-group RawEventCorrected --processes 4 --overwrite \ - --failed-reads-filename testing.native.failed_read.event.txt - -printf "\n\n********* Testing minimap2 index **********\n" -tombo resquiggle \ - $natDir $mmiFn \ - --corrected-group RawMinimapIndexCorrected \ - --processes 4 --overwrite \ - --failed-reads-filename testing.native.failed_read.txt + --corrected-group RawEventCorrected --processes 8 --overwrite printf "\n\n********* Testing pA normalization **********\n" -tombo build_model event_resquiggle --minimap2-executable ./minimap2 \ +tombo build_model event_resquiggle \ $natDir $genomeFn \ - --normalization-type pA_raw --processes 4 \ - --corrected-group RawGenomeCorrected_pA_raw_000 --overwrite \ - --failed-reads-filename testing.native.pA_raw.failed_read.txt + --minimap2-executable ./minimap2 \ + --normalization-type pA_raw --processes 8 \ + --corrected-group RawGenomeCorrected_pA_raw_000 --overwrite tombo build_model event_resquiggle \ - $natDir $genomeFn --minimap2-executable ./minimap2 \ - --normalization-type pA --pore-model-filename $poreModel \ - 
--corrected-group RawGenomeCorrected_pA_000 --overwrite \ - --failed-reads-filename testing.native.pA.failed_read.txt \ - --processes 4 + $natDir $genomeFn --minimap2-executable ./minimap2 \ + --normalization-type pA --pore-model-filename $poreModel \ + --corrected-group RawGenomeCorrected_pA_000 --overwrite \ + --processes 8 printf "\n\n********* Testing recursive resquiggle **********\n" tombo resquiggle \ - $rcsvDir $genomeFn \ - --failed-reads-filename testing.recursive.failed_read.txt \ - --processes 4 --overwrite + $rcsvDir $genomeFn \ + --processes 8 --overwrite fi printf "\n\n********* Testing filter functions **********\n" tombo filter clear_filters --fast5-basedirs $natDir tombo filter stuck --fast5-basedirs $natDir \ - --obs-per-base-filter 99:200 100:5000 + --obs-per-base-filter 99:200 100:1000 tombo filter level_coverage --fast5-basedirs $natDir \ --percent-to-filter 10 -tombo filter q_score --fast5-basedirs $natDir --q-score 21 +tombo filter q_score --fast5-basedirs $natDir --q-score 10 tombo filter raw_signal_matching --fast5-basedirs $natDir \ --signal-matching-score 0.75 tombo filter clear_filters --fast5-basedirs $natDir tombo filter genome_locations --fast5-basedirs $natDir \ - --include-regions CP017100.1:1,458,474-1,558,736 + --include-regions CP017100.1:1,485,920-1,558,736 --include-partial-overlap +tombo filter genome_locations --fast5-basedirs $natDir \ + --include-regions CP017100.1:1,485,920-1,558,736 tombo filter clear_filters --fast5-basedirs $natDir @@ -164,6 +159,10 @@ tombo plot motif_centered --fast5-basedirs $natDir --motif ATC \ --genome-fasta $genomeFn \ --num-bases 21 --overplot-threshold 1000 --deepest-coverage \ --pdf-filename testing.motif_centered.deepest.1_samp.pdf +tombo plot motif_centered --fast5-basedirs $natDir --motif CCWGG \ + --plot-alternate-model 5mC --genome-fasta $genomeFn \ + --num-bases 21 --overplot-threshold 1000 \ + --pdf-filename testing.motif_centered.w_model.pdf tombo plot max_coverage --fast5-basedirs $rcsvDir \ --num-bases 21 --overplot-threshold 1000 \ --pdf-filename testing.max_cov.1_samp.recursive.pdf @@ -198,16 +197,14 @@ tombo plot motif_centered --fast5-basedirs $natDir --motif ATC \ printf "\n\n********* Testing statistical testing. 
**********\n" rm test_stats.de_novo.tombo.stats test_stats.2samp.tombo.stats \ test_stats.alt_model.5mC.tombo.stats \ - test_stats.alt_model.6mA.tombo.stats \ test_stats.alt_default_model.5mC.tombo.stats \ test_stats.alt_default_model.6mA.tombo.stats \ test_stats.de_novo.tombo.per_read_stats test_stats.2samp.tombo.per_read_stats \ test_stats.alt_model.5mC.tombo.per_read_stats \ - test_stats.alt_model.6mA.tombo.per_read_stats \ test_stats.alt_default_model.5mC.tombo.per_read_stats \ test_stats.alt_default_model.6mA.tombo.per_read_stats \ test_standard.model test_stats.de_novo.new_thresh.tombo.stats \ - test_alt_est.alt_C.tombo_model + test_alt.model test_alt.use_densities.model tombo detect_modifications de_novo --fast5-basedirs $natDir \ --minimum-test-reads 5 \ --statistics-file-basename test_stats.de_novo \ @@ -218,9 +215,14 @@ tombo detect_modifications de_novo --fast5-basedirs $natDir \ --per-read-statistics-basename test_stats.de_novo.two_way_thresh tombo detect_modifications sample_compare --fast5-basedirs $natDir \ --control-fast5-basedirs $ampDir \ - --minimum-test-reads 5 \ + --minimum-test-reads 5 --sample-only-estimates \ --statistics-file-basename test_stats.2samp \ --per-read-statistics-basename test_stats.2samp +tombo detect_modifications sample_compare --fast5-basedirs $natDir \ + --control-fast5-basedirs $ampDir \ + --minimum-test-reads 5 \ + --statistics-file-basename test_stats.2samp_w_post \ + --per-read-statistics-basename test_stats.2samp_w_post tombo detect_modifications alternative_model --fast5-basedirs $natDir \ --alternate-bases 5mC 6mA \ --statistics-file-basename test_stats.alt_default_model \ @@ -250,27 +252,31 @@ tombo build_model estimate_alt_reference \ printf "\n\n********* Testing aggregate per-read stats **********\n" tombo detect_modifications aggregate_per_read_stats --minimum-test-reads 5 \ --single-read-threshold 0.4 \ - --statistics-file-basename test_stats.de_novo.new_thresh \ + --statistics-filename test_stats.de_novo.new_thresh.tombo.stats \ --per-read-statistics-filename test_stats.de_novo.tombo.per_read_stats printf "\n\n********* Testing ROC and Precision-Recall plotting **********\n" tombo plot roc --genome-fasta e_coli.K12.NEB5alpha.fasta \ --statistics-filenames test_stats.2samp.tombo.stats \ + test_stats.2samp_w_post.tombo.stats \ test_stats.alt_default_model.5mC.tombo.stats \ test_stats.alt_default_model.6mA.tombo.stats \ test_stats.de_novo.tombo.stats test_stats.de_novo.new_thresh.tombo.stats \ --motif-descriptions \ CCWGG:2:"dcm 5mC Samp Comp"::GATC:2:"dam 6mA Samp Comp" \ + CCWGG:2:"dcm 5mC Samp Comp w/ post"::GATC:2:"dam 6mA Samp Comp w/ post" \ CCWGG:2:"dcm 5mC Alt Test" GATC:2:"dam 6mA Alt Test" \ CCWGG:2:"dcm 5mC De Novo"::GATC:2:"dam 6mA De Novo" \ CCWGG:2:"dcm 5mC De Novo New Thresh"::GATC:2:"dam 6mA De Novo New Thresh" tombo plot per_read_roc --genome-fasta e_coli.K12.NEB5alpha.fasta \ --per-read-statistics-filenames test_stats.2samp.tombo.per_read_stats \ + test_stats.2samp_w_post.tombo.per_read_stats \ test_stats.alt_default_model.5mC.tombo.per_read_stats \ test_stats.alt_default_model.6mA.tombo.per_read_stats \ test_stats.de_novo.tombo.per_read_stats --motif-descriptions \ CCWGG:2:"dcm 5mC Samp Comp"::GATC:2:"dam 6mA Samp Comp" \ + CCWGG:2:"dcm 5mC Samp Comp w/ post"::GATC:2:"dam 6mA Samp Comp w/ post" \ CCWGG:2:"dcm 5mC Alt Test" GATC:2:"dam 6mA Alt Test" \ CCWGG:2:"dcm 5mC De Novo"::GATC:2:"dam 6mA De Novo" @@ -376,7 +382,7 @@ tombo plot most_significant --fast5-basedirs $natDir \ --statistics-filename 
test_stats.de_novo.tombo.stats \ --overplot-threshold 1 --overplot-type Density \ --pdf-filename testing.model_plotting.density.pdf -tombo plot genome_locations --fast5-basedirs $ampDir \ +tombo plot genome_locations --fast5-basedirs $natDir \ --tombo-model-filename $nrModFn \ --alternate-model-filename $altModFn \ --genome-locations $genomeLocs \ @@ -401,7 +407,7 @@ tombo plot max_coverage --fast5-basedirs $natDir \ printf "\n\n********* Testing per-read testing plot **********\n" tombo plot per_read --genome-locations $genomeLocs --num-bases 101 \ --per-read-statistics-filename test_stats.2samp.tombo.per_read_stats \ - --genome-fasta $genomeFn --pdf-filename testing.per_read.pdf + --genome-fasta $genomeFn --pdf-filename testing.per_read.samp_comp.pdf tombo plot per_read --genome-locations $genomeLocs --num-bases 101 \ --per-read-statistics-filename test_stats.de_novo.tombo.per_read_stats \ --genome-fasta $genomeFn --pdf-filename testing.de_novo.per_read.pdf @@ -413,7 +419,7 @@ tombo plot per_read --genome-locations $genomeLocs --num-bases 101 \ --per-read-statistics-filename test_stats.alt_model.5mC.tombo.per_read_stats \ --pdf-filename testing.per_read.wo_seq.pdf -printf "\n\n********* Testing auxilliary commands **********\n" +printf "\n\n********* Testing text output commands **********\n" tombo text_output signif_sequence_context --fast5-basedirs $natDir $ampDir \ --statistics-filename test_stats.de_novo.tombo.stats \ --sequences-filename testing_signif_regions.from_fast5s.fasta diff --git a/tombo/tombo_helper.py b/tombo/tombo_helper.py index 354126a..802c9cf 100644 --- a/tombo/tombo_helper.py +++ b/tombo/tombo_helper.py @@ -15,9 +15,9 @@ import h5py import numpy as np +np.seterr(all='raise') from tqdm import tqdm -from glob import glob from time import sleep from time import strftime from operator import itemgetter @@ -30,124 +30,317 @@ # import tombo functions from ._version import TOMBO_VERSION -from .c_helper import c_new_mean_stds, c_new_means -from ._default_parameters import PHRED_BASE +from ._default_parameters import PHRED_BASE, DNA_SAMP_TYPE, RNA_SAMP_TYPE +from ._c_helper import ( + c_new_mean_stds, c_new_means, c_valid_cpts_w_cap, + c_valid_cpts_w_cap_t_test) +from ._c_dynamic_programming import ( + c_banded_traceback, c_adaptive_banded_forward_pass) -VERBOSE = False -MAX_QUEUE_SIZE = 1000 +# list of classes/functions to include in API +__all__ = [ + 'readData', 'intervalData', 'TomboReads', + 'TomboMotif', 'parse_motif_descs', + 'resquiggleParams', 'startClipParams', + 'resquiggleResults', 'alignInfo', 'genomeLocation', + 'sequenceData', 'dpResults', 'scaleValues', 'Fasta', 'seqSampleType', + 'get_seq_sample_type', 'get_raw_read_slot', + 'get_single_slot_read_centric', 'get_multiple_slots_read_centric'] -_ITER_QUEUE_LIMIT = 1000 -_PROC_UPDATE_INTERVAL = 100 -_MAX_FASTQ_QUEUE_SIZE = 10000 -_SEQ_SUMMARY_FN_FIELD = 'filename' -_SEQ_SUMMARY_ID_FIELD = 'read_id' +VERBOSE = True -# warning messages for annotate with fastqs over multiple processes, -# requiring passing warning codes to only print warning once. 
-_WARN_ID_VAL = 'ids' -_WARN_IO_VAL = 'io' -_WARN_MISMATCH_VAL = 'mismatch' -_WARN_OVRWRT_VAL = 'overwrite' -_WARN_UNIQ_VAL = 'uniq' -_WARN_CODES = (_WARN_ID_VAL, _WARN_IO_VAL, _WARN_MISMATCH_VAL, _WARN_OVRWRT_VAL) -_WARN_CODES_PREP = (_WARN_OVRWRT_VAL, _WARN_UNIQ_VAL) - - -################################ -###### Global Namedtuples ###### -################################ - -alignInfo = namedtuple( - 'alignInfo', - ('ID', 'Subgroup', 'ClipStart', 'ClipEnd', - 'Insertions', 'Deletions', 'Matches', 'Mismatches')) - -readData = namedtuple('readData', ( - 'start', 'end', 'filtered', 'read_start_rel_to_raw', 'strand', 'fn', - 'corr_group', 'rna', 'sig_match_score', 'mean_q_score')) -# set default values for sig_match_score and q_score -readData.__new__.__defaults__ = (None, None) - -intervalData = namedtuple('intervalData', ( - 'reg_id', 'chrm', 'start', 'end', 'strand', 'reg_text', 'reads', 'seq')) -""" itervalData - A Tombo namedtuple containing information about a genomic intervar - -.. py:attribute:: reg_id - - Region ID - string type - -.. py:attribute:: chrm - - Chromosome name - string type - -.. py:attribute:: start - - 0-based start position - integer type - -.. py:attribute:: end +# single base conversion for motifs +SINGLE_LETTER_CODE = { + 'A':'A', 'C':'C', 'G':'G', 'T':'T', 'B':'[CGT]', + 'D':'[AGT]', 'H':'[ACT]', 'K':'[GT]', 'M':'[AC]', + 'N':'[ACGT]', 'R':'[AG]', 'S':'[CG]', 'V':'[ACG]', + 'W':'[AT]', 'Y':'[CT]'} +INVALID_BASES = re.compile('[^ACGT]') +INVALID_BASE_RUNS = re.compile('[^ACGT]+') - 1-based (or open interval) end position - integer type -.. py:attribute:: strand +############################### +###### Custom TomboError ###### +############################### - Interval strand ('+', '-' or None). Default: None - string type +class TomboError(Exception): + pass -.. py:attribute:: reg_test - Some text describing a region. Used for plot titles. Default: '' - string type +###################################### +###### Cython Wrapper Functions ###### +###################################### -.. py:attribute:: reads +# can't import TomboError in cython so wrap these functions in python +def valid_cpts_w_cap(*args, **kwargs): + try: + valid_cpts = c_valid_cpts_w_cap(*args, **kwargs) + except NotImplementedError as e: + raise TomboError(unicode(e)) + valid_cpts.sort() + return valid_cpts - A list of readData values. Default: None - list type +def valid_cpts_w_cap_t_test(*args, **kwargs): + try: + valid_cpts = c_valid_cpts_w_cap_t_test(*args, **kwargs) + except NotImplementedError as e: + raise TomboError(unicode(e)) + valid_cpts.sort() + return valid_cpts -.. py:attribute:: seq +def banded_traceback(*args, **kwargs): + try: + return c_banded_traceback(*args, **kwargs) + except NotImplementedError as e: + raise TomboError(unicode(e)) - The genomic sequence for a region. 
Default: None - string type

-"""
-# set default values for strand, text, reads and seq
-intervalData.__new__.__defaults__ = (None, '', None, None)

-channelInfo = namedtuple(
-    'channelInfo',
-    ('offset', 'range', 'digitisation', 'number', 'sampling_rate'))

-scaleValues = namedtuple(
-    'scaleValues',
-    ('shift', 'scale', 'lower_lim', 'upper_lim'))

+class alignInfo(namedtuple(
+    'alignInfo',
+    ('ID', 'Subgroup', 'ClipStart', 'ClipEnd',
+     'Insertions', 'Deletions', 'Matches', 'Mismatches'))):
+    """Information from genomic read alignment
+
+    Args:
+        ID (str): read identifier
+        Subgroup (str): sub-group location containing read information (e.g. 'BaseCalled_template')
+        ClipStart (int): number of bases clipped from beginning of basecalls
+        ClipEnd (int): number of bases clipped from end of basecalls
+        Insertions (int): number of inserted bases in alignment
+        Deletions (int): number of deleted bases in alignment
+        Matches (int): number of matched bases in alignment
+        Mismatches (int): number of mis-matched bases in alignment
+    """
+
+# TODO convert rna to rev_sig
+class readData(namedtuple('readData', (
+    'start', 'end', 'filtered', 'read_start_rel_to_raw', 'strand', 'fn',
+    'corr_group', 'rna', 'sig_match_score', 'mean_q_score', 'read_id'))):
+    """Nanopore read meta-data
+
+    Example::
+
+        r_data = tombo_helper.readData(
+            start=0, end=1000, filtered=False, read_start_rel_to_raw=100,
+            strand='+', fn='path/to/read.fast5',
+            corr_group='RawGenomeCorrected_000', rna=False,
+            sig_match_score=1.0, mean_q_score=10.0)
+
+        # output is a list of readData objects
+        reads_index = tombo_helper.TomboReads(['test_data/native_reads',])
+        cs_reads = reads_index.get_cs_reads('chr20', '+')
+
+    Args:
+        start (int): 0-based start mapped position
+        end (int): 0-based open interval end mapped position
+        filtered (bool): is this read filtered?
+        read_start_rel_to_raw (int): start (in raw signal vector) of assigned bases
+        strand (str): read mapped strand ('+', '-' or None)
+        fn (str): raw read filename
+        corr_group (str): --corrected-group slot specified in re-squiggle command
+        rna (bool): is this read RNA?
+        sig_match_score (float): signal matching score. Default: None
+        mean_q_score (float): mean basecalling q-score. Default: None
+        read_id (str): read identifier. Default: None
+    """
+# set default values for sig_match_score, q_score and read_id
+readData.__new__.__defaults__ = (None, None, None)
+
+class scaleValues(namedtuple(
+    'scaleValues', (
+        'shift', 'scale', 'lower_lim', 'upper_lim', 'outlier_thresh'))):
+    """Signal normalization scaling parameters. 
For details see https://nanoporetech.github.io/tombo/resquiggle.html#signal-normalization
+
+    Args:
+        shift (float): shift scaling parameter
+        scale (float): scale scaling parameter
+        lower_lim (float): lower winsorizing threshold
+        upper_lim (float): upper winsorizing threshold
+        outlier_thresh (float): outlier threshold used to define `lower_lim` and `upper_lim`
+    """
+
+class resquiggleParams(namedtuple(
+    'resquiggleParams',
+    ('match_evalue', 'skip_pen', 'bandwidth', 'max_half_z_score',
+     'running_stat_width', 'min_obs_per_base', 'mean_obs_per_event',
+     'z_shift', 'stay_pen', 'use_t_test_seg', 'band_bound_thresh',
+     'start_bw', 'start_save_bw', 'start_n_bases'))):
+    """Re-squiggle parameters
+
+    Args:
+        match_evalue (float): expected value for matching event to sequence
+        skip_pen (float): penalty for skipped sequence position
+        bandwidth (int): adaptive bandwidth
+        max_half_z_score (float): winsorize half z-scores above this value
+        running_stat_width (int): running neighboring window width for segmentation scoring
+        min_obs_per_base (int): minimum observations per genomic base
+        mean_obs_per_event (int): mean number of raw obs. per event during segmentation
+        z_shift (float): amount to shift z-scores for DP (derived from match_evalue)
+        stay_pen (float): stay penalty for DP (derived from match_evalue)
+        use_t_test_seg (bool): use t-test segmentation criterion (default: raw neighboring window difference)
+        band_bound_thresh (int): bandwidth boundary threshold for determining if a read potentially left the adaptive bandwidth
+        start_bw (int): bandwidth for read start identification
+        start_save_bw (int): save bandwidth for read start identification (used if start_bw fails)
+        start_n_bases (int): number of genomic bases to use for read start identification
+    """
+# set default values for start params
+resquiggleParams.__new__.__defaults__ = (None, None, None)
+
+class trimRnaParams(namedtuple(
+    'trimRnaParams',
+    ('moving_window_size', 'min_running_values', 'thresh_scale',
+     'max_raw_obs'))):
+    """Parameters to trim RNA adapters.
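+
+    Example (field values below are illustrative placeholders, not tuned
+    defaults)::
+
+        trim_params = tombo_helper.trimRnaParams(
+            moving_window_size=50, min_running_values=100,
+            thresh_scale=0.7, max_raw_obs=40000)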
+ """ + +class stallParams(namedtuple( + 'stallParams', + ('window_size', 'threshold', 'min_consecutive_obs', 'edge_buffer', + # percentile stall method params + 'lower_pctl', 'upper_pctl', + # mean windows stall method params + 'mini_window_size', 'n_windows'))): + """Parameters to identify RNA stalls + """ +# set method specific params to None +stallParams.__new__.__defaults__ = (None,) * 4 + +class startClipParams(namedtuple( + 'startClipParams', + ('bandwidth', 'num_genome_bases'))): + """Parameters to identify read start using bases clipped from mapping + + Args: + bandwidth (int): bandwidth + num_genome_bases (int): number of genome bases + """ + +class resquiggleResults(namedtuple( + 'resquiggleResults', + ('align_info', 'genome_loc', 'genome_seq', 'mean_q_score', + 'raw_signal', 'channel_info', 'read_start_rel_to_raw', 'segs', + 'scale_values', 'sig_match_score', 'norm_params_changed', + 'start_clip_bases', 'stall_ints'))): + """Re-squiggle results + + Args: + align_info (:class:`tombo.tombo_helper.alignInfo`): read alignment information + genome_loc (:class:`tombo.tombo_helper.genomeLocation`): genome mapping location + genome_seq (str): mapped genome sequence + mean_q_score (float): mean basecalling q-score + raw_signal (np.array::np.float64): raw signal (optional) + channel_info (:class:`tombo.tombo_helper.channelInfo`): channel information (optional) + read_start_rel_to_raw (int): read start within raw signal (optional) + segs (np.array::np.int64): relative raw signal segment positions (optional) + scale_values (:class:`tombo.tombo_helper.scaleValues`): signal normalization scale values (optional) + sig_match_score (float): expected to observed signal score (see :class:`tombo.tombo_stats.get_read_seg_score`; optional) + norm_params_changed (bool): were scale parameters updated (optional) + start_clip_bases (str): mapping clipped bases from start of read (optional) + stall_ints (list): list of rna stall locations within raw signal (optional) + """ +# set default None values for when just mapping results are included +resquiggleResults.__new__.__defaults__ = (None,) * 9 + +class dpResults(namedtuple( + 'dpResults', ('read_start_rel_to_raw', 'segs', 'ref_means', 'ref_sds', + 'genome_seq'))): + """Dynamic programming results + + Args: + read_start_rel_to_raw (int): read start within raw signal + segs (np.array::np.int64): relative raw signal segment positions + ref_means (np.array::np.float64): expected signal levels + ref_sds (np.array::np.float64): expected SD of signal levels + genome_seq (str): mapped genome sequence + """ + +class genomeLocation(namedtuple('genomeLocation', ('Start', 'Strand', 'Chrom'))): + """Genomic location + + Args: + Start (int): 0-based genomic start position + Strand (str): Strand (should be '+' or '-') + Chrom (str): Chromosome + """ + +class sequenceData(namedtuple('sequenceData', ('seq', 'id', 'mean_q_score'))): + """Read sequence data from FASTQ record + + Args: + seq (str): read sequence + id (str): read id (extracted from FASTQ record line) + mean_q_score (float): mean q-score + """ + +class channelInfo(namedtuple( + 'channelInfo', ('offset', 'range', 'digitisation', + 'number', 'sampling_rate'))): + """Read channel information + + Args: + offset (float): offset parameter + range (float): range parameter + digitisation (float): digitisation parameter + number (int): channel number + sampling_rate (int): number of raw samples per second + """ + +class regionStats(namedtuple( + 'regionStats', ('reg_frac_standard_base', 'reg_poss', 'chrm', 'strand', 
+ 'start', 'reg_cov', 'ctrl_cov', 'valid_cov'))): + """Region statistics + + Args: + reg_frac_standard_base (np.array::np.float64): fraction of standard bases + reg_poss (np.array::np.int64): positions for reported fractions + chrm (str): chromosome name + strand (str): strand (should be '+' or '-') + start (int): 0-based region start + reg_cov (np.array::np.int64): region read depth + ctrl_cov (np.array::np.int64): region control sample read depth + valid_cov (np.array::np.int64): region valid (tested) read depth + """ -genomeLoc = namedtuple( - 'genomeLoc', ('Start', 'Strand', 'Chrom')) +class seqSampleType(namedtuple( + 'seqSampleType', ('name', 'rev_sig'))): + """Description of a sequencing sample type -# single base conversion for motifs -SINGLE_LETTER_CODE = { - 'A':'A', 'C':'C', 'G':'G', 'T':'T', 'B':'[CGT]', - 'D':'[AGT]', 'H':'[ACT]', 'K':'[GT]', 'M':'[AC]', - 'N':'[ACGT]', 'R':'[AG]', 'S':'[CG]', 'V':'[ACG]', - 'W':'[AT]', 'Y':'[CT]'} -INVALID_BASES = re.compile('[^ACGT]') + Args: + name (str): name of the sequencing sample + rev_sig (bool): Is the raw signal reversed (3' to 5') + """ ###################################### ###### Various Helper Functions ###### ###################################### -def _status_message(message, indent=False): +def status_message(message, indent=False): pre_str = '\t' if indent else '' sys.stderr.write(pre_str + strftime('[%H:%M:%S] ') + message + '\n') sys.stderr.flush() return -def _warning_message(message): +def warning_message(message): sys.stderr.write( '*' * 20 + ' WARNING ' + '*' * 20 + '\n\t' + message + '\n') sys.stderr.flush() return -def _error_message_and_exit(message): +def error_message_and_exit(message): sys.stderr.write( '*' * 20 + ' ERROR ' + '*' * 20 + '\n\t' + message + '\n') @@ -155,44 +348,49 @@ def _error_message_and_exit(message): return def resolve_path(fn_path): - """ - Helper function to resolve relative and linked paths that might + """Helper function to resolve relative and linked paths that might give other packages problems. 
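+
+    Example (input path is hypothetical)::
+
+        fasta_fn = tombo_helper.resolve_path('~/genomes/../genomes/e_coli.fasta')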
""" return os.path.realpath(os.path.expanduser(fn_path)) COMP_BASES = dict(zip(map(ord, 'ACGT'), map(ord, 'TGCA'))) def comp_seq(seq): - """ - Complement DNA sequence + """Complement DNA sequence """ return seq.translate(COMP_BASES) def rev_comp(seq): - """ - Reverse complement DNA sequence + """Reverse complement DNA sequence """ return seq.translate(COMP_BASES)[::-1] +def invalid_seq(seq): + return bool(INVALID_BASES.search(seq)) + U_TO_T = {ord('U'):ord('T')} def rev_transcribe(seq): - """ - Convert U bases to T + """Convert U bases to T """ return seq.translate(U_TO_T) -def get_chrm_sizes(raw_read_coverage, raw_read_coverage2=None): - """ - Get covered chromosome sizes from a set of reads +def get_mean_q_score(read_q): + if sys.version_info[0] > 2: + return np.mean([q_val - PHRED_BASE + for q_val in read_q.encode('ASCII')]) + return np.mean([ord(q_val) - PHRED_BASE + for q_val in read_q.encode('ASCII')]) + +def get_chrm_sizes(reads_index, ctrl_reads_index=None): + """Get covered chromosome sizes from a set of reads """ strand_chrm_sizes = defaultdict(list) - for (chrm, strand), cs_read_cov in raw_read_coverage.items(): + for (chrm, strand), cs_read_cov in reads_index: try: strand_chrm_sizes[chrm].append(max( r_data.end for r_data in cs_read_cov)) except ValueError: continue - if raw_read_coverage2 is not None: - for (chrm, strand), cs_read_cov in raw_read_coverage2.items(): + if ctrl_reads_index is not None: + for (chrm, strand), cs_read_cov in ctrl_reads_index: try: strand_chrm_sizes[chrm].append(max( r_data.end for r_data in cs_read_cov)) @@ -209,6 +407,8 @@ def get_chrm_sizes(raw_read_coverage, raw_read_coverage2=None): return chrm_sizes def parse_genome_locations(genome_locs, default_strand=None): + """Parse genome location strings and convert to 0-based coordinates + """ parsed_locs = [] for chrm_pos_strand in genome_locs: # strip off any quotes and return up to the first 3 values @@ -216,16 +416,17 @@ def parse_genome_locations(genome_locs, default_strand=None): "'", "").split(':')[:3] # default to plus strand if not specified if len(split_vals) == 1: - _error_message_and_exit( + error_message_and_exit( 'Invalid genome location provided: ' + chrm_pos_strand + '\n\t\tTry adding quotation marks around specified genome ' + 'locations (especially for sequence identifiers with ' + 'special characters).') elif len(split_vals) == 2: parsed_locs.append(( - split_vals[0], split_vals[1], default_strand)) + split_vals[0], int(split_vals[1]) - 1, default_strand)) else: - parsed_locs.append(split_vals) + parsed_locs.append(( + split_vals[0], int(split_vals[1]) - 1, split_vals[2])) return parsed_locs @@ -244,9 +445,9 @@ def parse_genome_regions(all_regs_text): reg_pos = list(map(lambda x: int(x.replace(',','')), reg_pos.split('-'))) else: - raise NotImplementedError + raise TomboError('Invalid region text provided.') except: - _error_message_and_exit( + error_message_and_exit( 'Invalid [--include-region] format.') parsed_regs[chrm].append(reg_pos) @@ -257,11 +458,39 @@ def parse_genome_regions(all_regs_text): return parsed_regs +def parse_obs_filter(obs_filter): + """Parse observations per base formatted filtering + """ + if len(obs_filter) < 1: + return None + + # parse obs_filter + try: + obs_filter = [list(map(int, pctl_nobs.split(':'))) + for pctl_nobs in obs_filter] + except: + raise TomboError('Invalid format for observation filter') + + if any(pctl < 0 or pctl > 100 for pctl in map(itemgetter(0), obs_filter)): + error_message_and_exit('Invalid percentile value.') + + return obs_filter + 
class TomboMotif(object):
+    """Description of a sequence motif, including a potentially modified position
+
+    Attributes:
+        raw_motif (str): input raw motif string
+        motif_len (int): length of motif
+        motif_pat (``re.compile``): python regular expression for this motif
+        rev_comp_pat (``re.compile``): python regular expression for reverse complement of this motif
+        is_palindrome (bool): is the motif palindromic (in the genomic sense, not the literary sense)
+        mod_pos (int): modified base position within motif (1-based)
+        mod_base (str): modified base (should generally be A, C, G or T)
+
+    .. automethod:: __init__
+    """
     def _parse_motif(self, rev_comp_motif=False):
-        """
-        Parse a single letter code motif into a pattern for matching
-        """
         conv_motif = ''.join(SINGLE_LETTER_CODE[letter]
                              for letter in self.raw_motif)
         if rev_comp_motif:
@@ -271,10 +500,16 @@ def _parse_motif(self, rev_comp_motif=False):
         return re.compile(conv_motif)
 
     def __init__(self, raw_motif, mod_pos=None):
+        """Parse string motif
+
+        Args:
+            raw_motif (str): sequence motif. Supports IUPAC single letter codes (use T for RNA).
+            mod_pos (int): 1-based position of the modified base within the motif
+        """
         invalid_chars = re.findall(
             '[^' + ''.join(SINGLE_LETTER_CODE) + ']', raw_motif)
         if len(invalid_chars) > 0:
-            _error_message_and_exit(
+            error_message_and_exit(
                 'Invalid characters in motif: ' + ', '.join(invalid_chars))
 
         # basic motif parsing
@@ -290,28 +525,52 @@ def __init__(self, raw_motif, mod_pos=None):
         if mod_pos is None:
             self.mod_base = None
         else:
+            assert 0 < mod_pos <= self.motif_len
             self.mod_base = raw_motif[mod_pos - 1]
             if INVALID_BASES.match(self.mod_base):
-                _warning_message(
+                warning_message(
                     'Provided modified position is not a single base, which ' +
                     'is likely an error. Specified modified base is one of: ' +
                     ' '.join(SINGLE_LETTER_CODE[self.mod_base][1:-1]))
 
+def parse_motif_descs(stat_motif_descs):
+    """Parse string motif descriptions as defined by ``tombo plot roc --motif-descriptions``
 
-def invalid_seq(seq):
-    return bool(INVALID_BASES.search(seq))
+    Args:
+        stat_motif_descs (str): string motif descriptions (see ``tombo plot roc --motif-descriptions``)
+
+    Returns:
+        list of tuples with :class:`tombo.tombo_helper.TomboMotif` and motif/modification names
+    """
+    parsed_motif_descs = []
+    try:
+        for motif_desc in stat_motif_descs.split('::'):
+            raw_motif, mod_pos, mod_name = motif_desc.split(':')
+            motif = TomboMotif(raw_motif, int(mod_pos))
+            parsed_motif_descs.append((motif, mod_name))
+    except:
+        error_message_and_exit(
+            'Invalid motif descriptions format. Format descriptions as: ' +
+            '"motif:mod_pos:name[::motif2:mod_pos2:name2...]".')
+
+    return parsed_motif_descs
 

 ###########################
 ###### FASTA Parsing ######
 ###########################
 
+def get_rec_names(fasta_fn):
+    with io.open(fasta_fn) as fasta_fp:
+        all_rec_ids = [line.replace(">","").split()[0] for line in fasta_fp
+                       if line.startswith('>')]
+
+    return all_rec_ids
+
 class Fasta(object):
-    """
-    Fasta sequence format wrapper class.
+    """Fasta file sequence format wrapper class. Will load faidx via
+    ``pyfaidx`` package if installed, else the fasta will be loaded into
+    memory for sequence extraction.
 
-    Will load faidx via pyfaidx package if installed, else the fasta will be
-    loaded into memory for sequence extraction
+    .. 
automethod:: __init__
     """
     def _load_in_mem(self):
         genome_index = {}
@@ -350,6 +609,15 @@ def _index_contains_uridines(self, n_chrms=10, n_bases=1000):
 
     def __init__(self, fasta_fn, dry_run=False, force_in_mem=False,
                  assume_dna_base=False):
+        """Load a FASTA file
+
+        Args:
+
+            fasta_fn (str): path to fasta file
+            dry_run (bool): when pyfaidx is not installed, don't actually read sequence into memory.
+            force_in_mem (bool): force genome to be loaded into memory even if pyfaidx is installed allowing on-disk access
+            assume_dna_base (bool): skip check for DNA or RNA bases (default: False)
+        """
         self.fasta_fn = resolve_path(fasta_fn)
         self.has_rna_bases = False
         try:
@@ -359,9 +627,9 @@ def __init__(self, fasta_fn, dry_run=False, force_in_mem=False,
             try:
                 self.index = pyfaidx.Faidx(self.fasta_fn)
             except UnicodeDecodeError:
-                _error_message_and_exit(
+                error_message_and_exit(
                     'FASTA file does not appear to be formatted correctly.')
-        except:
+        except ImportError:
             self.has_pyfaidx = False
             if not dry_run:
                 self.index = self._load_in_mem()
@@ -370,11 +638,17 @@ def __init__(self, fasta_fn, dry_run=False, force_in_mem=False,
                 self._index_contains_uridines())
 
     def get_seq(self, chrm, start=None, end=None, error_end=True):
-        """
-        Extract sequence from a specific genomic region.
+        """Extract sequence from a specific genomic region. Note that start
+        and end must either both be provided or both be omitted; otherwise
+        they will be ignored.
+
+        Args:
 
-        Note if provided, start and end must both be provided or they will
-        be ignored.
+            chrm (str): chromosome name
+            start (int): 0-based start position
+            end (int): 0-based open-interval end position
+            error_end (bool): raise an error when the region requested extends beyond the chromosome (default: True)
+
+        Returns:
+            Genomic sequence requested. Sequence is converted to RNA sequence if applicable.
         """
         if self.has_pyfaidx:
             if not (start or end):
@@ -383,7 +657,7 @@ def get_seq(self, chrm, start=None, end=None, error_end=True):
             elif (start < 0 or start > self.index.index[chrm].rlen or (
                     error_end and (
                         end < 0 or end > self.index.index[chrm].rlen))):
-                raise NotImplementedError(
+                raise TomboError(
                     'Encountered invalid genome sequence request.')
             else:
                 r_seq = self.index.fetch(chrm, start + 1, end).seq.upper()
@@ -393,7 +667,7 @@ def get_seq(self, chrm, start=None, end=None, error_end=True):
                     error_end and end is not None and
                     (end < 0 or end > len(self.index[chrm]))):
-                raise NotImplementedError(
+                raise TomboError(
                     'Encountered invalid genome sequence request.')
             r_seq = self.index[chrm][start:end].upper()
@@ -403,6 +677,8 @@ def get_seq(self, chrm, start=None, end=None, error_end=True):
         return r_seq
 
     def iter_chrms(self):
+        """Iterate over chromosome names
+        """
         if self.has_pyfaidx:
             for chrm in self.index.index:
                 yield unicode(chrm)
@@ -421,134 +697,107 @@ def __contains__(self, chrm):
 #############################################
 
 def is_read_rna(fast5_data):
-    """
-    Determine if a read is RNA or DNA
+    """Determine if a read is RNA or DNA
+
+    Args:
+        fast5_data (``h5py.File``): open read h5py File object
+
+    Returns:
+        Boolean, indicating whether the read appears to be RNA or DNA
+
+    Note:
+        This function uses the read meta-data and so non-standard processing pipelines may not produce expected values.
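+
+    Example (filename is hypothetical)::
+
+        with h5py.File('path/to/read.fast5', 'r') as fast5_data:
+            read_is_rna = tombo_helper.is_read_rna(fast5_data)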
""" # check both experiment type and kit slots for "rna" - exp_type, exp_kit = None, None + exp_type = fast5_data['UniqueGlobalKey/context_tags'].attrs.get( + 'experiment_type') try: - exp_type = fast5_data['UniqueGlobalKey/context_tags'].attrs[ - 'experiment_type'] - try: - exp_type = exp_type.decode() - except (AttributeError, TypeError): - pass + exp_type = exp_type.decode() # remove the word internal since it contains rna. exp_type = exp_type.replace('internal', '') - except: + except (AttributeError, TypeError): pass + + exp_kit = fast5_data['UniqueGlobalKey/context_tags'].attrs.get( + 'experiment_kit') try: - exp_kit = fast5_data['UniqueGlobalKey/context_tags'].attrs[ - 'experiment_kit'] - try: - exp_kit = exp_kit.decode() - except (AttributeError, TypeError): - pass + exp_kit = exp_kit.decode() # remove the word internal since it contains rna. exp_kit = exp_kit.replace('internal', '') - except: + except (AttributeError, TypeError): pass if exp_type is None and exp_kit is None: - rna = False - else: - rna = ( - (exp_type is not None and re.search('rna', exp_type) is not None) or - (exp_kit is not None and re.search('rna', exp_kit) is not None)) + return False - return rna + return ( + (exp_type is not None and re.search('rna', exp_type) is not None) or + (exp_kit is not None and re.search('rna', exp_kit) is not None)) -def is_rna(raw_read_coverage, n_reads=10): - """ - Determine if a set of reads are RNA or DNA from a small sample +def is_sample_rna(reads_index=None, fast5_fns=None, n_reads=50): + """Determine if a set of reads are RNA or DNA from a small sample. Must provide either reads_index or fast5_fns. + + Args: + reads_index (:class:`tombo.tombo_helper.TomboReads`) + fast5_fns (list): list of fast5 read filename + n_reads (int): number of reads to check (default: 50) + + Returns: + False if any read does not appear to be an RNA read (see :class:`tombo.tombo_helper.is_read_rna`) else return True. 
""" proc_reads = 0 - for cs_reads in raw_read_coverage.values(): - for r_data in cs_reads: + if reads_index is not None: + for r_data in reads_index.iter_reads(): if not r_data.rna: return False proc_reads += 1 if proc_reads >= n_reads: break - if proc_reads >= n_reads: - break - return True + elif fast5_fns is not None: + for fast5_fn in fast5_fns: + try: + with h5py.File(fast5_fn, 'r') as fast5_data: + if not is_read_rna(fast5_data): + return False + proc_reads += 1 + except: + continue + if proc_reads >= n_reads: + break + else: + raise TomboError( + 'Must provide either reads_index or fast5_fns to ' + + 'determine is sample is RNA.') -def is_rna_from_files(fast5_fns, n_reads=10): - """ - Determine if a set of files are RNA or DNA from a small sample - """ - proc_reads = 0 - for fast5_fn in fast5_fns: - try: - with h5py.File(fast5_fn, 'r') as fast5_data: - if not is_read_rna(fast5_data): - return False - proc_reads += 1 - except: - continue - if proc_reads >= n_reads: - break return True +def get_seq_sample_type(fast5_data=None, reads_index=None, fast5_fns=None, + num_reads=50): + """Get the sequencing sample type from a single read or set of reads -######################################### -###### Index File/Filter Functions ###### -######################################### - -def get_index_fn(fast5s_dir, corr_grp): + Args: + fast5_data (`h5py.File`): open read h5py File object + reads_index (:class:`tombo.tombo_helper.TomboReads`) + fast5_fns (list): FAST5 read filenames + num_reads (int): sample of reads to check for sample type """ - Get the filename for the requested directory and corrected group - """ - # if directory comes with trailing slash, remove for processing - if fast5s_dir.endswith('/'): - fast5s_dir = fast5s_dir[:-1] - split_dir = os.path.split(fast5s_dir) - return os.path.join(split_dir[0], "." + split_dir[1] + - "." 
+ corr_grp + '.tombo.index') - -def load_index_data(fast5s_dir, corr_grp): - fast5s_dir = (fast5s_dir if fast5s_dir.endswith('/') else - fast5s_dir + '/') - index_fn = get_index_fn(fast5s_dir, corr_grp) - try: - import cPickle as pickle - except: - import pickle - with io.open(index_fn, 'rb') as index_fp: - raw_index_data = pickle.load(index_fp) - - num_index_vals = len(next(iter(raw_index_data.values()))[0]) - if num_index_vals == 8: - def convert_r_data(from_base_fn, start, end, rsrtr, - c_grp, s_grp, filtered, rna): - return readData(start, end, filtered, rsrtr, strand, - os.path.join(fast5s_dir, from_base_fn), - corr_grp + '/' + s_grp, rna) - elif num_index_vals == 10: - def convert_r_data( - from_base_fn, start, end, rsrtr, c_grp, s_grp, filtered, rna, - sig_match_score, mean_q_score): - return readData(start, end, filtered, rsrtr, strand, - os.path.join(fast5s_dir, from_base_fn), - corr_grp + '/' + s_grp, rna, - sig_match_score, mean_q_score) - else: - raise NotImplementedError('Invalid Tombo index file.') + if fast5_data is None and reads_index is None and fast5_fns is None: + raise TomboError('Must provide either fast5_data, reads_index, or ' + + 'fast5_fns to determine sequencing sample type.') + if fast5_data is None: + return seqSampleType(RNA_SAMP_TYPE, True) if is_sample_rna( + reads_index, fast5_fns, num_reads) else seqSampleType( + DNA_SAMP_TYPE, False) + return seqSampleType(RNA_SAMP_TYPE, True) if is_read_rna( + fast5_data) else seqSampleType(DNA_SAMP_TYPE, False) - raw_read_coverage = {} - for (chrm, strand), cs_raw_data in raw_index_data.items(): - cs_data = [convert_r_data(*r_data) for r_data in cs_raw_data] - # don't add chrm/strand if all reads are filtered - if len(cs_data) > 0: - raw_read_coverage[(chrm, strand)] = cs_data - return fast5s_dir, index_fn, raw_read_coverage +############################ ###### Lock Functions ###### ############################ def get_lock_fn(fast5s_dir): - """ - Get filename for the lock file to indicate that this directory - is currently being processed. This file should be saved to be deleted later. + """Get the filename for the lock file indicating that this directory is currently being processed. Save the returned filename so the lock file can be deleted once processing completes.
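+
+    Example (a minimal sketch mirroring how the re-squiggle command uses these lock files; the directory is a placeholder)::
+
+        lock_fn = get_lock_fn('path/to/fast5s')
+        open(lock_fn, 'w').close()  # create the lock file
+        # ... process the directory ...
+        os.remove(lock_fn)          # then remove the lock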
""" # if directory comes with trailing slash, remove for processing if fast5s_dir.endswith('/'): @@ -564,340 +813,6 @@ def _is_lock_file(lock_fn): return False return True -def prep_index_data( - fast5_fn, genome_loc, read_start_rel_to_raw, segs, - corr_grp, subgroup, rna, is_filtered=False, sig_match_score=None, - mean_q_score=None): - """ - Prepare data for storage in the index file - """ - mapped_end = genome_loc.Start + len(segs) - 1 - - return ((genome_loc.Chrom, genome_loc.Strand), readData( - genome_loc.Start, mapped_end, is_filtered, read_start_rel_to_raw, - genome_loc.Strand, fast5_fn, corr_grp + '/' + subgroup, rna, - sig_match_score, mean_q_score)) - -def write_index_file(all_index_data, index_fn, basedir): - """ - Write index file - """ - try: - import cPickle as pickle - except: - import pickle - index_data = defaultdict(list) - for chrm_strand, rd in all_index_data: - # clip the basedir off the FAST5 filename in case later functions are - # called from another relative path - from_base_fn = rd.fn.replace(basedir, '') - index_data[chrm_strand].append(( - from_base_fn, rd.start, rd.end, rd.read_start_rel_to_raw, - rd.corr_group.split('/')[0], rd.corr_group.split('/')[-1], - rd.filtered, rd.rna, rd.sig_match_score, rd.mean_q_score)) - - with io.open(index_fn, 'wb') as index_fp: - # note protocol 2 for py2/3 compatibility - pickle.dump(dict(index_data), index_fp, protocol=2) - - return - -def clear_filters(fast5s_dir, corr_grp): - """ - Clear filters applied to this directories index files - """ - _status_message('Loading index data.') - try: - fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp) - except IOError: - _error_message_and_exit( - 'Filters can only be applied to runs ' + - 'with a Tombo index file. Re-run resquiggle without the ' + - '--skip-index option to apply filters.') - - _status_message('Clearing all filters.') - new_index_data = [] - for chrm_strand, cs_raw_data in index_data.items(): - new_index_data.extend([(chrm_strand, rd._replace(filtered=False)) - for rd in cs_raw_data]) - - write_index_file(new_index_data, index_fn, fast5s_dir) - _status_message('All filters successfully cleared!') - - return - -def parse_obs_filter(obs_filter): - """ - Parse observations per base formatted filtering - """ - if len(obs_filter) < 1: - return None - - # parse obs_filter - try: - obs_filter = [list(map(int, pctl_nobs.split(':'))) - for pctl_nobs in obs_filter] - except: - raise RuntimeError('Invalid format for observation filter') - - if any(pctl < 0 or pctl > 100 for pctl in map(itemgetter(0), obs_filter)): - _error_message_and_exit('Invalid percentile value.') - - return obs_filter - -def filter_reads_for_stuck(fast5s_dir, corr_grp, obs_filter): - """ - Filter reads based on some observation per base threshold criteria - """ - def read_is_stuck(fast5_fn, s_grp): - try: - with h5py.File(fast5_fn, 'r') as fast5_data: - base_lens = fast5_data['/Analyses/' + s_grp + '/Events']['length'] - return any(np.percentile(base_lens, pctl) > thresh - for pctl, thresh in obs_filter) - except: - raise - return True - - _status_message('Loading index data.') - try: - fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp) - except IOError: - _error_message_and_exit( - 'Filters can only be applied to runs with a Tombo index file. 
' + - 'Re-run resquiggle without the --skip-index option to apply ' + - 'filters.') - - _status_message('Filtering stuck reads.') - filt_index_data = [] - prev_unfilt_reads, num_filt_reads = 0, 0 - for chrm_strand, cs_raw_data in index_data.items(): - prev_unfilt_reads += len(cs_raw_data) - sum([ - rd.filtered for rd in cs_raw_data]) - cs_filt_reads = [(chrm_strand, rd._replace( - filtered = rd.filtered or read_is_stuck(rd.fn, rd.corr_group))) - for rd in cs_raw_data] - num_filt_reads += sum([i_data[1].filtered for i_data in cs_filt_reads]) - filt_index_data.extend(cs_filt_reads) - - _status_message( - 'Filtered ' + unicode(num_filt_reads) + ' reads due to observations ' + - 'per base filter from a total of ' + unicode(prev_unfilt_reads) + - ' reads in ' + fast5s_dir + '.') - - write_index_file(filt_index_data, index_fn, fast5s_dir) - - return - -def filter_reads_for_coverage(fast5s_dir, corr_grp, frac_to_filter): - _status_message('Loading index data.') - try: - fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp) - except IOError: - _error_message_and_exit( - 'Filters can only be applied to runs with a Tombo index file. ' + - 'Re-run resquiggle without the --skip-index option to apply ' + - 'filters.') - - _status_message('Filtering reads to obtain more uniform coverage.') - unfilt_data = [] - unfilt_reads_cov = [] - prev_filt_data = [] - for chrm_strand, cs_raw_data in index_data.items(): - # compute coverage - max_end = max(rd.end for rd in cs_raw_data) - cs_coverage = np.zeros(max_end, dtype=np.int64) - for rd in cs_raw_data: - if rd.filtered: continue - cs_coverage[rd.start:rd.end] += 1 - # assign coverage value to each read - for rd in cs_raw_data: - if rd.filtered: - prev_filt_data.append((chrm_strand, rd)) - continue - # add approximate coverage from middle of read - # faster than mean over the whole read - unfilt_reads_cov.append(cs_coverage[ - rd.start + ((rd.end - rd.start) // 2)]) - unfilt_data.append((chrm_strand, rd)) - - num_reads = len(unfilt_data) - if num_reads == 0: - _error_message_and_exit( - 'No unfiltered reads present in current Tombo index.') - num_filt_reads = int(frac_to_filter * num_reads) - _status_message( - 'Filtering ' + unicode(num_filt_reads) + - ' reads due even coverage filter from a total of ' + - unicode(num_reads) + ' reads in ' + fast5s_dir + '.') - - # create probabilities array with coverage values normalized to sum to 1 - unfilt_reads_cov = np.array(unfilt_reads_cov, dtype=np.float) - unfilt_reads_p = unfilt_reads_cov / unfilt_reads_cov.sum() - # randomly chose reads to filter - filt_indices = np.random.choice( - num_reads, size=num_filt_reads, replace=False, p=unfilt_reads_p) - filt_index_data = [ - (chrm_strand, rd._replace(filtered=True)) - for chrm_strand, rd in itemgetter(*filt_indices)(unfilt_data)] - unfilt_index_data = list(itemgetter(*list(set(range(num_reads)).difference( - filt_indices)))(unfilt_data)) - - write_index_file(prev_filt_data + filt_index_data + unfilt_index_data, - index_fn, fast5s_dir) - - return - -def filter_reads_for_qscore(fast5s_dir, bc_grp, corr_grp, q_score_thresh): - """ - Filter reads based on mean q-score - """ - def read_fails_q_score(fast5_fn, s_grp): - try: - with h5py.File(fast5_fn, 'r') as fast5_data: - r_q_scores = fast5_data['/Analyses/' + bc_grp + '/' + s_grp + - '/Fastq'].value.decode().split('\n')[3] - if sys.version_info[0] > 2: - return np.mean( - [q_val - PHRED_BASE for q_val in - r_q_scores.encode('ASCII')]) < q_score_thresh - else: - return np.mean( - [ord(q_val) - PHRED_BASE 
for q_val in - r_q_scores.encode('ASCII')]) < q_score_thresh - except: - return True - - _status_message('Loading index data.') - try: - fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp) - except IOError: - _error_message_and_exit( - 'Filters can only be applied to runs with a Tombo index file. ' + - 'Re-run resquiggle without the --skip-index option to ' + - 'apply filters.') - - _status_message('Filtering reads below a mean q-score cutoff.') - filt_index_data = [] - num_filt_reads, prev_unfilt_reads = 0, 0 - for chrm_strand, cs_raw_data in index_data.items(): - cs_prev_filt_reads = sum([ - rd.filtered for rd in cs_raw_data]) - prev_unfilt_reads += len(cs_raw_data) - cs_prev_filt_reads - cs_filt_reads = [ - (chrm_strand, rd._replace( - # if q_score was previously stored use that else get - # q-score from fast5 - filtered = rd.filtered or ( - read_fails_q_score(rd.fn, rd.corr_group.split('/')[-1]) - if rd.mean_q_score is None else - rd.mean_q_score < q_score_thresh))) - for rd in cs_raw_data] - num_filt_reads += sum([i_data[1].filtered - for i_data in cs_filt_reads]) - cs_prev_filt_reads - filt_index_data.extend(cs_filt_reads) - - _status_message( - 'Filtered ' + unicode(num_filt_reads) + ' reads due to q-score ' + - 'filter from a total of ' + unicode(prev_unfilt_reads) + ' reads in ' + - fast5s_dir + '.') - - write_index_file(filt_index_data, index_fn, fast5s_dir) - - return - -def filter_reads_for_signal_matching(fast5s_dir, corr_grp, sig_match_thresh): - """ - Filter reads based on mean half z-score matching to expected levels - """ - def read_fails_matching_score(fast5_fn, corr_group): - try: - with h5py.File(fast5_fn, 'r') as fast5_data: - return fast5_data['/Analyses/' + corr_group].attrs[ - 'signal_match_score'] > sig_match_thresh - except: - return True - - _status_message('Loading index data.') - try: - fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp) - except IOError: - _error_message_and_exit( - 'Filters can only be applied to runs with a Tombo index file. 
' + - 'Re-run resquiggle without the --skip-index option to ' + - 'apply filters.') - - _status_message('Filtering reads above a signal matching score threshold.') - filt_index_data = [] - num_filt_reads, prev_unfilt_reads = 0, 0 - for chrm_strand, cs_raw_data in index_data.items(): - cs_prev_filt_reads = sum([rd.filtered for rd in cs_raw_data]) - prev_unfilt_reads += len(cs_raw_data) - cs_prev_filt_reads - cs_filt_reads = [ - (chrm_strand, rd._replace( - # if sig_match_score was previously stored use that else get - # sig_match_score from fast5 - filtered = rd.filtered or ( - read_fails_matching_score(rd.fn, rd.corr_group) - if rd.sig_match_score is None else - rd.sig_match_score > sig_match_thresh))) - for rd in cs_raw_data] - num_filt_reads += sum([i_data[1].filtered for i_data in - cs_filt_reads]) - cs_prev_filt_reads - filt_index_data.extend(cs_filt_reads) - - _status_message( - 'Filtered ' + unicode(num_filt_reads) + - ' reads due to signal matching filter from a total of ' + - unicode(prev_unfilt_reads) + ' reads in ' + fast5s_dir + '.') - - write_index_file(filt_index_data, index_fn, fast5s_dir) - - return - -def filter_reads_for_genome_pos(fast5s_dir, corr_grp, include_regs): - """ - Filter reads to include or exclude genomic regions - """ - def read_not_included(start, end, chrm_include_regs): - if chrm_include_regs is None: - return False - return not any((start >= i_start and end <= i_end) - for i_start, i_end in chrm_include_regs) - - _status_message('Loading index data.') - try: - fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp) - except IOError: - _error_message_and_exit( - 'Filters can only be applied to runs with a Tombo index file. ' + - 'Re-run resquiggle without the --skip-index option to ' + - 'apply filters.') - - _status_message('Filtering reads outside of the specified genomic location.') - filt_index_data = [] - num_filt_reads, prev_unfilt_reads = 0, 0 - for (chrm, strand), cs_raw_data in index_data.items(): - cs_prev_filt_reads = sum([rd.filtered for rd in cs_raw_data]) - prev_unfilt_reads += len(cs_raw_data) - cs_prev_filt_reads - do_filter_cs_reads = chrm not in include_regs - cs_filt_reads = [((chrm, strand), rd._replace( - filtered = rd.filtered or do_filter_cs_reads or read_not_included( - rd.start, rd.end, include_regs[chrm]))) - for rd in cs_raw_data] - num_filt_reads += sum([i_data[1].filtered for i_data in - cs_filt_reads]) - cs_prev_filt_reads - filt_index_data.extend(cs_filt_reads) - - _status_message( - 'Filtered ' + unicode(num_filt_reads) + - ' reads due to genomic position filter from a total of ' + - unicode(prev_unfilt_reads) + ' reads in ' + fast5s_dir + '.') - - write_index_file(filt_index_data, index_fn, fast5s_dir) - - return - ##################################### ###### FAST5 Parsing Functions ###### @@ -919,8 +834,7 @@ def reads_contain_basecalls(fast5_fns, bc_grp, num_reads): return False def get_files_list(fast5s_dir): - """ - Get all fast5 files recursively below this directory + """Get all fast5 files recursively below this directory """ all_fast5s = [] # walk through directory structure searching for fast5 files @@ -932,8 +846,7 @@ def get_files_list(fast5s_dir): return all_fast5s def clear_tombo_locks(lock_fns): - """ - Clear all lock files + """Clear all lock files """ for lock_fn in lock_fns: # safegaurd against incorrect file passed to this function @@ -949,9 +862,7 @@ def clear_tombo_locks(lock_fns): return def get_files_list_and_lock_dirs(fast5s_dir, ignore_locks): - """ - Get all fast5 files recursively 
below this directory and add a Tombo lock - file to indicate that this directory is currently being re-squiggled + """Get all fast5 files recursively below this directory and add a Tombo lock file to indicate that this directory is currently being re-squiggled """ ignore_locks_mess = ( 'This set of reads is currently being processed by another ' + @@ -968,7 +879,7 @@ def get_files_list_and_lock_dirs(fast5s_dir, ignore_locks): lock_fn = get_lock_fn(root) if not ignore_locks and os.path.exists(lock_fn): clear_tombo_locks(lock_fns) - _error_message_and_exit(ignore_locks_mess) + error_message_and_exit(ignore_locks_mess) lock_fns.append(lock_fn) # create empty file indicating this directory is locked open(lock_fn, 'w').close() @@ -978,170 +889,512 @@ def get_files_list_and_lock_dirs(fast5s_dir, ignore_locks): all_fast5s.append(os.path.join(root, fn)) except: clear_tombo_locks(lock_fns) - _error_message_and_exit( + error_message_and_exit( 'Unexpected error during file enumeration. Check that you have ' + 'write permission within the specified [fast5_basedir].') return all_fast5s, lock_fns def get_raw_read_slot(fast5_data): + """Get the raw read slot from this FAST5 read file + + Args: + fast5_data (`h5py.File`): open FAST5 read file object + + Example:: + + all_raw_signal = get_raw_read_slot(fast5_data)['Signal'][:] + + Returns: + The HDF5 group slot containing the raw signal data. + """ try: raw_read_slot = list(fast5_data['/Raw/Reads'].values())[0] - except: - raise NotImplementedError( - 'Raw data is not found in /Raw/Reads/Read_[read#]') + except KeyError: + raise TomboError('Raw data is not found in /Raw/Reads/Read_[read#]') return raw_read_slot -def parse_fast5s_wo_index(fast5_basedirs, corr_grp, bc_subgrps, rna): - """ - Parse re-squiggled reads data from a list of fast5 directories +class TomboReads(object): + """A set of reads with associated meta-data from re-squiggle processing + + .. automethod:: __init__ """ - def get_read_data(read_fn, fast5_data, bc_subgrp): - corr_data = fast5_data['/'.join(('/Analyses', corr_grp, bc_subgrp))] - - align_data = dict(corr_data['Alignment'].attrs.items()) - read_start_rel_to_raw = corr_data['Events'].attrs[ - 'read_start_rel_to_raw'] - chrm = align_data['mapped_chrom'] - strand = align_data['mapped_strand'] - try: - chrm = chrm.decode() - strand = strand.decode() - except: - pass + def _get_index_fn(self, fast5s_dir): + """Get the filename for the requested directory and corrected group + """ + # if directory comes with trailing slash, remove for processing + if fast5s_dir.endswith('/'): + fast5s_dir = fast5s_dir[:-1] + split_dir = os.path.split(fast5s_dir) + return os.path.join(split_dir[0], "." + split_dir[1] + + "."
+ self.corr_grp + '.tombo.index') + + + # index building and writing class functions + def _prep_for_writing(self, fast5s_dirs): + assert len(fast5s_dirs) == 1, ( + 'Must provide only a single FAST5 base directory when ' + + 'opening for writing.') + fast5s_dir = fast5s_dirs[0] + fast5s_dir = (fast5s_dir if fast5s_dir.endswith('/') else + fast5s_dir + '/') + index_fn = self._get_index_fn(fast5s_dir) + self.fast5s_dirs[fast5s_dir] = index_fn + if os.path.exists(index_fn): os.remove(index_fn) - return chrm, strand, readData( - align_data['mapped_start'], align_data['mapped_end'], - False, read_start_rel_to_raw, strand, read_fn, - corr_grp + '/' + bc_subgrp, rna) + # open default dict to fill with readData lists by (chrm, strand) + self.reads_index = defaultdict(list) + return - files = [fn for fast5_basedir in fast5_basedirs - for fn in get_files_list(fast5_basedir)] - raw_read_coverage = defaultdict(list) - for read_fn in files: - try: - with h5py.File(read_fn, 'r') as fast5_data: - for bc_subgrp in bc_subgrps: - chrm, strand, r_data = get_read_data( - read_fn, fast5_data, bc_subgrp) - raw_read_coverage[(chrm, strand)].append(r_data) - except: - # ignore errors and process all reads that don't error - continue + def add_read_data(self, chrm, strand, read_data): + """Add read data to the index - return dict(raw_read_coverage) + Args: + chrm (str): chromosome name + strand (str): strand ('+' or '-') + read_data (:class:`tombo.tombo_helper.readData`): read information + """ + self.reads_index[(chrm, strand)].append(read_data) + return -def convert_index(index_data, fast5s_dir, corr_grp, new_corr_grp): - """ - Convert an index and save under a new corrected group. Mostly for - model_resquiggle - """ - new_index_data = [] - for (chrm, strand), cs_raw_data in index_data.items(): - for rd in cs_raw_data: - if rd.corr_group.split('/')[0] != corr_grp: continue - new_index_data.append(((chrm, strand), rd._replace( - corr_group=new_corr_grp))) - - fast5s_dir = (fast5s_dir if fast5s_dir.endswith('/') else - fast5s_dir + '/') - new_index_fn = get_index_fn(fast5s_dir, new_corr_grp) - write_index_file(new_index_data, new_index_fn, fast5s_dir) + def replace_index(self, new_reads_index): + """Replace current reads index - return + Args: + new_reads_index (dict): dictionary with (chrm, strand) pointing to lists of :class:`tombo.tombo_helper.readData` objects + """ + if sum(len(x) for x in new_reads_index.values()) == 0: + raise TomboError('Cannot replace with an empty index.') -def parse_fast5s_w_index(fast5s_dir, corr_grp, subgroups, new_corr_grp): - """ - Use index file to parse information about a set of reads - """ - try: - fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp) - except UnicodeDecodeError: - _warning_message( - 'Invalid Tombo index file.\n\t\tThis occurs most often when the ' + - 're-squiggle command was completed using a Tombo build against ' + - 'a different python version (2 or 3).') - raise - raw_read_coverage = {} - for (chrm, strand), cs_raw_data in index_data.items(): - cs_data = [ - rd for rd in cs_raw_data - if rd.corr_group.split('/')[0] == corr_grp and - rd.corr_group.split('/')[-1] in subgroups and not rd.filtered] - # don't add chrm/strand if all reads are filtered - if len(cs_data) > 0: - raw_read_coverage[(chrm, strand)] = cs_data - if new_corr_grp is not None: - # convert corrected group to new corrected group for - # model re-squiggle - convert_index(index_data, fast5s_dir, corr_grp, new_corr_grp) - - return raw_read_coverage -def 
merge_cov(w_index_covs, wo_index_cov): - """ - Merge coverage from serveral parsed sets of data - """ - all_covs = w_index_covs + [wo_index_cov,] - raw_read_coverage = defaultdict(list) - for chrm_strand in set([cs for d_cov in all_covs for cs in d_cov]): - for dir_cov in all_covs: - if chrm_strand not in dir_cov: continue - raw_read_coverage[chrm_strand].extend(dir_cov[chrm_strand]) + self.reads_index = new_reads_index + return - return dict(raw_read_coverage) + def write_index_file(self): + """Write index file -def parse_fast5s(fast5_basedirs, corrected_group, basecall_subgroups, - new_corr_grp=None, rna=False, sample_name=None): - """ - Parse data from a list of re-squiggle fast5 directories - """ - if VERBOSE: - status_mess = ('Parsing Tombo index file(s).' if sample_name is None else - 'Parsing ' + sample_name + ' Tombo index file(s).') - _status_message(status_mess) - wo_index_dirs = [] - w_index_covs = [] - warn_index = False - # determine if index exists for each directory and load appropriately - for fast5s_dir in fast5_basedirs: - fast5s_dir = (fast5s_dir if fast5s_dir.endswith('/') else - fast5s_dir + '/') - if os.path.exists(get_index_fn(fast5s_dir, corrected_group)): + Note: + + Must be an index from only a single ``fast5s_dir`` + + Index filename will be: `.[fast5s_dir].[corr_grp].tombo.index` + Note that this is a hidden file (starts with a ".") + """ + status_message('Saving Tombo reads index to file.') + assert len(self.fast5s_dirs) == 1, ( + 'Cannot write index for TomboReads created from more than ' + + 'one base directory.') + basedir, index_fn = next(iter(self.fast5s_dirs.items())) + try: + import cPickle as pickle + except: + import pickle + + index_data = defaultdict(list) + for chrm_strand, cs_reads in self: + for rd in cs_reads: + # clip the basedir off the FAST5 filename in case later + # functions are called from another relative path and + # split corr_grp and bc_subgrp for easier filtering + index_data[chrm_strand].append(( + rd.fn.replace(basedir, ''), rd.start, rd.end, + rd.read_start_rel_to_raw, rd.corr_group.split('/')[0], + rd.corr_group.split('/')[-1], rd.filtered, rd.rna, + rd.sig_match_score, rd.mean_q_score, rd.read_id)) + + with io.open(index_fn, 'wb') as index_fp: + # note protocol 2 for py2/3 compatibility + pickle.dump(dict(index_data), index_fp, protocol=2) + + return + + + # Index parsing class functions + def _parse_fast5s_wo_index(self, wo_index_fast5s_dirs): + """Parse re-squiggled reads data from a list of fast5 directories + """ + def get_read_data(read_fn, fast5_data, bc_subgrp): + read_id = get_raw_read_slot(fast5_data).attrs.get('read_id') + corr_data = fast5_data[ + '/'.join(('/Analyses', self.corr_grp, bc_subgrp))] + rna = corr_data.attrs.get('rna') + rna = False if rna is None else rna + + align_data = dict(corr_data['Alignment'].attrs.items()) + read_start_rel_to_raw = corr_data['Events'].attrs.get( + 'read_start_rel_to_raw') + chrm = align_data['mapped_chrom'] + strand = align_data['mapped_strand'] try: - w_index_covs.append(parse_fast5s_w_index( - fast5s_dir, corrected_group, basecall_subgroups, - new_corr_grp)) + chrm = chrm.decode() + strand = strand.decode() except: - _warning_message( - 'Failed to parse tombo index file for ' + fast5s_dir + - ' directory. 
Creating index from FAST5 files.') - wo_index_dirs.append(fast5s_dir) - else: - if not warn_index: - _warning_message( - 'Tombo index file does not exist for one or more ' + - 'directories.\n\t\tIf --skip-index was not set for ' + - 're-squiggle command, ensure that the specified ' + - 'directory is the same as for the re-squiggle command.\n') - warn_index = True - wo_index_dirs.append(fast5s_dir) - wo_index_cov = parse_fast5s_wo_index( - wo_index_dirs, corrected_group, basecall_subgroups, rna) - raw_read_coverage = merge_cov(w_index_covs, wo_index_cov) + pass - return raw_read_coverage + return chrm, strand, readData( + align_data['mapped_start'], align_data['mapped_end'], + False, read_start_rel_to_raw, strand, read_fn, + self.corr_grp + '/' + bc_subgrp, rna, read_id=read_id) + + + files = [fn for fast5s_dir in wo_index_fast5s_dirs + for fn in get_files_list(fast5s_dir)] + dir_reads_index = defaultdict(list) + for read_fn in files: + try: + with h5py.File(read_fn, 'r') as fast5_data: + i_bc_subgrps = ( + fast5_data['/Analyses/' + self.corr_grp].keys() + if self.bc_subgrps is None else self.bc_subgrps) + for bc_subgrp in i_bc_subgrps: + chrm, strand, r_data = get_read_data( + read_fn, fast5_data, bc_subgrp) + dir_reads_index[(chrm, strand)].append(r_data) + except: + # ignore errors and process all reads that don't error + continue + + return dict(dir_reads_index) + + def _load_index_data(self, fast5s_dir): + try: + import cPickle as pickle + except: + import pickle + with io.open(self.fast5s_dirs[fast5s_dir], 'rb') as index_fp: + raw_index_data = pickle.load(index_fp) + + # determine the index type used. Index information was added around + # version 1.3 making the index data 2 elements longer. + # so check which one is used here. + try: + num_index_vals = len(next(iter(raw_index_data.values()))[0]) + except StopIteration: + raise TomboError('Tombo index file appears to be empty') + if num_index_vals == 8: + def convert_r_data(from_base_fn, start, end, rsrtr, + c_grp, s_grp, filtered, rna): + return readData(start, end, filtered, rsrtr, strand, + os.path.join(fast5s_dir, from_base_fn), + self.corr_grp + '/' + s_grp, rna) + elif num_index_vals == 10: + def convert_r_data( + from_base_fn, start, end, rsrtr, c_grp, s_grp, filtered, rna, + sig_match_score, mean_q_score): + return readData(start, end, filtered, rsrtr, strand, + os.path.join(fast5s_dir, from_base_fn), + self.corr_grp + '/' + s_grp, rna, + sig_match_score, mean_q_score) + elif num_index_vals == 11: + def convert_r_data( + from_base_fn, start, end, rsrtr, c_grp, s_grp, filtered, rna, + sig_match_score, mean_q_score, read_id): + return readData(start, end, filtered, rsrtr, strand, + os.path.join(fast5s_dir, from_base_fn), + self.corr_grp + '/' + s_grp, rna, + sig_match_score, mean_q_score, read_id) + else: + raise TomboError('Invalid Tombo index file.') + + dir_reads_index = {} + for (chrm, strand), cs_raw_data in raw_index_data.items(): + cs_data = [convert_r_data(*r_data) for r_data in cs_raw_data] + # don't add chrm/strand if all reads are filtered + if len(cs_data) > 0: + dir_reads_index[(chrm, strand)] = cs_data + + return dir_reads_index + + def _parse_fast5s_w_index(self, fast5s_dir): + """Use index file to parse information about a set of reads + """ + try: + curr_dir_reads_index = self._load_index_data(fast5s_dir) + except UnicodeDecodeError: + warning_message( + 'Invalid Tombo index file.\n\t\tThis occurs most often ' + + 'when the re-squiggle command was completed using a Tombo ' + + 'build against a different python 
version (2 or 3).') + raise TomboError + + if not self.remove_filtered and self.bc_subgrps is None: + return curr_dir_reads_index + + filt_dir_reads_index = {} + for (chrm, strand), cs_raw_data in curr_dir_reads_index.items(): + cs_data = [ + rd for rd in cs_raw_data + if rd.corr_group.split('/')[0] == self.corr_grp and + (self.bc_subgrps is None or + rd.corr_group.split('/')[-1] in self.bc_subgrps) and + (not self.remove_filtered or not rd.filtered)] + # don't add chrm/strand if all reads are filtered + if len(cs_data) > 0: + filt_dir_reads_index[(chrm, strand)] = cs_data + + return filt_dir_reads_index + + def _merge_dir_indices(self, w_index_ri, wo_index_ri): + """Merge coverage from several parsed sets of data + """ + all_raw_reads_index = w_index_ri + [wo_index_ri,] + reads_index = defaultdict(list) + for chrm_strand in set([cs for d_ri in all_raw_reads_index + for cs in d_ri]): + for dir_reads_index in all_raw_reads_index: + if chrm_strand not in dir_reads_index: continue + reads_index[chrm_strand].extend(dir_reads_index[chrm_strand]) + + return dict(reads_index) + + def _parse_fast5s(self, fast5s_dirs): + wo_index_dirs = [] + w_index_covs = [] + warn_index = False + # determine if index exists for each directory and load appropriately + for fast5s_dir in fast5s_dirs: + fast5s_dir = (fast5s_dir if fast5s_dir.endswith('/') else + fast5s_dir + '/') + self.fast5s_dirs[fast5s_dir] = self._get_index_fn(fast5s_dir) + if os.path.exists(self.fast5s_dirs[fast5s_dir]): + try: + w_index_covs.append(self._parse_fast5s_w_index(fast5s_dir)) + except TomboError: + warning_message( + 'Failed to parse tombo index file for ' + fast5s_dir + + ' directory. Creating index in memory from FAST5 files.') + wo_index_dirs.append(fast5s_dir) + else: + if not warn_index: + warning_message( + 'Tombo index file does not exist for one or more ' + + 'directories.\n\t\tIf --skip-index was not set for ' + + 're-squiggle command, ensure that the specified ' + + 'directory is the same as for the re-squiggle command.') + warn_index = True + wo_index_dirs.append(fast5s_dir) + + wo_index_cov = self._parse_fast5s_wo_index(wo_index_dirs) + self.reads_index = self._merge_dir_indices(w_index_covs, wo_index_cov) + + return + + def __init__(self, fast5s_basedirs, corrected_group='RawGenomeCorrected_000', + basecall_subgroups=None, for_writing=False, remove_filtered=True, + sample_name=None): + """Parse data from a list of re-squiggle fast5 directories + + Args: + fast5s_basedirs (list): fast5 base directories, which have been processed by ``tombo resquiggle`` + corrected_group (str): Analysis slot containing the re-squiggle information (optional; default: 'RawGenomeCorrected_000') + basecall_subgroups (list): basecall subgroups (optional; default: Process all basecall subgroups) + for_writing (bool): open TomboReads to write index (optional; default: Open for reading and parse re-squiggled reads) + remove_filtered (bool): remove filtered reads as indicated by index file (optional; default: True) + sample_name (str): for verbose output + """ + if VERBOSE and not for_writing: + status_mess = ('Parsing Tombo index file(s).'
+ if sample_name is None else + 'Parsing ' + sample_name + ' Tombo index file(s).') + status_message(status_mess) + assert isinstance(fast5s_basedirs, list), ( + 'fast5s_basedirs must be a list.') + + self.fast5s_dirs = {} + self.corr_grp = corrected_group + self.bc_subgrps = basecall_subgroups + self.sample_name = sample_name + self.remove_filtered = remove_filtered + self.for_writing = for_writing + self.coverage = None + if self.for_writing: + self._prep_for_writing(fast5s_basedirs) + else: + self._parse_fast5s(fast5s_basedirs) + + return + + def _compute_coverage(self): + if VERBOSE: status_message('Calculating read coverage.') + self.coverage = {} + for chrm_strand, cs_reads in self: + if len(cs_reads) == 0: continue + cs_coverage = np.zeros(max(r_data.end for r_data in cs_reads), + dtype=np.int64) + for r_data in cs_reads: + cs_coverage[r_data.start:r_data.end] += 1 + self.coverage[chrm_strand] = cs_coverage + return self + + def _add_coverages(self, ctrl_reads_index): + merged_cov = {} + # if self or control reads don't have coverage, compute it + if self.coverage is None: + self._compute_coverage() + if ctrl_reads_index.coverage is None: + ctrl_reads_index._compute_coverage() + + for chrm_strand, ctrl_cs_cov in ctrl_reads_index.coverage.items(): + if chrm_strand in self.coverage: + self_cs_cov = self.coverage[chrm_strand] + # copy longer array and add shorter array coverage + if self_cs_cov.shape[0] > ctrl_cs_cov.shape[0]: + merged_cs_cov = self_cs_cov.copy() + merged_cs_cov[:ctrl_cs_cov.shape[0]] += ctrl_cs_cov + else: + merged_cs_cov = ctrl_cs_cov.copy() + merged_cs_cov[:self_cs_cov.shape[0]] += self_cs_cov + else: + merged_cs_cov = ctrl_cs_cov.copy() + merged_cov[chrm_strand] = merged_cs_cov + + return merged_cov + + def iter_coverage_regions(self, ctrl_reads_index=None): + """Get genome coverage for a set of reads + + Args: + + ctrl_reads_index (:class:`tombo.tombo_helper.TomboReads`): a second set of tombo reads to add for coverage + + Returns: + Yields (chrm, strand, cs_cov, cs_cov_starts) indicating read coverage levels + + cs_cov and cs_cov_starts are numpy arrays containing the coverage level and the 0-based genomic start positions of those intervals + """ + if VERBOSE: status_message('Calculating read coverage regions.') + if self.coverage is None: + self._compute_coverage() + + coverage = (self.coverage if ctrl_reads_index is None else + self._add_coverages(ctrl_reads_index)) + for (chrm, strand), cs_cov in coverage.items(): + cs_cov_starts = np.concatenate([ + [0,], np.where(np.diff(cs_cov))[0] + 1, + [cs_cov.shape[0],]]) + cs_cov = cs_cov[cs_cov_starts[:-1]] + yield chrm, strand, cs_cov, cs_cov_starts + + return + + def get_all_cs(self): + """Get list of all (chromosome, strand) stored in index. + """ + return list(self.reads_index.keys()) + + def is_empty(self): + """Check whether the index is empty (i.e. no reads are stored).
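+
+        Returns:
+            True if no reads are stored in the index, else False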
+ """ + for cs_reads in self.reads_index.values(): + if len(cs_reads) > 0: + return False + return True + + def __contains__(self, chrm_strand): + return chrm_strand in self.reads_index + + def __iter__(self): + self._iter = iter(self.reads_index.items()) + return self + + def __next__(self): + return next(self._iter) + + # for python2 compatibility + def next(self): + return self.__next__() + + def iter_reads(self): + """Iterate over reads stored in the index + """ + for _, cs_reads in self: + for rd in cs_reads: + yield rd + return + + def get_cs_reads(self, chrm, strand, invalid_return=[]): + """Extract the list of reads stored in a specified chromosome and strand + + Args: + + chrm (str): chromosome name + strand (str): strand ('+' or '-') + invalid_return: value to return for invalid (chrm, strand) + """ + if (chrm, strand) not in self.reads_index: + return invalid_return + return self.reads_index[(chrm, strand)] + + def _get_strand_spec_cov(self, chrm, pos, strand, invalid_return): + if (chrm, strand) not in self.coverage: + return invalid_return + if pos >= self.coverage[(chrm, strand)].shape[0]: + return invalid_return + return self.coverage[(chrm, strand)][pos] + + def get_coverage(self, chrm, pos, strand=None, invalid_return=0): + """Get coverage at specified position + + Args: + chrm (str): chromosome name + pos (int): 0-based genomic position + strand (str): interval strand ('+', '-' or None). Default: None (max over both strands) + invalid__return: return value for invalid (bad chrm, strand or pos beyond coverage) position. Default: 0 + """ + if self.coverage is None: + self._compute_coverage() + if strand is None: + try: + return max( + self._get_strand_spec_cov(chrm, pos, '+', invalid_return), + self._get_strand_spec_cov(chrm, pos, '-', invalid_return)) + # with None invalid_return value could be max over 2 None's + except TypeError: + return invalid_return + return self._get_strand_spec_cov(chrm, pos, strand, invalid_return) + + def get_cs_coverage(self, chrm, strand, invalid_return=None): + """Extract coverage levels over a specified chromosome and strand + + Args: + + chrm (str): chromosome name + strand (str): strand ('+' or '-') + invalid_return: value to return for invalid (chrm, strand) + + Returns: + numpy array (numpy.int64) containing read coverage levels + """ + if self.coverage is None: + self._compute_coverage() + if (chrm, strand) not in self.coverage: + return invald_return + return self.coverage[(chrm, strand)] + + def iter_cs_coverage(self): + """Iterate over coverage levels across all stored (chrm, strands) + """ + if self.coverage is None: + self._compute_coverage() + return self.coverage.items() ########################################### -###### Events Table Access Functions ###### -########################################### +###### Events Table Access Functions ###### +########################################### def get_multiple_slots_read_centric(r_data, slot_names, corr_grp=None): - """ - Extract read-centric slot_names from this read's Events table + """Extract multiple read-centric slot_names from this read's Events table + + Args: + + fast5_data (:class:`tombo.tombo_helper.readData` or an open ``h5py.File`` object) + slot_name (str): slot from which to extract data (valid values: norm_mean, norm_stdev, start, length, base) + corr_grp (str): corrected group slot from which to extract data (default: use value from ``fast5_data`` object; required for ``h5py.File``) + + Returns: + A tuple of numpy arrays specified by the ``slot_names`` """ try: 
do_close = False @@ -1152,7 +1405,7 @@ def get_multiple_slots_read_centric(r_data, slot_names, corr_grp=None): event_slot_name = '/'.join(('/Analyses', corr_grp, 'Events')) # note that it's more efficient to try to access the slot # and except the error that check if the slot exists first - r_event_data = r_data[event_slot_name].value + r_event_data = r_data[event_slot_name][:] if do_close: r_data.close() except: # probably truncated file or events don't exist @@ -1161,8 +1414,13 @@ return [r_event_data[slot_name] for slot_name in slot_names] def get_single_slot_read_centric(r_data, slot_name, corr_grp=None): - """ - Extract read-centric slot_name from this read's Events table + """Extract read-centric slot_name from this read's Events table + + Args: + + r_data (:class:`tombo.tombo_helper.readData` or an open ``h5py.File`` object) + slot_name (str): slot from which to extract data (valid values: norm_mean, norm_stdev, start, length, base) + corr_grp (str): corrected group slot from which to extract data (default: use value from the ``readData`` object; required for ``h5py.File``) """ try: # if r_data is an open h5py object then don't open the filename @@ -1183,8 +1441,7 @@ def get_single_slot_read_centric(r_data, slot_name, corr_grp=None): return r_slot_values def get_single_slot_genome_centric(r_data, slot_name): - """ - Extract genome-centric slot_name from this read's Events table + """Extract genome-centric slot_name from this read's Events table """ r_slot_values = get_single_slot_read_centric(r_data, slot_name) if r_slot_values is None: @@ -1196,9 +1453,7 @@ def get_single_slot_genome_centric(r_data, slot_name): return r_slot_values def get_mean_slot_genome_centric(cs_reads, chrm_len, slot_name): - """ - Get the mean over all reads at each covered genomic location for this - slots value + """Get the mean over all reads at each covered genomic location for this slot's value """ base_sums = np.zeros(chrm_len) base_cov = np.zeros(chrm_len, dtype=np.int64) @@ -1214,10 +1469,9 @@ return base_sums / base_cov -def iter_mean_slot_values(raw_read_coverage, chrm_sizes, slot_name, - raw_read_coverage2=None): - """ - Iterate through chromosomes and strands yielding mean slots values over +def iter_mean_slot_values( + reads_index, chrm_sizes, slot_name, ctrl_reads_index=None): + """Iterate through chromosomes and strands yielding mean slot values over all reads at each covered genomic location. Generator returns chromosome, strand, cs_mean_values tuples (3 return values).
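+
+    Example (a minimal sketch; assumes ``reads_index`` is a :class:`tombo.tombo_helper.TomboReads` object and ``chrm_sizes`` is a dict mapping chromosome names to lengths)::
+
+        for chrm, strand, cs_means in iter_mean_slot_values(
+                reads_index, chrm_sizes, 'norm_mean'):
+            print(chrm, strand, np.nanmean(cs_means))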
@@ -1230,37 +1484,38 @@ def iter_mean_slot_values(raw_read_coverage, chrm_sizes, slot_name, # coverage so leave as nan for now old_err_settings = np.seterr(all='ignore') for chrm, strand in [(c, s) for c in sorted(chrm_sizes) for s in ('+', '-')]: - if raw_read_coverage2 is None: - if (chrm, strand) not in raw_read_coverage: continue + if ctrl_reads_index is None: + if (chrm, strand) not in reads_index: continue cs_mean_values = get_mean_slot_genome_centric( - raw_read_coverage[(chrm, strand)], chrm_sizes[chrm], slot_name) + reads_index.get_cs_reads(chrm, strand), + chrm_sizes[chrm], slot_name) yield chrm, strand, cs_mean_values else: - cs_mean_values, cs_mean_values2 = None, None - if (chrm, strand) in raw_read_coverage: + cs_mean_values, ctrl_cs_mean_values = None, None + if (chrm, strand) in reads_index: cs_mean_values = get_mean_slot_genome_centric( - raw_read_coverage[(chrm, strand)], chrm_sizes[chrm], - slot_name) - if (chrm, strand) in raw_read_coverage2: - cs_mean_values2 = get_mean_slot_genome_centric( - raw_read_coverage2[(chrm, strand)], chrm_sizes[chrm], - slot_name) - if cs_mean_values is None and cs_mean_values2 is None: continue - yield chrm, strand, cs_mean_values, cs_mean_values2 + reads_index.get_cs_reads(chrm, strand), + chrm_sizes[chrm], slot_name) + if (chrm, strand) in ctrl_reads_index: + ctrl_cs_mean_values = get_mean_slot_genome_centric( + ctrl_reads_index.get_cs_reads(chrm, strand), + chrm_sizes[chrm], slot_name) + if cs_mean_values is None and ctrl_cs_mean_values is None: continue + yield chrm, strand, cs_mean_values, ctrl_cs_mean_values _ = np.seterr(**old_err_settings) return def get_largest_signal_differences( - raw_read_coverage1, raw_read_coverage2, num_regions, num_bases): - chrm_sizes = get_chrm_sizes(raw_read_coverage1, raw_read_coverage2) + reads_index, ctrl_reads_index, num_regions, num_bases): + chrm_sizes = get_chrm_sizes(reads_index, ctrl_reads_index) all_largest_diff_poss = [] - for chrm, strand, cs_sig_means1, cs_sig_means2 in iter_mean_slot_values( - raw_read_coverage1, chrm_sizes, 'norm_mean', raw_read_coverage2): - if cs_sig_means1 is None or cs_sig_means2 is None: continue - chrm_diffs = np.nan_to_num(np.abs(cs_sig_means1 - cs_sig_means2)) + for chrm, strand, cs_sig_means, ctrl_cs_sig_means in iter_mean_slot_values( + reads_index, chrm_sizes, 'norm_mean', ctrl_reads_index): + if cs_sig_means is None or ctrl_cs_sig_means is None: continue + chrm_diffs = np.nan_to_num(np.abs(cs_sig_means - ctrl_cs_sig_means)) chrm_max_diff_regs = np.argsort(chrm_diffs)[::-1][:num_regions] all_largest_diff_poss.extend(( chrm_diffs[pos], max(pos - int(num_bases / 2.0), 0), @@ -1268,45 +1523,373 @@ def get_largest_signal_differences( return sorted(all_largest_diff_poss, reverse=True)[:num_regions] -def get_signal_differences(raw_read_coverage1, raw_read_coverage2): +def get_signal_differences(reads_index, ctrl_reads_index): + """Helper function to compute all signal differences """ - Helper function to compute all signal differences - """ - chrm_sizes = get_chrm_sizes(raw_read_coverage1, raw_read_coverage2) + chrm_sizes = get_chrm_sizes(reads_index, ctrl_reads_index) all_diffs = {} - for chrm, strand, cs_sig_means1, cs_sig_means2 in iter_mean_slot_values( - raw_read_coverage1, chrm_sizes, 'norm_mean', raw_read_coverage2): - if cs_sig_means1 is None or cs_sig_means2 is None: continue - all_diffs[(chrm, strand)] = np.nan_to_num(cs_sig_means1 - cs_sig_means2) + for chrm, strand, cs_sig_means, ctrl_cs_sig_means in iter_mean_slot_values( + reads_index, chrm_sizes, 
'norm_mean', ctrl_reads_index): + if cs_sig_means is None or ctrl_cs_sig_means is None: continue + all_diffs[(chrm, strand)] = np.nan_to_num( + cs_sig_means - ctrl_cs_sig_means) return all_diffs +#################################### +###### Genomic Interval Class ###### +#################################### + +class intervalData(object): + """Genome/transcriptome interval information + + Example:: + + int_data = tombo_helper.intervalData( + chrm='chr20', start=10000, end=10100, strand='+') + + Note: + All intervalData functions return the object in order to allow chaining of interval commands. (e.g. ``int_data.add_reads(reads_index).add_seq()``) + + .. automethod:: __init__ + """ + def __setattr__(self, name, value): + if name == 'chrm': + if not isinstance(value, unicode): + raise TypeError(name + ' must be a string') + elif name in ('start', 'end'): + if not (isinstance(value, int) or isinstance(value, np.integer)): + raise TypeError(name + ' must be an int') + elif name == 'strand': + if value not in (None, '+', '-'): + raise TypeError('strand must be either None, "+", or "-"') + elif name in ('reg_id', 'reg_text', 'seq'): + if value is not None and not isinstance(value, unicode): + raise TypeError(name + ' must be a string') + elif name == 'reads': + if value is not None and not isinstance(value, list): + raise TypeError('reads must be a list') + else: + raise ValueError(name + ' is an invalid attribute for intervalData') + super(intervalData, self).__setattr__(name, value) + + def __init__(self, chrm, start, end, strand=None, reg_id=None, + reg_text='', reads=None, seq=None): + """Initialize a genome/transcriptome interval object + + Args: + chrm (str): chromosome name. + start (int): 0-based start position. + end (int): 1-based (or open interval) end position. + strand (str): interval strand ('+', '-' or None). Default: None + reg_id (str): used to keep track of multiple intervals. Default: None + reg_text (str): region description text. Default: '' + reads (list): list of readData values overlapping region. Default: None + seq (str): region genomic sequence. Default: None + """ + self.chrm = unicode(chrm) + self.start = start + self.end = end + self.strand = unicode(strand) if strand is not None else None + self.reg_id = unicode(reg_id) if reg_id is not None else None + self.reg_text = unicode(reg_text) + self.reads = reads + self.seq = seq + + def update(self, **kwargs): + """Update slots specified in keyword arguments (kwargs) + """ + for k, v in kwargs.items(): + self.__setattr__(k, v) + # return self to allow chaining and auto-return + return self + + def __repr__(self): + # convert reads and seq if they are too long + self_vars = vars(self).copy() + if self_vars['reads'] is not None: + self_vars['reads'] = '<{:d} reads>'.format(len(self_vars['reads'])) + if self_vars['seq'] is not None and len(self_vars['seq']) > 50: + self_vars['seq'] = '<{:d} bases of sequence>'.format( + len(self_vars['seq'])) + self_vars['reg_text'] = '"' + self_vars['reg_text'] + '"' + return ' :\n' + '\n'.join( + "{:>15} : {}".format(k, str(v)) for k, v in self_vars.items()) + + def copy(self, include_reads=True): + """Create a copy of this interval. Useful when adding sets of reads from multiple samples to compare. 
+ + Args: + include_reads (bool): include current reads slot in the new object (default: True) + """ + cp_reads = self.reads if include_reads else None + return type(self)(self.chrm, self.start, self.end, self.strand, + self.reg_id, self.reg_text, cp_reads, self.seq) + + def merge(self, other_reg): + """Create a copy of this interval with the reads from this interval and `other_reg` + + Args: + other_reg (:class:`tombo.tombo_helper.intervalData`): a second region to merge with this interval + + Note: + Aside from reads, all other attributes will be taken from this interval + """ + merged_reg_data = self.copy() + self_reads = self.reads if self.reads is not None else [] + other_reads = other_reg.reads if other_reg.reads is not None else [] + return merged_reg_data.update(reads=self_reads + other_reads) + + def expand_interval(self, expand_width): + """Expand this interval by the specified amount (affects only the start and end attributes; NOT seq or reads) + + Args: + + expand_width (int): amount by which to expand the interval + """ + self.update(start=self.start - expand_width, + end=self.end + expand_width) + return self + + def add_reads(self, reads_index, require_full_span=False): + """Add reads overlapping this interval + + Args: + reads_index (:class:`tombo.tombo_helper.TomboReads`): reads index + require_full_span (bool): require that reads span the entire interval (default: include all reads overlapping the interval) + """ + def get_c_s_data(strand): + if require_full_span: + def read_intersects_interval(r_start, r_end): + return r_start <= self.start and r_end >= self.end + else: + def read_intersects_interval(r_start, r_end): + return not (r_start >= self.end or r_end <= self.start) + + # get all reads intersecting the interval + return [ + r_data for r_data in reads_index.get_cs_reads(self.chrm, strand) + if read_intersects_interval(r_data.start, r_data.end)] + + + # get all reads that overlap this interval + # note that this includes partial overlaps as these contribute + # to coverage and other statistics so can't really restrict to + # full coverage as previous versions of code did + if self.strand is not None: + return self.update(reads=get_c_s_data(self.strand)) + + # if strand is None, get data from both strands + return self.update(reads=get_c_s_data('+') + get_c_s_data('-')) + + def _update_seq(self, r_data, reg_base_data): + """Update the sequence for the region based on this read + """ + read_bases = get_single_slot_read_centric(r_data, 'base') + if read_bases is None: + warning_message( + 'Unable to extract data from read.
Potentially corrupted file ' + + 'or invalid Tombo index file for this directory.') + return reg_base_data, max(0, r_data.start - self.start) + r_seq = b''.join(read_bases).decode() + + if r_data.strand == '-': + r_seq = rev_comp(r_seq) + + # if read starts before the interval + if r_data.start <= self.start: + r_end_overlap = r_data.end - self.start + # if read covers the whole interval + if r_data.end > self.end: + r_end_clip = r_data.end - self.end + reg_base_data = r_seq[-r_end_overlap:-r_end_clip] + return reg_base_data, len(reg_base_data) + # end of read overlaps beginning of interval + reg_base_data[:r_end_overlap] = r_seq[-r_end_overlap:] + return reg_base_data, r_end_overlap + # read doesn't cover the beginning of region + if r_data.end > self.end: + # beginning of read covers to the end of the region + r_begin_overlap = self.end - r_data.start + reg_base_data[-r_begin_overlap:] = r_seq[:r_begin_overlap] + return reg_base_data, len(reg_base_data) + # first read is completely contained in the interval + r_len = r_data.end - r_data.start + r_int_start = r_data.start - self.start + reg_base_data[r_int_start:r_int_start + r_len] = r_seq + return reg_base_data, r_int_start + r_len + + def add_seq(self, genome_index=None): + """Extract the forward strand genomic sequence for an interval from reads or genome_index if provided + + Args: + genome_index (:class:`tombo.tombo_helper.Fasta`): Tombo FASTA sequence object + """ + if genome_index is not None: + return self.update(seq=genome_index.get_seq( + self.chrm, self.start, self.end)) + + # handle case where no read overlaps whole region + # let each read contribute its sequence and fill the rest + # with dashes + reg_base_data = ['-'] * (self.end - self.start) + if self.reads is None or len(self.reads) == 0: + return self.update(seq=''.join(reg_base_data)) + # get region sequence by moving through reads that + # cover the region, but only extract sequence from the + # (close to) minimal reads + s_reg_reads = sorted(self.reads, key=lambda r: (r.start, r.end)) + # begin filling sequence with first (by start pos) read + reg_base_data, curr_cov_pos = self._update_seq( + s_reg_reads.pop(0), reg_base_data) + # if there was only one read return now + if len(s_reg_reads) == 0 or curr_cov_pos >= self.end - self.start: + return self.update(seq=''.join(reg_base_data)) + + # get next read (by start pos) + curr_read = s_reg_reads.pop(0) + for next_read in s_reg_reads: + # once the next read start passes the region covered thus far + # add the sequence for the saved curr_read to the reg sequence + if next_read.start >= curr_cov_pos: + # add read with curr longest end position to the region seq + reg_base_data, curr_cov_pos = self._update_seq( + curr_read, reg_base_data) + curr_read = next_read + # if the whole interval is covered return the sequence + if curr_cov_pos >= self.end - self.start: + return self.update(seq=''.join(reg_base_data)) + continue + if next_read.end > curr_read.end: + curr_read = next_read + + reg_base_data, _ = self._update_seq(curr_read, reg_base_data) + + return self.update(seq=''.join(reg_base_data)) + + def get_base_levels(self, read_rows=False, num_reads=None): + """Extract base levels from this interval. + + Args: + read_rows (bool): return array with reads as rows (default: reads as columns) + num_reads (int): maximum number of reads to output;
randomly downsample if more reads are present (default: output all reads) + + Return: + `np.array` containing read mean levels with rows corresponding to interval position and columns to individual reads (or reverse if `read_rows`) + """ + def get_read_reg_events(r_data): + r_means = get_single_slot_genome_centric(r_data, 'norm_mean') + if r_means is None: return None + if r_data.start > self.start and r_data.end < self.end: + # handle reads that are contained in a region + start_overlap = self.end - r_data.start + end_overlap = r_data.end - self.start + # create region with nan values + r_reg_means = np.empty(self.end - self.start) + r_reg_means[:] = np.NAN + r_reg_means[-start_overlap:end_overlap] = r_means[ + -end_overlap:start_overlap] + elif r_data.start > self.start: + # handle reads that start in middle of region + start_overlap = self.end - r_data.start + # create region with nan values + r_reg_means = np.empty(self.end - self.start) + r_reg_means[:] = np.NAN + r_reg_means[-start_overlap:] = r_means[:start_overlap] + elif r_data.end < self.end: + # handle reads that end inside region + end_overlap = r_data.end - self.start + # create region with nan values + r_reg_means = np.empty(self.end - self.start) + r_reg_means[:] = np.NAN + r_reg_means[:end_overlap] = r_means[-end_overlap:] + else: + r_reg_means = r_means[ + self.start - r_data.start:self.end - r_data.start] + + return r_reg_means + + + if self.reads is None or len(self.reads) == 0: + raise TomboError( + 'Must annotate region with reads ' + + '(see `TomboInterval.add_reads`) to extract base levels.') + if num_reads is not None: + np.random.shuffle(self.reads) + reg_events = [] + for r_data in self.reads: + if self.strand is not None and r_data.strand != self.strand: + continue + r_means = get_read_reg_events(r_data) + if r_means is None: continue + reg_events.append(r_means) + if num_reads is not None and len(reg_events) >= num_reads: + break + + if read_rows: + return np.row_stack(reg_events) + return np.column_stack(reg_events) + + +def filter_empty_regions(plot_intervals): + num_filt = sum(len(p_int.reads) == 0 for p_int in plot_intervals) + plot_intervals = [p_int for p_int in plot_intervals if len(p_int.reads) > 0] + if len(plot_intervals) == 0: + error_message_and_exit('No reads in any selected regions.') + if VERBOSE and num_filt > 0: + warning_message('Some selected regions contain no reads.') + + return plot_intervals + +def get_unique_intervals(plot_intervals, covered_poss=None, num_regions=None): + # unique genomic regions filter + uniq_p_intervals = [] + used_intervals = defaultdict(set) + for reg_data in plot_intervals: + # could have significant region immediately next to + # beginning/end of reads + interval_poss = list(range(reg_data.start, reg_data.end)) + if reg_data.start not in used_intervals[( + reg_data.chrm, reg_data.strand)] and ( + covered_poss is None or all( + pos in covered_poss[(reg_data.chrm, reg_data.strand)] + for pos in interval_poss)): + uniq_p_intervals.append(reg_data) + used_intervals[(reg_data.chrm, reg_data.strand)].update( + interval_poss) + if num_regions is not None and len(uniq_p_intervals) >= num_regions: + break + + return uniq_p_intervals + + ########################################### ###### Special Data Access Functions ###### ########################################### def get_channel_info(fast5_data): - """ - Get channel information for a read + """Get channel information for a read """ try: - fast5_info = fast5_data['UniqueGlobalKey/channel_id'].attrs - except: - raise 
NotImplementedError("No channel_id group in HDF5 file. " + - "Probably mux scan HDF5 file.") + fast5_attrs = fast5_data['UniqueGlobalKey/channel_id'].attrs + except KeyError: + raise TomboError("No channel_id group in HDF5 file. " + + "Probably mux scan HDF5 file.") - channel_info = channelInfo( - fast5_info['offset'], fast5_info['range'], - fast5_info['digitisation'], fast5_info['channel_number'], - fast5_info['sampling_rate'].astype(np.int64)) + try: + channel_info = channelInfo( + fast5_attrs.get('offset'), fast5_attrs.get('range'), + fast5_attrs.get('digitisation'), fast5_attrs.get('channel_number'), + fast5_attrs.get('sampling_rate').astype(np.int64)) + except KeyError: + raise TomboError("Channel info parameters not available.") return channel_info def get_raw_signal(r_data, int_start, int_end): - """ - Extract raw signal from where this read overlaps a particular genomic region + """Extract raw signal from where this read overlaps a particular genomic region """ with h5py.File(r_data.fn, 'r') as fast5_data: # retrieve shift and scale computed in correction script @@ -1316,9 +1899,11 @@ def get_raw_signal(r_data, int_start, int_end): segs = np.concatenate([event_starts, [events_end,]]) scale_values = scaleValues( - corr_subgrp.attrs['shift'], corr_subgrp.attrs['scale'], - corr_subgrp.attrs['lower_lim'], corr_subgrp.attrs['upper_lim']) - all_sig = get_raw_read_slot(fast5_data)['Signal'].value + corr_subgrp.attrs.get('shift'), corr_subgrp.attrs.get('scale'), + corr_subgrp.attrs.get('lower_lim'), + corr_subgrp.attrs.get('upper_lim'), + corr_subgrp.attrs.get('outlier_threshold')) + all_sig = get_raw_read_slot(fast5_data)['Signal'][:] rsrtr = r_data.read_start_rel_to_raw if r_data.rna: @@ -1353,8 +1938,7 @@ def get_raw_signal(r_data, int_start, int_end): return r_sig, overlap_seg_data, start_offset, scale_values def parse_read_correction_data(r_data): - """ - Parse correction data from an event resquiggled read + """Parse correction data from an event resquiggled read """ try: with h5py.File(r_data.fn, 'r') as fast5_data: @@ -1365,25 +1949,26 @@ def parse_read_correction_data(r_data): new_segs = np.concatenate([event_starts, [events_end,]]) raw_grp = get_raw_read_slot(fast5_data) - read_id = raw_grp.attrs['read_id'] + read_id = raw_grp.attrs.get('read_id') try: read_id = read_id.decode() except (AttributeError, TypeError): pass - signal_data = raw_grp['Signal'].value + signal_data = raw_grp['Signal'][:] - raw_offset = events_grp.attrs['read_start_rel_to_raw'] - scale_values = scaleValues([ - corr_grp.attrs[attr_name] for attr_name in ( - 'shift', 'scale', 'lower_lim', 'upper_lim')]) + raw_offset = events_grp.attrs.get('read_start_rel_to_raw') + scale_values = scaleValues(*[ + corr_grp.attrs.get(attr_name) for attr_name in ( + 'shift', 'scale', 'lower_lim', 'upper_lim', + 'outlier_threshold')]) - old_segs = corr_grp['Alignment/read_segments'].value + old_segs = corr_grp['Alignment/read_segments'][:] old_align_vals = list(map( lambda x: x.decode(), - corr_grp['Alignment/read_alignment'].value)) + corr_grp['Alignment/read_alignment'][:])) new_align_vals = list(map( lambda x: x.decode(), - corr_grp['Alignment/genome_alignment'].value)) + corr_grp['Alignment/genome_alignment'][:])) except: return None @@ -1394,8 +1979,7 @@ def parse_read_correction_data(r_data): old_align_vals, new_align_vals, events_end, new_segs) def get_all_read_data(r_data): - """ - Extract most relevant read data from this read + """Extract most relevant read data from this read """ try: with h5py.File(r_data.fn, 'r') as 
fast5_data: @@ -1403,9 +1987,9 @@ def get_all_read_data(r_data): # and except the error that check if the slot exists first corr_subgrp = fast5_data['/Analyses/' + r_data.corr_group] algn_subgrp = dict(corr_subgrp['Alignment'].attrs.items()) - event_data = corr_subgrp['Events'].value + event_data = corr_subgrp['Events'][:] r_attrs = dict(corr_subgrp.attrs.items()) - all_sig = get_raw_read_slot(fast5_data)['Signal'].value + all_sig = get_raw_read_slot(fast5_data)['Signal'][:] except: # probably truncated file or Events slot doesn't exist return None @@ -1419,61 +2003,20 @@ def get_all_read_data(r_data): segs = np.concatenate([event_data['start'], [events_end,]]).astype(np.int64) return (r_means, r_seq, all_sig, segs, r_data.read_start_rel_to_raw, - r_attrs['norm_type'], r_attrs['outlier_threshold'], - genomeLoc(algn_subgrp['mapped_start'], algn_subgrp['mapped_strand'], - algn_subgrp['mapped_chrom'])) + r_attrs.get('norm_type'), r_attrs.get('outlier_threshold'), + genomeLocation( + algn_subgrp['mapped_start'], algn_subgrp['mapped_strand'], + algn_subgrp['mapped_chrom'])) -def get_coverage(raw_read_coverage): - """ - Get genome coverage for a set of reads - """ - if VERBOSE: _status_message('Calculating read coverage.') - read_coverage = {} - for (chrm, strand), reads_data in raw_read_coverage.items(): - if len(reads_data) == 0: continue - max_end = max(r_data.end for r_data in reads_data) - chrm_coverage = np.zeros(max_end, dtype=np.int64) - for r_data in reads_data: - chrm_coverage[r_data.start:r_data.end] += 1 - read_coverage[(chrm, strand)] = chrm_coverage - - return read_coverage - -def get_coverage_regions(raw_read_coverage, raw_read_coverage2=None): - """ - Get genome coverage for a set of reads - """ - if VERBOSE: _status_message('Calculating read coverage.') - all_chrm_strands = ( - raw_read_coverage.keys() if raw_read_coverage2 is None else - set(raw_read_coverage).union(raw_read_coverage2)) - for chrm, strand in sorted(all_chrm_strands): - if raw_read_coverage2 is None: - reads_data = raw_read_coverage[(chrm, strand)] - else: - reads_data = [] - if (chrm, strand) in raw_read_coverage: - reads_data += raw_read_coverage[(chrm, strand)] - if (chrm, strand) in raw_read_coverage2: - reads_data += raw_read_coverage2[(chrm, strand)] - - if len(reads_data) == 0: continue - max_end = max(r_data.end for r_data in reads_data) - cs_cov = np.zeros(max_end, dtype=np.int64) - for r_data in reads_data: - cs_cov[r_data.start:r_data.end] += 1 - - cs_cov_starts = np.concatenate([ - [0,], np.where(np.diff(cs_cov))[0] + 1, - [cs_cov.shape[0],]]) - cs_cov = cs_cov[cs_cov_starts[:-1]] - yield chrm, strand, cs_cov, cs_cov_starts +def get_reads_events(cs_reads): + """Extract read base levels split by genomic position - return + Args: -def get_reads_events(cs_reads): - """ - Extract read base levels split by genomic position + cs_reads (list): a list of reads from a single (chromosome, strand) + + Returns: + A dictionary with 0-based genomic positions pointing to read signal levels """ # note that this function assumes that all reads come from the same # chromosome and strand @@ -1508,169 +2051,6 @@ def get_reads_events(cs_reads): return cs_base_events -def update_seq(r_data, reg_base_data, int_start, int_end): - """ - Update the sequence for the region based on this read - """ - read_bases = get_single_slot_read_centric(r_data, 'base') - if read_bases is None: - _warning_message( - 'Unable to extract data from read. 
Potentially corrupted file ' + - 'or invalid Tombo index file for this directory.') - return reg_base_data, max(0, r_data.start - int_start) - r_seq = b''.join(read_bases).decode() - - if r_data.strand == '-': - r_seq = rev_comp(r_seq) - - # if read starts before the interval - if r_data.start <= int_start: - r_end_overlap = r_data.end - int_start - # if read covers the whole interval - if r_data.end > int_end: - r_end_clip = r_data.end - int_end - reg_base_data = r_seq[-r_end_overlap:-r_end_clip] - return reg_base_data, len(reg_base_data) - # end of read overlaps beginning of interval - reg_base_data[:r_end_overlap] = r_seq[-r_end_overlap:] - return reg_base_data, r_end_overlap - # read doesn't cover the beginning of region - if r_data.end > int_end: - # beginning of read covers to the end of the region - r_begin_overlap = int_end - r_data.start - reg_base_data[-r_begin_overlap:] = r_seq[:r_begin_overlap] - return reg_base_data, len(reg_base_data) - # first read is completely contained in the interval - r_len = r_data.end - r_data.start - r_int_start = r_data.start - int_start - reg_base_data[r_int_start:r_int_start + r_len] = r_seq - return reg_base_data, r_int_start + r_len - -def get_seq_from_reads(int_start, int_end, reg_reads): - """ - Extract the forward strand genomic sequence for an interval from - a set of reads - """ - # handle case where no read overlaps whole region - # let each read contibute its sequence and fill the rest - # with dashes - reg_base_data = ['-'] * (int_end - int_start) - if len(reg_reads) == 0: - return ''.join(reg_base_data) - # get region sequence by moving through reads that - # cover the region, but only extract seqeunce from the - # (close to) minimal reads - s_reg_reads = sorted(reg_reads, key=lambda r: (r.start, r.end)) - # begin filling sequence with first (by start pos) read - reg_base_data, curr_cov_pos = update_seq( - s_reg_reads.pop(0), reg_base_data, int_start, int_end) - # if there was only one read return now - if len(s_reg_reads) == 0 or curr_cov_pos >= int_end - int_start: - return ''.join(reg_base_data) - - # get next read (by start pos) - curr_read = s_reg_reads.pop(0) - for next_read in s_reg_reads: - # once the next read start passes the region covered thus far - # add the sequence for the saved curr_read to the reg sequence - if next_read.start >= curr_cov_pos: - # add read with curr longest end position to the region seq - reg_base_data, curr_cov_pos = update_seq( - curr_read, reg_base_data, int_start, int_end) - curr_read = next_read - # if the whole interval is covered return the sequence - if curr_cov_pos >= int_end - int_start: - return ''.join(reg_base_data) - continue - if next_read.end > curr_read.end: - curr_read = next_read - - reg_base_data, _ = update_seq( - curr_read, reg_base_data, int_start, int_end) - - return ''.join(reg_base_data) - -def add_reg_seq(all_reg_data): - """ - Add the region sequence to the region data by extraction from a minimal - set of reads - """ - all_reg_base_data = [] - for reg_data in all_reg_data: - # add text to each regions data - all_reg_base_data.append(reg_data._replace(seq=get_seq_from_reads( - reg_data.start, reg_data.end, reg_data.reads))) - - return all_reg_base_data - -def get_region_reads( - plot_intervals, raw_read_coverage, filter_no_cov=True, add_seq=True): - """ - Get all reads overlapping a set of intervals - """ - def get_c_s_data(chrm, strand, start, end): - # get all reads intersecting the interval - if (chrm, strand) in raw_read_coverage: - r_data = raw_read_coverage[(chrm, 
strand)][0]
-            return [
-                r_data for r_data in raw_read_coverage[(chrm, strand)]
-                if not (r_data.start >= end or r_data.end <= start)]
-        return []
-
-
-    all_reg_data = []
-    for int_i in plot_intervals:
-        # get all reads that overlap this interval
-        # note that this includes partial overlaps as these contribute
-        # to coverage and other statistics so can't really restrict to
-        # full coverage as previous versions of code did
-        if int_i.strand is None:
-            # if strand is None, get data from both strands
-            all_reg_data.append(int_i._replace(
-                reads=get_c_s_data(int_i.chrm, '+', int_i.start, int_i.end) +
-                get_c_s_data(int_i.chrm, '-', int_i.start, int_i.end)))
-        else:
-            all_reg_data.append(int_i._replace(
-                reads=get_c_s_data(int_i.chrm, int_i.strand,
-                                   int_i.start, int_i.end)))
-
-    if add_seq:
-        all_reg_data = add_reg_seq(all_reg_data)
-    if not filter_no_cov:
-        return all_reg_data
-
-    # filter out no coverage regions
-    all_reg_data = [
-        reg_data for reg_data in all_reg_data if len(reg_data.reads) > 0]
-
-    no_cov_regions = [
-        (len(reg_data.reads) == 0, unicode(reg_data.chrm) + ':' +
-         unicode(reg_data.start))
-        for reg_data in all_reg_data]
-    if any(no_cov[0] for no_cov in no_cov_regions):
-        _warning_message(
-            'No coverage in regions: ' + '; '.join([
-                reg for no_cov, reg in no_cov_regions if no_cov]))
-
-    return all_reg_data
-
-def get_region_sequences(
-        plot_intervals, raw_read_coverage1, raw_read_coverage2=None):
-    """
-    Get the sequence for a set of intervals from a set of reads
-    """
-    all_reg_data = get_region_reads(
-        plot_intervals, raw_read_coverage1, filter_no_cov=False, add_seq=False)
-    if raw_read_coverage2 is not None:
-        all_reg_data2 = get_region_reads(
-            plot_intervals, raw_read_coverage2, filter_no_cov=False,
-            add_seq=False)
-        all_reg_data = [r1._replace(reads=r1.reads + r2.reads)
-                        for r1, r2 in zip(all_reg_data, all_reg_data2)]
-    all_reg_data = add_reg_seq(all_reg_data)
-
-    return all_reg_data
-

###################################
###### FAST5 Write Functions ######
@@ -1678,25 +2058,24 @@ def get_region_sequences(
 def prep_fast5(fast5_fn, corr_grp, overwrite, in_place,
                bc_grp=None, return_fp=False):
-    """
-    Prepare a read for re-squiggle processing (This deletes old re-squiggle
-    info for this read)
+    """Prepare a read for re-squiggle processing (This deletes old re-squiggle info for this read)
     """
     def try_close_prep_err(fast5_data, err_str):
         try:
             fast5_data.close()
         except:
             pass
-        return (err_str, fast5_fn)
+        # is_tombo_error = True
+        return err_str, fast5_fn, True

     # several checks to prepare the FAST5 file for correction before
     # processing to save compute
     if not in_place:
         return ('Not currently implementing new hdf5 file writing',
-                fast5_fn)
+                fast5_fn, True)
     # check that the file is writeable before trying to correct
     if not os.access(fast5_fn, os.W_OK):
-        return ('FAST5 file is not writable', fast5_fn)
+        return 'FAST5 file is not writable', fast5_fn, True

     try:
         # create group to store data
@@ -1718,7 +2097,7 @@ def try_close_prep_err(fast5_data, err_str):
             corr_grp_ptr = analyses_grp[corr_grp]
             if not overwrite:
                 return try_close_prep_err(
-                    fast5_data, "Tombo data exsists in [--corrected-group] " +
+                    fast5_data, "Tombo data exists in [--corrected-group] " +
                     "and [--overwrite] is not set")
             del analyses_grp[corr_grp]
         except:
@@ -1730,8 +2109,8 @@ def try_close_prep_err(fast5_data, err_str):
         corr_grp.attrs['tombo_version'] = TOMBO_VERSION
         corr_grp.attrs['basecall_group'] = bc_grp
     except:
-        return (
-            'Error opening or writing to fast5 file', fast5_fn)
+        return
'Error opening or writing to fast5 file', fast5_fn, True if return_fp: return fast5_data @@ -1739,21 +2118,19 @@ def try_close_prep_err(fast5_data, err_str): try: fast5_data.close() except: - return 'Error closing fast5 file', fast5_fn + return 'Error closing fast5 file', fast5_fn, True return -def write_error_status( - filename, corrected_group, basecall_subgroup, error_text): - """ - Write error message for a read into the FAST5 file +def write_error_status(fn, corr_grp, bc_subgrp, error_text): + """Write error message for a read into the FAST5 file """ - with h5py.File(filename, 'r+') as fast5_data: + with h5py.File(fn, 'r+') as fast5_data: analysis_grp = fast5_data['/Analyses'] - corr_grp = analysis_grp[corrected_group] - if basecall_subgroup is not None: + corr_grp = analysis_grp[corr_grp] + if bc_subgrp is not None: # add subgroup matching subgroup from original basecalls - corr_subgrp = corr_grp.create_group(basecall_subgroup) + corr_subgrp = corr_grp.create_group(bc_subgrp) corr_subgrp.attrs['status'] = error_text else: corr_grp.attrs['status'] = error_text @@ -1761,20 +2138,17 @@ def write_error_status( return def write_new_fast5_group( - fast5_data, genome_location, read_start_rel_to_raw, - new_segs, align_seq, norm_signal, scale_values, corrected_group, - basecall_subgroup, norm_type, outlier_thresh, compute_sd, - alignVals=None, align_info=None, old_segs=None, rna=False, - sig_match_score=None): - """ - Write new fast5 group with re-squiggle data + fast5_data, corr_grp_slot, rsqgl_res, norm_type, compute_sd, + alignVals=None, old_segs=None, rna=False): + """Write new fast5 group with re-squiggle data """ try: # compute event data before accessing fast5 file if compute_sd: - norm_means, norm_stds = c_new_mean_stds(norm_signal, new_segs) + norm_means, norm_stds = c_new_mean_stds( + rsqgl_res.raw_signal, rsqgl_res.segs) else: - norm_means = c_new_means(norm_signal, new_segs) + norm_means = c_new_means(rsqgl_res.raw_signal, rsqgl_res.segs) norm_stds = repeat(np.NAN) # had to shift to names formats numpy array specification due to @@ -1782,7 +2156,8 @@ def write_new_fast5_group( # https://github.com/numpy/numpy/issues/2407 event_data = np.array( list(zip(norm_means, norm_stds, - new_segs[:-1], np.diff(new_segs), list(align_seq))), + rsqgl_res.segs[:-1], np.diff(rsqgl_res.segs), + list(rsqgl_res.genome_seq))), dtype=[(str('norm_mean'), 'f8'), (str('norm_stdev'), 'f8'), (str('start'), 'u4'), (str('length'), 'u4'), (str('base'), 'S1')]) @@ -1794,7 +2169,7 @@ def write_new_fast5_group( np_genome_align = np.chararray(len(alignVals)) np_genome_align[:] = g_align_vals except: - raise NotImplementedError('Error computing new events') + raise TomboError('Error computing new events') do_close = False if not isinstance(fast5_data, h5py.File): @@ -1802,7 +2177,7 @@ def write_new_fast5_group( fast5_data = h5py.File(fast5_data, 'r+') do_close = True except: - raise NotImplementedError( + raise TomboError( 'Error opening file for new group writing. This should ' + 'have been caught during the alignment phase. 
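
As an aside, the h5py write pattern used throughout this function (scalar attributes on an ``Analyses`` subgroup plus a gzip-compressed ``Events`` table) looks roughly like the following stand-alone sketch; the file name, group path, and values are illustrative only, not the patch's actual slot layout::

    import h5py
    import numpy as np

    # illustrative event table mirroring the dtype written by this function
    event_data = np.array(
        [(0.1, 0.02, 0, 5, b'A'), (-0.3, 0.04, 5, 7, b'C')],
        dtype=[('norm_mean', 'f8'), ('norm_stdev', 'f8'),
               ('start', 'u4'), ('length', 'u4'), ('base', 'S1')])

    with h5py.File('example.fast5', 'w') as fast5_data:
        # create_group builds intermediate groups along the path
        corr_subgrp = fast5_data.create_group(
            'Analyses/Example_000/BaseCalled_template')
        corr_subgrp.attrs['status'] = 'success'
        corr_subgrp.attrs['shift'] = 0.0
        corr_subgrp.attrs['scale'] = 1.0
        corr_events = corr_subgrp.create_dataset(
            'Events', data=event_data, compression='gzip')
        corr_events.attrs['read_start_rel_to_raw'] = 42
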
Check that ' + 'there are no other tombo processes or processes ' + @@ -1810,35 +2185,50 @@ def write_new_fast5_group( try: analysis_grp = fast5_data['/Analyses'] - corr_grp = analysis_grp[corrected_group] + corr_grp = analysis_grp[corr_grp_slot] # add subgroup matching subgroup from original basecalls - corr_subgrp = corr_grp.create_group(basecall_subgroup) + corr_subgrp = corr_grp.create_group(rsqgl_res.align_info.Subgroup) corr_subgrp.attrs['status'] = 'success' + # TODO change to rev_sig corr_subgrp.attrs['rna'] = rna - if sig_match_score is not None: - corr_subgrp.attrs['signal_match_score'] = sig_match_score - corr_subgrp.attrs['shift'] = scale_values.shift - corr_subgrp.attrs['scale'] = scale_values.scale - corr_subgrp.attrs['lower_lim'] = scale_values.lower_lim - corr_subgrp.attrs['upper_lim'] = scale_values.upper_lim + if rsqgl_res.sig_match_score is not None: + corr_subgrp.attrs[ + 'signal_match_score'] = rsqgl_res.sig_match_score + corr_subgrp.attrs['shift'] = rsqgl_res.scale_values.shift + corr_subgrp.attrs['scale'] = rsqgl_res.scale_values.scale corr_subgrp.attrs['norm_type'] = norm_type - corr_subgrp.attrs['outlier_threshold'] = outlier_thresh + if rsqgl_res.scale_values.lower_lim is not None: + corr_subgrp.attrs[ + 'lower_lim'] = rsqgl_res.scale_values.lower_lim + if rsqgl_res.scale_values.upper_lim is not None: + corr_subgrp.attrs[ + 'upper_lim'] = rsqgl_res.scale_values.upper_lim + if rsqgl_res.scale_values.outlier_thresh is not None: + corr_subgrp.attrs[ + 'outlier_threshold'] = rsqgl_res.scale_values.outlier_thresh # store alignment statistics corr_alignment = corr_subgrp.create_group('Alignment') - corr_alignment.attrs['mapped_start'] = genome_location.Start - corr_alignment.attrs['mapped_end'] \ - = genome_location.Start + len(new_segs) - 1 - corr_alignment.attrs['mapped_strand'] = genome_location.Strand - corr_alignment.attrs['mapped_chrom'] = genome_location.Chrom - - if align_info is not None: - corr_alignment.attrs['clipped_bases_start'] = align_info.ClipStart - corr_alignment.attrs['clipped_bases_end'] = align_info.ClipEnd - corr_alignment.attrs['num_insertions'] = align_info.Insertions - corr_alignment.attrs['num_deletions'] = align_info.Deletions - corr_alignment.attrs['num_matches'] = align_info.Matches - corr_alignment.attrs['num_mismatches'] = align_info.Mismatches + corr_alignment.attrs['mapped_start'] = rsqgl_res.genome_loc.Start + corr_alignment.attrs[ + 'mapped_end'] = rsqgl_res.genome_loc.Start + len(rsqgl_res.segs) - 1 + corr_alignment.attrs[ + 'mapped_strand'] = rsqgl_res.genome_loc.Strand + corr_alignment.attrs['mapped_chrom'] = rsqgl_res.genome_loc.Chrom + + if rsqgl_res.align_info is not None: + corr_alignment.attrs[ + 'clipped_bases_start'] = rsqgl_res.align_info.ClipStart + corr_alignment.attrs[ + 'clipped_bases_end'] = rsqgl_res.align_info.ClipEnd + corr_alignment.attrs[ + 'num_insertions'] = rsqgl_res.align_info.Insertions + corr_alignment.attrs[ + 'num_deletions'] = rsqgl_res.align_info.Deletions + corr_alignment.attrs[ + 'num_matches'] = rsqgl_res.align_info.Matches + corr_alignment.attrs[ + 'num_mismatches'] = rsqgl_res.align_info.Mismatches if alignVals is not None: corr_alignment.create_dataset( @@ -1854,531 +2244,21 @@ def write_new_fast5_group( corr_events = corr_subgrp.create_dataset( 'Events', data=event_data, compression="gzip") corr_events.attrs[ - 'read_start_rel_to_raw'] = read_start_rel_to_raw + 'read_start_rel_to_raw'] = rsqgl_res.read_start_rel_to_raw except: - raise - raise NotImplementedError( + raise TomboError( 'Error writing 
resquiggle information back into fast5 file.') if do_close: try: fast5_data.close() except: - raise NotImplementedError( + raise TomboError( 'Error closing fast5 file after writing resquiggle information.') return -#################################### -###### Annotate Raw Functions ###### -#################################### - -def _prep_fast5_for_fastq(fast5_data, bc_grp_name, bc_subgrp_name, overwrite): - try: - read_id = get_raw_read_slot(fast5_data).attrs['read_id'] - try: - read_id = read_id.decode() - except (AttributeError, TypeError): - pass - except: - return None - - # if Analyses group doesn't exist yet, create it - try: - analyses_grp = fast5_data['/Analyses'] - except: - analyses_grp = fast5_data.create_group('Analyses') - - # create Fastq slot, unless value exists and --overwrite is not set - try: - bc_grp = analyses_grp[bc_grp_name] - bc_subgrp = analyses_grp[bc_subgrp_name] - except: - try: - bc_grp = analyses_grp.create_group(bc_grp_name) - bc_subgrp = bc_grp.create_group(bc_subgrp_name) - except: - if overwrite: - del analyses_grp[bc_grp_name] - bc_grp = analyses_grp.create_group(bc_grp_name) - bc_subgrp = bc_grp.create_group(bc_subgrp_name) - else: - raise NotImplementedError( - bc_grp_name + ' exists and --overwrite is not set.') - - return read_id - -def _annotate_with_fastqs_worker( - fastq_rec_q, fast5s_read_ids, fastq_slot, fq_slot_prepped, - prog_q, warn_q, bc_grp_name, bc_subgrp_name, overwrite): - been_warned = dict((warn_code, False) for warn_code in _WARN_CODES) - num_recs_proc = 0 - while True: - fastq_rec = fastq_rec_q.get() - if fastq_rec is None: - break - - # extract read_id from fastq (which should be the first text after - # the "@" record delimiter up to the first white space or underscore - read_id = fastq_rec[0].split()[0].split('_')[0][1:] - if read_id not in fast5s_read_ids: - if not been_warned[_WARN_ID_VAL]: - been_warned[_WARN_ID_VAL] = True - warn_q.put(_WARN_ID_VAL) - continue - - try: - with h5py.File(fast5s_read_ids[read_id], 'r+') as fast5_data: - if not fq_slot_prepped: - try: - file_parsed_id = _prep_fast5_for_fastq( - fast5_data, bc_grp_name, bc_subgrp_name, overwrite) - except NotImplementedError: - if not been_warned[_WARN_OVRWRT_VAL]: - been_warned[_WARN_OVRWRT_VAL] = True - warn_q.put(_WARN_OVRWRT_VAL) - continue - if read_id != file_parsed_id: - if not been_warned[_WARN_MISMATCH_VAL]: - been_warned[_WARN_MISMATCH_VAL] = True - warn_q.put(_WARN_MISMATCH_VAL) - continue - bc_slot = fast5_data[fastq_slot] - # add sequence to fastq slot - bc_slot.create_dataset( - 'Fastq', data=''.join(fastq_rec), - dtype=h5py.special_dtype(vlen=unicode)) - - # progress q update - num_recs_proc += 1 - if num_recs_proc % _PROC_UPDATE_INTERVAL == 0: - prog_q.put(_PROC_UPDATE_INTERVAL) - except: - if not been_warned[_WARN_IO_VAL]: - been_warned[_WARN_IO_VAL] = True - warn_q.put(_WARN_IO_VAL) - continue - - # add last number of records reported from this process - prog_q.put(num_recs_proc % _PROC_UPDATE_INTERVAL) - - return - -def _feed_seq_records_worker(fastq_fns, fastq_rec_q): - for fastq_fn in fastq_fns: - n_recs = 0 - with io.open(fastq_fn) as fastq_fp: - while True: - fastq_rec = list(islice(fastq_fp, 4)) - # if record contains fewer than 4 lines this indicates the - # EOF, so move to next file - if len(fastq_rec) != 4: break - # if sequence identifier line does not start with "@" or quality - # score line does not start with a "+" the file may be - # corrupted, so don't process any more records - if (re.match('@', fastq_rec[0]) is None or - 
re.match('\+', fastq_rec[2]) is None): - _warning_message( - 'Successfully parsed ' + unicode(n_recs) + - ' FASTQ records from ' + fastq_fn + ' before ' + - 'encountering an invalid record. The rest of ' + - 'this file will not be processed.') - break - n_recs += 1 - fastq_rec_q.put(fastq_rec) - - return - -def _get_ann_queues(prog_q, warn_q, been_warned): - iter_added = 0 - while True: - try: - iter_added += prog_q.get(block=False) - except queue.Empty: - break - - while True: - try: - warn_val = warn_q.get(block=False) - except queue.Empty: - break - - if warn_val == _WARN_ID_VAL: - if not been_warned[_WARN_ID_VAL]: - _warning_message( - 'Some records contain read identifiers not found in ' + - 'any FAST5 files or sequencing summary files.') - been_warned[_WARN_ID_VAL] = True - elif warn_val == _WARN_IO_VAL: - if not been_warned[_WARN_IO_VAL]: - _warning_message( - 'Some read files that could not be accessed.') - been_warned[_WARN_IO_VAL] = True - elif warn_val == _WARN_MISMATCH_VAL: - if not been_warned[_WARN_MISMATCH_VAL]: - _warning_message( - 'Read ID found in sequencing summary and FAST5 ' + - 'file are discordant. Skipping read.') - been_warned[_WARN_MISMATCH_VAL] = True - elif warn_val == _WARN_OVRWRT_VAL: - if not been_warned[_WARN_OVRWRT_VAL]: - _warning_message( - 'Basecalls exsit in specified slot for some reads. ' + - 'Set --overwrite option to overwrite these basecalls.') - been_warned[_WARN_OVRWRT_VAL] = True - else: - _warning_message('Invalid wanring code encountered.') - - return iter_added, been_warned - -def _annotate_with_fastqs( - fastq_fns, fast5s_read_ids, fastq_slot, fq_slot_prepped, num_processes, - bc_grp_name, bc_subgrp_name, overwrite): - if VERBOSE: _status_message('Annotating FAST5s with sequence from FASTQs.') - fastq_rec_q = Queue(maxsize=_MAX_FASTQ_QUEUE_SIZE) - # open a single process to read fastq files and feed the fastq record queue - fq_feed_p = Process(target=_feed_seq_records_worker, - args=(fastq_fns, fastq_rec_q)) - fq_feed_p.start() - - # open fast5 annotation processes - prog_q = Queue() - warn_q = Queue() - ann_args = (fastq_rec_q, fast5s_read_ids, fastq_slot, fq_slot_prepped, - prog_q, warn_q, bc_grp_name, bc_subgrp_name, overwrite) - ann_ps = [] - for p_id in range(num_processes): - p = Process(target=_annotate_with_fastqs_worker, args=ann_args) - p.start() - ann_ps.append(p) - - if VERBOSE: bar = tqdm(total=len(fast5s_read_ids), smoothing=0) - - total_added_seqs = 0 - been_warned = dict((warn_code, False) for warn_code in _WARN_CODES) - # process progress and warn queues until fastq filler process runs out of - # files/records - while fq_feed_p.is_alive(): - iter_added, been_warned = _get_ann_queues(prog_q, warn_q, been_warned) - total_added_seqs += iter_added - if VERBOSE: bar.update(iter_added) - sleep(0.01) - - # put none records to trigger annotation processes to exit - for _ in range(num_processes): - fastq_rec_q.put(None) - - # process the rest of the records - while any(p.is_alive() for p in ann_ps) or not prog_q.empty(): - iter_added, been_warned = _get_ann_queues(prog_q, warn_q, been_warned) - total_added_seqs += iter_added - if VERBOSE: bar.update(iter_added) - sleep(0.01) - if VERBOSE: bar.close() - - if VERBOSE: _status_message('Added sequences to a total of ' + - str(total_added_seqs) + ' reads.') - - return - -def _prep_fastq_slot_worker( - fast5_q, bc_grp, bc_subgrp, overwrite, read_ids_q, prog_q, warn_q): - num_files_proc = 0 - been_warned_overwrite = False - while not fast5_q.empty(): - try: - fast5_fn = 
fast5_q.get(block=False) - except queue.Empty: - sleep(0.1) - continue - - # None entry indicates that the fast5s queue is complete - if fast5_fn is None: - break - - num_files_proc += 1 - if num_files_proc % _PROC_UPDATE_INTERVAL == 0: - prog_q.put(_PROC_UPDATE_INTERVAL) - - try: - with h5py.File(fast5_fn) as fast5_data: - try: - read_id = _prep_fast5_for_fastq( - fast5_data, bc_grp, bc_subgrp, overwrite) - except NotImplementedError: - if not been_warned_overwrite: - been_warned_overwrite = True - warn_q.put(_WARN_OVRWRT_VAL) - continue - except: - continue - if read_id is None: - continue - - read_ids_q.put((read_id, fast5_fn)) - - prog_q.put(num_files_proc % _PROC_UPDATE_INTERVAL) - - return - -def _get_prep_queue(read_ids_q, prog_q, warn_q, fast5s_read_ids, been_warned): - """ - Process all records from all fast5 prep queues - """ - # only process up to _ITER_QUEUE_LIMIT items each iteration - iter_processed = 0 - while True: - try: - read_id, fast5_fn = read_ids_q.get(block=False) - except queue.Empty: - break - iter_processed += 1 - if iter_processed > _ITER_QUEUE_LIMIT: break - - if read_id in fast5s_read_ids: - if not been_warned[_WARN_UNIQ_VAL]: - _warning_message( - 'Multiple FAST5 files contain the same read identifiers. ' + - 'Ensure that FAST5 files are from a single run.') - been_warned[_WARN_UNIQ_VAL] = True - continue - fast5s_read_ids[read_id] = fast5_fn - - while True: - try: - warn_val = warn_q.get(block=False) - except queue.Empty: - break - if warn_val == _WARN_OVRWRT_VAL: - if not been_warned[_WARN_OVRWRT_VAL]: - _warning_message( - 'Basecalls exsit in specified slot for some reads. ' + - 'Set --overwrite option to overwrite these basecalls.') - been_warned[_WARN_OVRWRT_VAL] = True - else: - _warning_message('Invalid wanring code encountered.') - - iter_prog = 0 - while True: - try: - iter_prog += prog_q.get(block=False) - except queue.Empty: - break - - return fast5s_read_ids, iter_prog, been_warned - -def _fill_files_queue(fast5_q, fast5_fns, num_ps): - for fast5_fn in fast5_fns: - fast5_q.put(fast5_fn) - for _ in range(num_ps): - fast5_q.put(None) - - return - -def _get_read_ids_and_prep_fastq_slot( - fast5s_dir, bc_grp, bc_subgrp, overwrite, num_processes): - """ - Extract read id from /Raw group and prep fastq slots for annotation with - associated FASTQ files. 
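
The prep workers above follow a standard producer/consumer layout: one filler process feeds a bounded queue and the workers exit when they see a ``None`` sentinel. A generic, runnable sketch of that pattern, with illustrative names (this is not the patch's own code)::

    from multiprocessing import Process, Queue

    def _worker(work_q, out_q):
        while True:
            item = work_q.get()
            # None is the sentinel signalling that no more work will arrive
            if item is None:
                break
            out_q.put(item * 2)

    if __name__ == '__main__':
        num_ps = 4
        work_q, out_q = Queue(maxsize=100), Queue()
        workers = [Process(target=_worker, args=(work_q, out_q))
                   for _ in range(num_ps)]
        for p in workers:
            p.start()
        for i in range(10):
            work_q.put(i)
        # one sentinel per worker so every process can exit cleanly
        for _ in range(num_ps):
            work_q.put(None)
        # drain results before joining to avoid blocking on full pipes
        results = [out_q.get() for _ in range(10)]
        for p in workers:
            p.join()
        print(sorted(results))
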
- """ - if VERBOSE: _status_message( - 'Preparing reads and extracting read identifiers.') - fast5_q = Queue(maxsize=MAX_QUEUE_SIZE) - read_ids_q = Queue() - prog_q = Queue() - warn_q = Queue() - - fast5_fns = get_files_list(fast5s_dir) - num_fast5s = len(fast5_fns) - files_p = Process(target=_fill_files_queue, - args=(fast5_q, fast5_fns, num_processes)) - files_p.daemon = True - files_p.start() - - prep_args = (fast5_q, bc_grp, bc_subgrp, overwrite, read_ids_q, - prog_q, warn_q) - prep_ps = [] - for p_id in range(num_processes): - p = Process(target=_prep_fastq_slot_worker, args=prep_args) - p.start() - prep_ps.append(p) - - fast5s_read_ids = {} - # Warn non-unique read_ids in directory - been_warned = dict((warn_code, False) for warn_code in _WARN_CODES_PREP) - if VERBOSE: bar = tqdm(total=num_fast5s, smoothing=0) - while any(p.is_alive() for p in prep_ps): - fast5s_read_ids, iter_prog, been_warned = _get_prep_queue( - read_ids_q, prog_q, warn_q, fast5s_read_ids, been_warned) - if VERBOSE: bar.update(iter_prog) - sleep(0.01) - - fast5s_read_ids, iter_prog, been_warned = _get_prep_queue( - read_ids_q, prog_q, warn_q, fast5s_read_ids, been_warned) - if VERBOSE: bar.update(iter_prog) - if VERBOSE: bar.close() - - return fast5s_read_ids - -def _parse_sequencing_summary_files(fast5s_dir, seq_summary_fns): - if VERBOSE: _status_message('Getting read filenames.') - full_fast5_fns = {} - # walk through directory structure searching for fast5 files - for root, _, fns in os.walk(fast5s_dir): - for fn in fns: - if not fn.endswith('.fast5'): continue - full_fast5_fns[fn] = os.path.join(root, fn) - - if VERBOSE: _status_message('Parsing sequencing summary files.') - fast5s_read_ids = {} - been_warned = False - for seq_summary_fn in seq_summary_fns: - with open(seq_summary_fn) as fp: - try: - header_fields = fp.readline().split() - fn_field = next(i for i, h_field in enumerate(header_fields) - if re.match(_SEQ_SUMMARY_FN_FIELD, h_field)) - id_field = next(i for i, h_field in enumerate(header_fields) - if re.match(_SEQ_SUMMARY_ID_FIELD, h_field)) - except: - _warning_message( - 'Could not extract header information for sequencing ' + - 'summary file: ' + seq_summary_fn) - continue - try: - for line in fp: - rec_fields = line.split() - rec_short_fn = rec_fields[fn_field] - try: - rec_full_fn = full_fast5_fns[rec_short_fn] - except KeyError: - if not been_warned: - _warning_message( - 'Some records from sequencing summaries ' + - 'do not appear to have a matching file.') - been_warned = True - continue - # convert filename to full filename and link to read id - fast5s_read_ids[rec_fields[id_field]] = rec_full_fn - except: - _warning_message( - 'Error parsing records for sequencing ' + - 'summary file: ' + seq_summary_fn) - - return fast5s_read_ids - - -################################### -###### Filter Main Functions ###### -################################### - -def _clear_filters_main(args): - for fast5s_dir in args.fast5_basedirs: - clear_filters(fast5s_dir, args.corrected_group) - - return - -def _filter_stuck_main(args): - obs_filter = parse_obs_filter(args.obs_per_base_filter) - for fast5s_dir in args.fast5_basedirs: - filter_reads_for_stuck(fast5s_dir, args.corrected_group, obs_filter) - - return - -def _filter_coverage_main(args): - if not 0 < args.percent_to_filter < 100: - _error_message_and_exit( - '--percent-to-filter must be between 0 and 100.') - - for fast5s_dir in args.fast5_basedirs: - filter_reads_for_coverage( - fast5s_dir, args.corrected_group, args.percent_to_filter / 100.0) - - 
return - -def _filter_q_score_main(args): - if not 0 < args.q_score < 40: - _error_message_and_exit('--q-score must be between 0 and 40.') - - for fast5s_dir in args.fast5_basedirs: - filter_reads_for_qscore( - fast5s_dir, args.basecall_group, args.corrected_group, args.q_score) - - return - -def _filter_signal_matching_main(args): - if not 0 < args.signal_matching_score < 10: - _error_message_and_exit( - '--signal-matching-score must be between 0 and 10.') - - for fast5s_dir in args.fast5_basedirs: - filter_reads_for_signal_matching( - fast5s_dir, args.corrected_group, args.signal_matching_score) - - return - -def _filter_genome_pos_main(args): - include_regs = parse_genome_regions(args.include_regions) - - for fast5s_dir in args.fast5_basedirs: - filter_reads_for_genome_pos( - fast5s_dir, args.corrected_group, include_regs) - - return - -def _filter_main(args): - global VERBOSE - VERBOSE = not args.quiet - - if args.action_command == 'clear_filters': - _clear_filters_main(args) - elif args.action_command == 'genome_locations': - _filter_genome_pos_main(args) - elif args.action_command == 'stuck': - _filter_stuck_main(args) - elif args.action_command == 'level_coverage': - _filter_coverage_main(args) - elif args.action_command == 'q_score': - _filter_q_score_main(args) - elif args.action_command == 'raw_signal_matching': - _filter_signal_matching_main(args) - else: - _error_message_and_exit('Invalid Tombo filter command.') - - return - - -################################## -###### Annotate FAST5s Main ###### -################################## - -def _annotate_reads_with_fastq_main(args): - global VERBOSE - VERBOSE = not args.quiet - - fast5s_basedir = ( - args.fast5_basedir if args.fast5_basedir.endswith('/') else - args.fast5_basedir + '/') - if args.sequencing_summary_filenames: - fast5s_read_ids = _parse_sequencing_summary_files( - fast5s_basedir, args.sequencing_summary_filenames) - fq_slot_prepped = False - else: - fast5s_read_ids = _get_read_ids_and_prep_fastq_slot( - fast5s_basedir, args.basecall_group, args.basecall_subgroup, - args.overwrite, args.processes) - fq_slot_prepped = True - fastq_slot = '/'.join(('/Analyses', args.basecall_group, - args.basecall_subgroup)) - _annotate_with_fastqs( - args.fastq_filenames, fast5s_read_ids, fastq_slot, fq_slot_prepped, - args.processes, args.basecall_group, args.basecall_subgroup, - args.overwrite) - - return - - if __name__ == '__main__': - raise NotImplementedError( - 'This is a module. See commands with `tombo -h`') + sys.stderr.write('This is a module. See commands with `tombo -h`') + sys.exit(1) diff --git a/tombo/tombo_models/tombo.RNA.180mV.model b/tombo/tombo_models/tombo.RNA.180mV.model index de45c62..0c3f16b 100644 Binary files a/tombo/tombo_models/tombo.RNA.180mV.model and b/tombo/tombo_models/tombo.RNA.180mV.model differ diff --git a/tombo/tombo_models/tombo.RNA.5mC.model b/tombo/tombo_models/tombo.RNA.5mC.model index 61b3cbf..3ba549b 100644 Binary files a/tombo/tombo_models/tombo.RNA.5mC.model and b/tombo/tombo_models/tombo.RNA.5mC.model differ diff --git a/tombo/tombo_stats.py b/tombo/tombo_stats.py index d3a64b8..1749353 100644 --- a/tombo/tombo_stats.py +++ b/tombo/tombo_stats.py @@ -20,6 +20,7 @@ from tqdm import tqdm from time import sleep +from copy import deepcopy from operator import itemgetter from scipy import stats, optimize from collections import defaultdict @@ -35,28 +36,45 @@ # import tombo functions from . 
import tombo_helper as th -from .c_helper import c_mean_std, c_apply_outlier_thresh, c_new_means, \ - c_calc_llh_ratio, c_calc_llh_ratio_const_var, \ - c_calc_scaled_llh_ratio_const_var +from ._c_helper import ( + c_mean_std, c_apply_outlier_thresh, c_new_means, c_calc_llh_ratio, + c_calc_llh_ratio_const_var, c_calc_scaled_llh_ratio_const_var, + c_new_mean_stds, c_compute_running_pctl_diffs, c_compute_slopes) + from ._default_parameters import ( SMALLEST_PVAL, MIN_POSITION_SD, STANDARD_MODELS, ALTERNATE_MODELS, MIN_KMER_OBS_TO_EST, ALT_EST_BATCH, MAX_KMER_OBS, NUM_DENS_POINTS, LLR_THRESH, SAMP_COMP_THRESH, DE_NOVO_THRESH, KERNEL_DENSITY_RANGE, ROC_PLOT_POINTS, NANOPOLISH_CENTRAL_POS, NUM_READS_FOR_SCALE, ROBUST_QUANTS, MAX_POINTS_FOR_THEIL_SEN, NUM_READS_TO_ADJUST_MODEL, - OCLLHR_SCALE, OCLLHR_HEIGHT, OCLLHR_POWER) + OCLLHR_SCALE, OCLLHR_HEIGHT, OCLLHR_POWER, FM_OFFSET_DEFAULT, + MOST_SIGNIF_NUM_BATCHES_DEFAULT, DNA_SAMP_TYPE, RNA_SAMP_TYPE, + MEAN_PRIOR_CONST, SD_PRIOR_CONST, ALGN_PARAMS_TABLE, SEG_PARAMS_TABLE, + MIN_EVENT_TO_SEQ_RATIO, COLLAPSE_RNA_STALLS, STALL_PARAMS, + OUTLIER_THRESH, + RNA_SCALE_NUM_EVENTS, RNA_SCALE_MAX_FRAC_EVENTS, USE_RNA_EVENT_SCALE) +DEFAULT_STALL_PARAMS=th.stallParams(**STALL_PARAMS) + +# list of classes/functions to include in API +__all__ = [ + 'TomboStats', 'PerReadStats', 'TomboModel', + 'normalize_raw_signal', 'compute_base_means', 'get_read_seg_score', + 'get_ref_from_seq', 'calc_kmer_fitted_shift_scale', + 'load_resquiggle_parameters', 'compute_num_events'] -VERBOSE = False + +VERBOSE = True _PROFILE_SIGNIF = False _PROFILE_EST_REF = False +_PROFILE_CENTER_REF = False _PROFILE_ALT_EST = False _DEBUG_EST_STD = False _DEBUG_EST_BW = 0.05 _DEBUG_EST_NUM_KMER_SAVE = 500 -PER_READ_BLOCKS_QUEUE_LIMIT = 5 +STAT_BLOCKS_QUEUE_LIMIT = 5 DNA_BASES = ['A','C','G','T'] @@ -77,31 +95,50 @@ # assume constant SD in model to save on computation CONST_SD_MODEL = True +STAT_BLOCKS_H5_NAME = 'Statistic_Blocks' +MOST_SIGNIF_H5_NAME = 'Most_Significant_Stats' +COV_DAMP_COUNTS_H5_NAME = 'Cov_Damp_Counts' +COV_THRESH_H5_NAME = 'Cov_Threshold' + +# turned off by default (and not accessible via command line so hardcoded for now) +DEFAULT_TRIM_RNA_PARAMS = th.trimRnaParams( + moving_window_size=50, min_running_values=100, + thresh_scale=0.7, max_raw_obs=40000) + ############################################# ##### Pair-wise Distance and Clustering ##### ############################################# -def order_reads(log_r_pvals): - """ - Compute order of reads based on log p-values +def transform_and_trim_stats(reg_stats, are_pvals, trim_value): + if are_pvals: + reg_stats = -np.log10(reg_stats) + nan_r_stats = np.nan_to_num(reg_stats) + reg_stats[nan_r_stats > trim_value] = trim_value + else: + nan_r_stats = np.nan_to_num(reg_stats) + reg_stats[nan_r_stats > trim_value] = trim_value + reg_stats[nan_r_stats < -trim_value] = -trim_value + return reg_stats + +def order_reads(reg_stats): + """Compute order of reads based on log p-values or -log likelihood ratios """ - if log_r_pvals.shape[0] == 1: + if reg_stats.shape[0] == 1: return [0] # get pairwise distances between reads # will get some empty slice means warnings, so suppress those # (no seterr for this specific warning) with warnings.catch_warnings(): warnings.simplefilter("ignore", category=RuntimeWarning) - r_dists = pdist(log_r_pvals, lambda u, v: + r_dists = pdist(reg_stats, lambda u, v: np.nanmean(np.sqrt(((u-v)**2)))) r_dists[np.isnan(r_dists)] = np.nanmax(r_dists) + 1 # then perform single/min linkage clustering and 
return the leaf order return leaves_list(single(r_dists)) def sliding_window_dist(sig_diffs1, sig_diffs2, slide_span, num_bases): - """ - Compute distance over the minimum over a sliding window + """Compute distance over the minimum over a sliding window """ return np.sqrt(min(np.sum(np.square( sig_diffs1[i1:i1+num_bases] - sig_diffs2[i2:i2+num_bases])) @@ -109,14 +146,12 @@ def sliding_window_dist(sig_diffs1, sig_diffs2, slide_span, num_bases): for i2 in range((slide_span * 2) + 1))) def euclidian_dist(sig_diffs1, sig_diffs2): - """ - Compute Euclidean distance + """Compute Euclidean distance """ return np.sqrt(np.sum(np.square(sig_diffs1 - sig_diffs2))) def get_pairwise_dists(reg_sig_diffs, index_q, dists_q, slide_span=None): - """ - Compute pairwise distances between a set of signal shifts + """Compute pairwise distances between a set of signal shifts """ if slide_span > 0: num_bases=reg_sig_diffs[0].shape[0] - (slide_span * 2) @@ -147,37 +182,191 @@ def get_pairwise_dists(reg_sig_diffs, index_q, dists_q, slide_span=None): ###### Signal Normalization ###### ################################## -def get_valid_cpts(norm_signal, running_stat_width, num_events): - """ - DEPRECATED. Hook still included in re-squiggle, but commented out. +def compute_base_means(all_raw_signal, base_starts): + """Efficiently compute new base mean values from raw signal and base start positions + + Args: + all_raw_signal (`np.array`): raw nanopore signal obervation values + base_starts (`np.array::np.int32`): 0-based base start positions within raw signal + + Returns: + `np.array::np.float64` containing base mean levels + """ + return c_new_means(all_raw_signal.astype(np.float64), base_starts) + +def get_scale_values_from_events( + all_raw_signal, valid_cpts, outlier_thresh, + num_events=None, max_frac_events=None): + if num_events is not None or max_frac_events is not None: + if (num_events is None or + valid_cpts.shape[0] * max_frac_events < num_events): + num_events = int(valid_cpts.shape[0] * max_frac_events) + valid_cpts = valid_cpts[:num_events] + event_means = compute_base_means(all_raw_signal, valid_cpts) + read_med = np.median(event_means) + read_mad = np.median(np.abs(event_means - read_med)) + lower_lim = -outlier_thresh + upper_lim = outlier_thresh + + return th.scaleValues( + shift=read_med, scale=read_mad, + lower_lim=lower_lim, upper_lim=upper_lim, outlier_thresh=None) + +def trim_rna( + all_raw_signal, rsqgl_params, trim_rna_params=DEFAULT_TRIM_RNA_PARAMS): + all_raw_signal = all_raw_signal[:trim_rna_params.max_raw_obs] + num_events = np.int64( + all_raw_signal.shape[0] // rsqgl_params.mean_obs_per_event) + # get stdev over delta-mean events + valid_cpts = th.valid_cpts_w_cap( + all_raw_signal.astype(np.float64), rsqgl_params.min_obs_per_base, + rsqgl_params.running_stat_width, num_events) + _, window_sds = c_new_mean_stds( + all_raw_signal.astype(np.float64), valid_cpts) + + # now moving window through this array + n_windows = (window_sds.size - trim_rna_params.moving_window_size) + 1 + s_bytes = window_sds.strides[0] + moving_window_sds = np.lib.stride_tricks.as_strided( + window_sds, + shape=(n_windows, trim_rna_params.moving_window_size), + strides=(s_bytes, s_bytes)).mean(-1) + thresh = moving_window_sds.mean() * trim_rna_params.thresh_scale + + n_windows = (moving_window_sds.size - trim_rna_params.min_running_values) + 1 + s_bytes = moving_window_sds.strides[0] + running_mins = np.lib.stride_tricks.as_strided( + moving_window_sds, + shape=(n_windows, trim_rna_params.min_running_values), + 
strides=(s_bytes, s_bytes)).min(-1)
+    try:
+        pos_index = next(i for i, v in enumerate(running_mins) if v > thresh)
+    except StopIteration:
+        return 0

-    Get valid changepoints given largest differences in neighboring
-    moving windows
+    return valid_cpts[pos_index]

-    Note that this method is completely vectorized, but allows segments
-    as small as 2 observations. This should be okay R9+, but is problematic
-    for <=R7 and RNA
+def identify_stalls(all_raw_signal, stall_params, return_metric=False):
+    """Identify locations where bases have stalled in the pore. Two methods are
+    available depending on the parameters specified in stall_params.
     """
-    raw_cumsum = np.cumsum(np.concatenate([[0], norm_signal[:-1]]))
-    # get difference between all neighboring running_stat_width regions
-    running_diffs = np.abs(
-        (2 * raw_cumsum[running_stat_width:-running_stat_width]) -
-        raw_cumsum[:-2*running_stat_width] -
-        raw_cumsum[2*running_stat_width:])
-    not_peaks = np.logical_not(np.logical_and(
-        running_diffs > np.concatenate([[0], running_diffs[:-1]]),
-        running_diffs > np.concatenate([running_diffs[1:], [0]])))
-    running_diffs[not_peaks] = 0
-    valid_cpts = np.argsort(
-        running_diffs)[::-1][:num_events].astype(np.int64) + running_stat_width
-
-    return valid_cpts
+    def compute_running_mean_diffs():
+        """Compute average difference between n_window neighboring window means
+        each of size window_size.
+        """
+        moving_average = np.cumsum(all_raw_signal)
+        moving_average[stall_params.mini_window_size:] = (
+            moving_average[stall_params.mini_window_size:] -
+            moving_average[:-stall_params.mini_window_size])
+        moving_average = moving_average[
+            stall_params.mini_window_size - 1:] / stall_params.mini_window_size
+
+        # extract moving window averages at n_window offsets
+        offsets = [moving_average[
+            int(stall_params.mini_window_size * offset):
+            int(-stall_params.mini_window_size * (
+                stall_params.n_windows - offset - 1))]
+                   for offset in range(stall_params.n_windows - 1)] + [
+                       moving_average[int(stall_params.mini_window_size * (
+                           stall_params.n_windows - 1)):],]
+        # compute differences between all pairwise offsets
+        diffs = [np.abs(offsets[i] - offsets[j])
+                 for i in range(stall_params.n_windows)
+                 for j in range(i + 1, stall_params.n_windows)]
+
+        # compute average over offset differences at each valid position
+        diff_sums = diffs[0].copy()
+        for diff_i in diffs:
+            diff_sums += diff_i
+        return diff_sums / len(diffs)
+
+
+    # if the raw signal is too short to compute stall metrics
+    if all_raw_signal.shape[0] < stall_params.window_size:
+        if return_metric:
+            return [], np.repeat(np.NAN, all_raw_signal.shape[0])
+        return []
+
+    # identify potentially stalled signal from either running window means
+    # or running percentile difference methods
+    stall_metric = np.empty(all_raw_signal.shape, all_raw_signal.dtype)
+    stall_metric[:] = np.NAN
+    start_offset = int(stall_params.window_size * 0.5)
+    end_offset = (all_raw_signal.shape[0] - stall_params.window_size +
+                  start_offset + 1)
+    if (stall_params.lower_pctl is not None and
+        stall_params.upper_pctl is not None):
+        stall_metric[start_offset:end_offset] = c_compute_running_pctl_diffs(
+            all_raw_signal, np.int64(stall_params.window_size),
+            np.float64(stall_params.lower_pctl),
+            np.float64(stall_params.upper_pctl))
+    elif (stall_params.n_windows is not None and
+          stall_params.mini_window_size is not None):
+        assert (stall_params.window_size ==
+                stall_params.mini_window_size * stall_params.n_windows)
+        stall_metric[start_offset:end_offset] = compute_running_mean_diffs()
+    else:
+        raise th.TomboError(
+            'Must provide method specific parameters for stall detection')
+
+    # identify contiguous windows over threshold for minimal stretches
+    with np.errstate(invalid='ignore'):
+        stall_locs = np.where(np.diff(np.concatenate(
+            [[False], stall_metric <= stall_params.threshold])))[0]
+        if stall_metric[-1] <= stall_params.threshold:
+            stall_locs = np.concatenate([stall_locs, [stall_metric.shape[0]]])
+    stall_locs = stall_locs.reshape(-1,2)
+    stall_locs = stall_locs[(np.diff(stall_locs) >
+                             stall_params.min_consecutive_obs).flatten()]
+    if stall_locs.shape[0] == 0:
+        if return_metric:
+            return [], stall_metric
+        return []
+
+    # expand windows out to region that gave result below threshold
+    # since windows are centered (minus edge buffer)
+    expand_width = (stall_params.window_size // 2) - stall_params.edge_buffer
+    if expand_width > 0:
+        stall_locs[:,0] -= expand_width
+        stall_locs[:,1] += expand_width
+        # collapse intervals that now overlap
+        merged_stall_locs = []
+        prev_int = stall_locs[0]
+        for curr_int in stall_locs:
+            if curr_int[0] > prev_int[1]:
+                # add previous interval to all intervals
+                merged_stall_locs.append(prev_int)
+                prev_int = curr_int
+            else:
+                # extend previous interval since these overlap
+                prev_int[1] = curr_int[1]
+        merged_stall_locs.append(prev_int)
+        stall_locs = merged_stall_locs
+
+    if return_metric:
+        return stall_locs, stall_metric
+    return stall_locs

 def calc_kmer_fitted_shift_scale(
         prev_shift, prev_scale, r_event_means, r_model_means,
         r_model_inv_vars=None, method='theil_sen'):
-    """
-    Compute fitted shift and scale parameters based on read sequence
+    """Use robust Theil-Sen estimator to compute fitted shift and scale parameters based on read sequence
+
+    Args:
+        prev_shift (float): previous shift parameter
+        prev_scale (float): previous scale parameter
+        r_event_means (`np.array::np.float64`): read base mean levels
+        r_model_means (`np.array::np.float64`): expected base signal levels
+        r_model_inv_vars (`np.array::np.float64`): expected base signal level inverse variances for method of moments (`mom`) computation
+        method (str): one of `theil_sen`, `robust`, or `mom`
+
+    Returns:
+        Sequence-fitted scaling parameters
+
+        1) shift parameter (float)
+        2) scale parameter (float)
+        3) shift correction factor; multiply by ``prev_scale`` and add to ``prev_shift`` to get ``shift`` (float)
+        4) scale correction factor; multiply by ``prev_scale`` to get ``scale`` (float)
     """
     if method == 'robust':
         def read_lad_objective(x):
@@ -188,6 +377,13 @@ def read_lad_objective(x):
             read_lad_objective, np.array([0,1]), method='nelder-mead',
             options={'xtol': 1e-8}).x
     elif method == 'theil_sen':
+        def compute_slopes(r_event_means, r_model_means):
+            # despite computing each diff twice this vectorized solution is
+            # about 10X faster than a list comprehension approach
+            delta_event = r_event_means[:, np.newaxis] - r_event_means
+            delta_model = r_model_means[:, np.newaxis] - r_model_means
+            return delta_model[delta_event > 0] / delta_event[delta_event > 0]
+
         n_points = r_model_means.shape[0]
         # potentially sample points for long reads (>1kb)
         if r_model_means.shape[0] > MAX_POINTS_FOR_THEIL_SEN:
@@ -197,18 +393,11 @@ def read_lad_objective(x):
             r_model_means = r_model_means[samp_ind]
             r_event_means = r_event_means[samp_ind]
         # compute Theil-Sen slope estimator
-        # despite computing each diff twice this vectorized solution is about
-        # 10X faster than a list comprehension approach
-        delta_event = r_event_means[:, np.newaxis] - r_event_means
-        delta_model = r_model_means[:,
np.newaxis] - r_model_means - slopes = delta_model[delta_event > 0] / delta_event[delta_event > 0] - slopes.sort() - slope = np.median(slopes) + slope = np.median(c_compute_slopes(r_event_means, r_model_means)) inter = np.median(r_model_means - (slope * r_event_means)) if slope == 0: - raise NotImplementedError( - 'Read failed sequence-based signal re-scaling parameter ' + - 'estimation.') + raise th.TomboError('Read failed sequence-based signal ' + + 're-scaling parameter estimation.') # convert to shift and scale parameters (e.g. (obs - shift) / scale) scale_corr_factor = 1 / slope shift_corr_factor = -inter / slope @@ -228,7 +417,7 @@ def read_lad_objective(x): shift_corr_factor, scale_corr_factor = np.linalg.solve( coef_mat, dep_vect) else: - th._error_message_and_exit( + th.error_message_and_exit( 'Invalid k-mer fitted normalization parameter method: ' + method + '\n\t\tValid methods are "robust" and "mom".') @@ -239,7 +428,7 @@ def read_lad_objective(x): return shift, scale, shift_corr_factor, scale_corr_factor def estimate_global_scale(fast5_fns, num_reads=NUM_READS_FOR_SCALE): - if VERBOSE: th._status_message('Estimating global scale parameter.') + if VERBOSE: th.status_message('Estimating global scale parameter.') np.random.shuffle(fast5_fns) read_mads = [] if VERBOSE: @@ -247,7 +436,7 @@ def estimate_global_scale(fast5_fns, num_reads=NUM_READS_FOR_SCALE): for fast5_fn in fast5_fns: try: with h5py.File(fast5_fn, 'r') as fast5_data: - all_sig = th.get_raw_read_slot(fast5_data)['Signal'].value + all_sig = th.get_raw_read_slot(fast5_data)['Signal'][:] shift = np.median(all_sig) read_mads.append(np.median(np.abs(all_sig - shift))) if VERBOSE: bar.update(1) @@ -258,26 +447,46 @@ def estimate_global_scale(fast5_fns, num_reads=NUM_READS_FOR_SCALE): if VERBOSE: bar.close() if len(read_mads) == 0: - th._error_message_and_exit( + th.error_message_and_exit( 'No reads contain raw signal for ' + 'global scale parameter estimation.') if len(read_mads) < num_reads: - th._warning_message( + th.warning_message( 'Few reads contain raw signal for global scale parameter ' + 'estimation. Results may not be optimal.') return np.mean(read_mads) def normalize_raw_signal( - all_raw_signal, read_start_rel_to_raw, read_obs_len, - norm_type=None, channel_info=None, outlier_thresh=None, + all_raw_signal, read_start_rel_to_raw=0, read_obs_len=None, + norm_type='median', outlier_thresh=None, channel_info=None, scale_values=None, event_means=None, model_means=None, model_inv_vars=None, const_scale=None): - """ - Apply scaling and windsorizing parameters to normalize raw signal - """ + """Apply scaling and windsorizing parameters to normalize raw signal. 
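
The heart of Tombo's default ``median`` normalization, reduced to a stand-alone numpy sketch: shift by the median, scale by the median absolute deviation, then winsorize extreme observations. The threshold value is arbitrary, the input is simulated, and the real function additionally handles pA conversion and sequence-fitted re-scaling::

    import numpy as np

    raw_signal = np.random.normal(500, 30, size=10000).astype(np.float64)

    shift = np.median(raw_signal)
    # median absolute deviation as the robust scale estimate
    scale = np.median(np.abs(raw_signal - shift))
    norm_signal = (raw_signal - shift) / scale

    # winsorize extreme observations at +/- outlier_thresh MAD units
    outlier_thresh = 5.0
    norm_signal = np.clip(norm_signal, -outlier_thresh, outlier_thresh)
    print(round(float(norm_signal.std()), 2))
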
+
+    Args:
+        all_raw_signal (`np.array`): raw nanopore signal observation values
+        read_start_rel_to_raw (int): amount of signal to trim from beginning of the signal (default: 0)
+        read_obs_len (int): length of signal to process from `read_start_rel_to_raw` (default: full length)
+        norm_type (str): normalization type (`median` (default), `none`, `pA_raw`, `pA`, `median_const_scale`, `robust_median`; ignored if ``scale_values`` is provided)
+        outlier_thresh (float): windsorizing threshold (MAD units; default: None)
+        channel_info (:class:`tombo.tombo_helper.channelInfo`): channel information (optional; only for `pA` and `pA_raw`)
+        scale_values (:class:`tombo.tombo_helper.scaleValues`): scaling values (optional)
+        event_means (`np.array`): for `pA` fitted scaling parameters (optional)
+        model_means (`np.array`): for `pA` fitted scaling parameters (optional)
+        model_inv_vars (`np.array`): for `pA` fitted scaling parameters (optional)
+        const_scale (float): global scale parameter (optional)
+
+    Returns:
+        Normalized signal and scaling parameters
+
+        1) normalized signal observations (`np.array::np.float64`)
+        2) :class:`tombo.tombo_helper.scaleValues`
+    """
+    if read_obs_len is None:
+        read_obs_len = all_raw_signal.shape[0] - read_start_rel_to_raw
     if norm_type not in NORM_TYPES and scale_values is None:
-        raise NotImplementedError(
+        raise th.TomboError(
             'Normalization type ' + norm_type + ' is not a valid ' +
             'option and shift or scale parameters were not provided.')
@@ -308,7 +517,7 @@ def normalize_raw_signal(
             scale = const_scale
         elif norm_type == 'robust_median':
             shift = np.mean(np.percentile(raw_signal, ROBUST_QUANTS))
-            scale = np.median(np.abs(raw_signal - read_robust_med))
+            scale = np.median(np.abs(raw_signal - shift))
         else:
             shift = scale_values.shift
             scale = scale_values.scale
@@ -326,9 +535,13 @@ def normalize_raw_signal(
         else:
             lower_lim = scale_values.lower_lim
             upper_lim = scale_values.upper_lim
-        norm_signal = c_apply_outlier_thresh(norm_signal, lower_lim, upper_lim)
+        # provided scale_values may still contain None winsorizing limits
+        if lower_lim is not None and upper_lim is not None:
+            norm_signal = c_apply_outlier_thresh(
+                norm_signal, lower_lim, upper_lim)

-    return norm_signal, th.scaleValues(shift, scale, lower_lim, upper_lim)
+    return norm_signal, th.scaleValues(
+        shift, scale, lower_lim, upper_lim, outlier_thresh)


 #############################
@@ -336,11 +549,11 @@ def normalize_raw_signal(
 #############################

 class TomboModel(object):
+    """Load, store and access Tombo model attributes and sequence-based expected mean and standard deviation levels (median normalization only).
+
+    ..
automethod:: __init__ """ - Load, store and access Tombo model attributes and sequence-based expected - mean and standard deviation levels (median normalization only) - """ - def center_model(self, shift_corr_factor, scale_corr_factor): + def _center_model(self, shift_corr_factor, scale_corr_factor): centered_means = {} for kmer, k_mean in self.means.items(): centered_means[kmer] = ( @@ -350,12 +563,18 @@ def center_model(self, shift_corr_factor, scale_corr_factor): return - def make_constant_sd(self): + def _make_constant_sd(self): med_sd = np.median(list(self.sds.values())) self.sds = dict((kmer, med_sd) for kmer in self.sds) return - def write_model(self, ref_fn, alt_base=None, alt_name=None): + def write_model(self, ref_fn): + """Write TomboModel to specified file + + Args: + + ref_fn (str): filename to write TomboModel + """ # Explicity use btype string names for py3 compatiblity as well as # pickle-ability of numpy arrays for consistency. See discussion here: # https://github.com/numpy/numpy/issues/2407 @@ -367,23 +586,22 @@ def write_model(self, ref_fn, alt_base=None, alt_name=None): with h5py.File(ref_fn, 'w') as ref_fp: ref_fp.create_dataset('model', data=ref_for_file, compression="gzip") ref_fp.attrs['central_pos'] = self.central_pos - if alt_base is None: + if self.alt_base is None: ref_fp.attrs['model_name'] = STANDARD_MODEL_NAME else: - ref_fp.attrs['model_name'] = alt_name - ref_fp.attrs['alt_base'] = alt_base + ref_fp.attrs['model_name'] = self.alt_name + ref_fp.attrs['alt_base'] = self.alt_base return def _parse_tombo_model(self): - """ - Parse a tombo model file + """Parse a tombo model file """ try: with h5py.File(self.ref_fn, 'r') as ref_fp: - ref_raw = ref_fp['model'].value - central_pos = ref_fp.attrs['central_pos'] - model_name = ref_fp.attrs['model_name'] + ref_raw = ref_fp['model'][:] + central_pos = ref_fp.attrs.get('central_pos') + model_name = ref_fp.attrs.get('model_name') try: model_name = model_name.decode() @@ -391,7 +609,7 @@ def _parse_tombo_model(self): pass try: - alt_base = ref_fp.attrs['alt_base'] + alt_base = ref_fp.attrs.get('alt_base') except: alt_base = None try: @@ -400,7 +618,7 @@ def _parse_tombo_model(self): pass except: - th._error_message_and_exit('Invalid tombo model file provided: ' + th.error_message_and_exit('Invalid tombo model file provided: ' + unicode(self.ref_fn)) mean_ref = {} @@ -436,7 +654,7 @@ def _parse_text_model(self): mean_ref[kmer] = kmer_mean sd_ref[kmer] = kmer_sd except: - th._error_message_and_exit('Invalid text pA model file provided: ' + th.error_message_and_exit('Invalid text pA model file provided: ' + unicode(self.ref_fn)) self.means = mean_ref @@ -467,163 +685,263 @@ def _load_std_model(self, kmer_ref, central_pos): return - def add_invvar(self): + def _add_invvar(self): self.inv_var = {} for kmer, stdev in self.sds.items(): self.inv_var[kmer] = 1 / (stdev * stdev) return - def __init__(self, ref_fn, is_text_model=False, kmer_ref=None, - central_pos=None, minimal_startup=False): - if ref_fn is None: - assert kmer_ref is not None and central_pos is not None - self._load_std_model(kmer_ref, central_pos) + def _get_default_standard_ref(self, reads_index): + if th.is_sample_rna(reads_index=reads_index): + if VERBOSE: th.status_message( + 'Loading default canonical ***** RNA ***** model.') + std_ref_fn = STANDARD_MODELS[RNA_SAMP_TYPE] + self.seq_samp_type = th.seqSampleType(RNA_SAMP_TYPE, True) else: + if VERBOSE: th.status_message( + 'Loading default canonical ***** DNA ***** model.') + self.seq_samp_type = 
-    def __init__(self, ref_fn, is_text_model=False, kmer_ref=None,
-                 central_pos=None, minimal_startup=False):
-        if ref_fn is None:
-            assert kmer_ref is not None and central_pos is not None
-            self._load_std_model(kmer_ref, central_pos)
+    def _get_default_standard_ref(self, reads_index):
+        if th.is_sample_rna(reads_index=reads_index):
+            if VERBOSE: th.status_message(
+                'Loading default canonical ***** RNA ***** model.')
+            std_ref_fn = STANDARD_MODELS[RNA_SAMP_TYPE]
+            self.seq_samp_type = th.seqSampleType(RNA_SAMP_TYPE, True)
         else:
+            if VERBOSE: th.status_message(
+                'Loading default canonical ***** DNA ***** model.')
+            self.seq_samp_type = th.seqSampleType(DNA_SAMP_TYPE, False)
+            std_ref_fn = STANDARD_MODELS[DNA_SAMP_TYPE]
+        # get full filename path with setuptools
+        self.ref_fn = th.resolve_path(pkg_resources.resource_filename(
+            'tombo', 'tombo_models/' + std_ref_fn))
+
+        return
+
+    def _get_default_standard_ref_from_files(self, fast5_fns):
+        if th.is_sample_rna(fast5_fns=fast5_fns):
+            if VERBOSE: th.status_message(
+                'Loading default canonical ***** RNA ***** model.')
+            std_ref_fn = STANDARD_MODELS[RNA_SAMP_TYPE]
+            self.seq_samp_type = th.seqSampleType(RNA_SAMP_TYPE, True)
+        else:
+            if VERBOSE: th.status_message(
+                'Loading default canonical ***** DNA ***** model.')
+            std_ref_fn = STANDARD_MODELS[DNA_SAMP_TYPE]
+            self.seq_samp_type = th.seqSampleType(DNA_SAMP_TYPE, False)
+        # get full filename path with setuptools
+        self.ref_fn = th.resolve_path(pkg_resources.resource_filename(
+            'tombo', 'tombo_models/' + std_ref_fn))
+
+        return
+
+    def _check_ref_fn_exists(self):
+        if not os.path.exists(self.ref_fn):
+            th.error_message_and_exit('Invalid tombo model file provided.')
+
+    def __init__(
+            self, ref_fn=None, is_text_model=False, kmer_ref=None,
+            central_pos=None, seq_samp_type=None, reads_index=None,
+            fast5_fns=None, minimal_startup=True):
+        """Initialize a Tombo k-mer model object
+
+        Args:
+            ref_fn (str): tombo model filename
+            is_text_model (bool): `ref_fn` is text (e.g. https://github.com/nanoporetech/kmer_models/blob/master/r9.4_180mv_450bps_6mer/template_median68pA.model)
+            kmer_ref (list): containing 3-tuples 1) k-mer 2) expected level 3) level SD
+            central_pos (int): base within k-mer to assign signal (only applicable when `kmer_ref` is provided)
+            seq_samp_type (:class:`tombo.tombo_helper.seqSampleType`): sequencing sample type (default: None)
+            reads_index (:class:`tombo.tombo_helper.TomboReads`): for determining `seq_samp_type`
+            fast5_fns (list): fast5 read filenames from which to extract read metadata, for determining `seq_samp_type`
+            minimal_startup (bool): don't compute inverse variances (default: True)
+
+        Note:
+
+            Order of priority for initialization when multiple model specifications are provided:
+
+            1) `ref_fn`
+            2) `kmer_ref` (requires `central_pos`)
+            3) `seq_samp_type`
+            4) `reads_index`
+            5) `fast5_fns`
+
+            The last three options load a default model file included with Tombo. The last two determine the sample type from read metadata.
+ """ + if ref_fn is not None: self.ref_fn = th.resolve_path(ref_fn) if is_text_model: self._parse_text_model() else: self._parse_tombo_model() + self.seq_samp_type = seq_samp_type + elif kmer_ref is not None: + assert central_pos is not None, ( + 'central_pos must be provided is TomboModel is loaded ' + + 'with a kmer_ref') + self._load_std_model(kmer_ref, central_pos) + self.seq_samp_type = seq_samp_type + else: + if seq_samp_type is not None: + self.seq_samp_type = seq_samp_type + self.ref_fn = th.resolve_path(pkg_resources.resource_filename( + 'tombo', 'tombo_models/' + STANDARD_MODELS[ + seq_samp_type.name])) + elif reads_index is not None: + self._get_default_standard_ref(reads_index) + elif fast5_fns is not None: + self._get_default_standard_ref_from_files(fast5_fns) + else: + th.error_message_and_exit( + 'Must provide initialization method for TomboModel.') + self._parse_tombo_model() self.kmer_width = len(next(k for k in self.means)) self.is_std_model = (self.name == STANDARD_MODEL_NAME and self.alt_base is None) self.is_alt_model = not self.is_std_model + self.inv_var = None if not minimal_startup: - self.add_invvar() + self._add_invvar() + + def reverse_sequence_copy(self): + """Return a copy of model for processing sequence/signal in reverse (default models are all saved in genome sequence forward (5p to 3p) direction) + """ + rev_model = deepcopy(self) + rev_model.central_pos = self.kmer_width - self.central_pos - 1 + rev_model.means = dict((kmer[::-1], kmer_mean) + for kmer, kmer_mean in self.means.items()) + rev_model.sds = dict((kmer[::-1], kmer_sds) + for kmer, kmer_sds in self.sds.items()) + if self.inv_var is not None: + rev_model.inv_var = dict( + (kmer[::-1], kmer_inv_var) + for kmer, kmer_inv_var in self.inv_var.items()) + + return rev_model ############################ ##### Model Estimation ##### ############################ -def parse_tombo_models(alt_fns, std_ref): +def check_valid_alt_models(alt_refs, std_ref): + """Parse several alternative tombo model files """ - Parse several alternative tombo model files - """ - alt_refs = {} - for alt_model_fn in alt_fns: - alt_ref = TomboModel(alt_model_fn) + for alt_name, alt_ref in alt_refs.items(): if (std_ref.central_pos != alt_ref.central_pos or std_ref.kmer_width != alt_ref.kmer_width): - th._warning_message( - 'Standard and ' + alt_model_fn + ' alternative base ' + + th.warning_message( + 'Standard and ' + alt_ref.ref_fn + ' alternative base ' + 'models must be estimated using the same k-mer positions.') continue if not alt_ref.is_alt_model: - th._warning_message( - 'Alternative model ' + alt_model_fn + ' appears to be a ' + + th.warning_message( + 'Alternative model ' + alt_ref.ref_fn + ' appears to be a ' + 'standard model and will not be processed.') continue - if alt_ref.name in alt_refs: - th._warning_message( - alt_ref.name + ' alternative model found in more than one ' + - 'model file. 


############################
##### Model Estimation #####
############################

-def parse_tombo_models(alt_fns, std_ref):
+def check_valid_alt_models(alt_refs, std_ref):
+    """Parse several alternative tombo model files
     """
-    Parse several alternative tombo model files
-    """
-    alt_refs = {}
-    for alt_model_fn in alt_fns:
-        alt_ref = TomboModel(alt_model_fn)
+    for alt_name, alt_ref in alt_refs.items():
         if (std_ref.central_pos != alt_ref.central_pos or
             std_ref.kmer_width != alt_ref.kmer_width):
-            th._warning_message(
-                'Standard and ' + alt_model_fn + ' alternative base ' +
+            th.warning_message(
+                'Standard and ' + alt_ref.ref_fn + ' alternative base ' +
                 'models must be estimated using the same k-mer positions.')
             continue
         if not alt_ref.is_alt_model:
-            th._warning_message(
-                'Alternative model ' + alt_model_fn + ' appears to be a ' +
+            th.warning_message(
                'Alternative model ' + alt_ref.ref_fn + ' appears to be a ' +
                 'standard model and will not be processed.')
             continue
-        if alt_ref.name in alt_refs:
-            th._warning_message(
-                alt_ref.name + ' alternative model found in more than one ' +
-                'model file. Ignoring: ' + alt_model_fn)
-            continue
-        alt_refs[alt_ref.name] = alt_ref

     return alt_refs

-def get_default_standard_ref(raw_read_coverage, bio_samp_type=None):
-    if bio_samp_type is not None:
-        standard_ref_fn = STANDARD_MODELS[bio_samp_type]
-    elif th.is_rna(raw_read_coverage):
-        if VERBOSE: th._status_message(
-            'Using default canonical ***** RNA ***** model.')
-        standard_ref_fn = STANDARD_MODELS['RNA']
-    else:
-        if VERBOSE: th._status_message(
-            'Using default canonical ***** DNA ***** model.')
-        standard_ref_fn = STANDARD_MODELS['DNA']
-    # get full filename path with setuptools
-    standard_ref_fn = pkg_resources.resource_filename(
-        'tombo', 'tombo_models/' + standard_ref_fn)
-
-    return standard_ref_fn, bio_samp_type
-
-def get_default_standard_ref_from_files(fast5_fns, bio_samp_type=None):
-    if bio_samp_type is not None:
-        standard_ref_fn = STANDARD_MODELS[bio_samp_type]
-    elif th.is_rna_from_files(fast5_fns):
-        if VERBOSE: th._status_message(
-            'Using default canonical ***** RNA ***** model.')
-        standard_ref_fn = STANDARD_MODELS['RNA']
-        bio_samp_type = 'RNA'
-    else:
-        if VERBOSE: th._status_message(
-            'Using default canonical ***** DNA ***** model.')
-        standard_ref_fn = STANDARD_MODELS['DNA']
-        bio_samp_type = 'DNA'
-    # get full filename path with setuptools
-    standard_ref_fn = pkg_resources.resource_filename(
-        'tombo', 'tombo_models/' + standard_ref_fn)
-
-    return standard_ref_fn, bio_samp_type
-
 def _print_alt_models():
     alt_model_types = [tuple(mod_name.split(ALT_MODEL_SEP_CHAR))
                        for mod_name in ALTERNATE_MODELS.keys()]
-    alt_bio_samps = ['',] + sorted(set(list(zip(*alt_model_types))[0]))
+    alt_seq_samps = ['',] + sorted(set(list(zip(*alt_model_types))[0]))
     alt_mods = list(set(list(zip(*alt_model_types))[1]))
-    row_format ="{:<10}" * (len(alt_bio_samps)) + '\n'
-    sys.stderr.write(row_format.format(*alt_bio_samps))
+    row_format ="{:<10}" * (len(alt_seq_samps)) + '\n'
+    sys.stderr.write(row_format.format(*alt_seq_samps))
     for alt_mod in alt_mods:
         has_mod = [alt_mod,]
-        for bio_samp in alt_bio_samps[1:]:
-            has_mod.append(' X' if (bio_samp, alt_mod) in alt_model_types
-                           else '')
+        for seq_samp in alt_seq_samps[1:]:
+            has_mod.append(' X' if (seq_samp, alt_mod) in alt_model_types
+                           else '')
         sys.stderr.write(row_format.format(*has_mod))

     return

-def get_default_alt_ref(alt_name, raw_read_coverage, bio_samp_type=None):
-    if bio_samp_type is not None:
-        try:
-            alt_model_fn = ALTERNATE_MODELS[
-                bio_samp_type + ALT_MODEL_SEP_CHAR + alt_name]
-        except KeyError:
-            alt_model_fn = None
-    elif th.is_rna(raw_read_coverage):
-        bio_samp_type = 'RNA'
-        try:
-            alt_model_fn = ALTERNATE_MODELS['RNA' + ALT_MODEL_SEP_CHAR + alt_name]
-        except KeyError:
-            alt_model_fn = None
-    else:
-        bio_samp_type = 'DNA'
-        try:
-            alt_model_fn = ALTERNATE_MODELS['DNA' + ALT_MODEL_SEP_CHAR + alt_name]
-        except KeyError:
-            alt_model_fn = None
+def load_default_alt_ref(alt_name, seq_samp_type):
+    try:
+        alt_model_fn = ALTERNATE_MODELS[
+            seq_samp_type.name + ALT_MODEL_SEP_CHAR + alt_name]
+    except KeyError:
+        alt_model_fn = None
     if alt_model_fn is not None:
         # get full filename path with setuptools
         alt_model_fn = pkg_resources.resource_filename(
             'tombo', 'tombo_models/' + alt_model_fn)
     if alt_model_fn is None or not os.path.isfile(alt_model_fn):
-        th._warning_message(
+        th.warning_message(
             'Tombo default model for ' + alt_name + ' in ' +
-            bio_samp_type + ' does not exists.')
+            seq_samp_type.name + ' does not exist.')
+        return None
+
+    return TomboModel(ref_fn=alt_model_fn, seq_samp_type=seq_samp_type)
+def load_alt_refs(alt_model_fns, alt_names, reads_index, std_ref,
+                  seq_samp_type=None):
+    alt_refs = {}
+    if alt_model_fns is not None:
+        # load alternative models from filenames
+        for alt_model_fn in alt_model_fns:
+            alt_ref = TomboModel(alt_model_fn)
+            if alt_ref.name in alt_refs:
+                th.warning_message(
+                    alt_ref.name + ' alternative model found in more than one ' +
+                    'model file. Ignoring: ' + alt_model_fn)
+                continue
+            alt_refs[alt_ref.name] = alt_ref
+    else:
+        # load alternative models from internal defaults
+        if seq_samp_type is None:
+            seq_samp_type = th.get_seq_sample_type(reads_index=reads_index)
+        for alt_name in alt_names:
+            alt_ref = load_default_alt_ref(alt_name, seq_samp_type)
+            if alt_ref is None:
+                continue
+            alt_refs[alt_name] = alt_ref

+    check_valid_alt_models(alt_refs, std_ref)
+
+    return alt_refs
+
+def load_valid_models(
+        tb_model_fn, plot_default_stnd, alt_model_fn,
+        plot_default_alt, reads_index, ctrl_fast5s_dirs=None):
+    # if no model was requested
+    if (tb_model_fn is None and not plot_default_stnd and
        alt_model_fn is None and not plot_default_alt):
        return None, None
-    return alt_model_fn, bio_samp_type

+    std_ref = TomboModel(ref_fn=tb_model_fn, reads_index=reads_index)
+    if alt_model_fn is not None:
+        alt_ref = TomboModel(ref_fn=alt_model_fn)
+    elif plot_default_alt is not None:
+        seq_samp_type = std_ref.seq_samp_type
+        if seq_samp_type is None:
+            seq_samp_type = th.get_seq_sample_type(reads_index=reads_index)
+        alt_ref = load_default_alt_ref(plot_default_alt, seq_samp_type)
+    else:
+        alt_ref = None

-def load_alt_refs(alt_names, raw_read_coverage, std_ref, bio_samp_type=None):
-    """
-    Load several default alternative tombo models
-    """
-    alt_fns = []
-    for alt_name in alt_names:
-        alt_model_fn, _ = get_default_alt_ref(
-            alt_name, raw_read_coverage, bio_samp_type)
-        if alt_model_fn is None:
-            continue
-        alt_fns.append(alt_model_fn)
+    if ctrl_fast5s_dirs is not None and tb_model_fn is not None:
+        th.warning_message(
+            'Both a second set of FAST5s and a tombo model were ' +
+            'provided. Two samples with model plotting is not ' +
+            'currently available. Models requested will be ignored.')

-    return parse_tombo_models(alt_fns, std_ref)
+    return std_ref, alt_ref

 def get_ref_from_seq(seq, std_ref, rev_strand=False, alt_ref=None):
+    """Compute expected signal levels for a sequence from a reference model
+
+    Args:
+
+        seq (str): genomic sequence to be converted to expected signal levels
+        std_ref (:class:`tombo.tombo_stats.TomboModel`): expected signal level model
+        rev_strand (bool): flip sequence (after extracting k-mers for expected level model lookup)
+        alt_ref (:class:`tombo.tombo_stats.TomboModel`): an alternative expected signal level model
+
+    Note:
+
+        Returned expected signal levels will be trimmed compared to the passed sequence based on the `std_ref.kmer_width` and `std_ref.central_pos`.
+
+    Returns:
+        Expected signal level references
+
+        1) ref_means (`np.array::np.float64`) expected signal levels
+        2) ref_sds (`np.array::np.float64`) expected signal level sds
+        3) alt_means (`np.array::np.float64`) alternate expected signal levels
+        4) alt_sds (`np.array::np.float64`) alternate expected signal level sds
+    """
     seq_kmers = [seq[i:i + std_ref.kmer_width]
                  for i in range(len(seq) - std_ref.kmer_width + 1)]
     # get stat lookups from seq on native strand then flip if rev_strand
@@ -634,7 +952,7 @@ def get_ref_from_seq(seq, std_ref, rev_strand=False, alt_ref=None):
         ref_means = np.array([std_ref.means[kmer] for kmer in seq_kmers])
         ref_sds = np.array([std_ref.sds[kmer] for kmer in seq_kmers])
     except KeyError:
-        th._error_message_and_exit(
+        th.error_message_and_exit(
             'Invalid sequence encountered from genome sequence.')
     if alt_ref is None:
         alt_means, alt_sds = None, None
@@ -644,39 +962,75 @@ def get_ref_from_seq(seq, std_ref, rev_strand=False, alt_ref=None):

     return ref_means, ref_sds, alt_means, alt_sds

+def get_ref_from_seq_with_gaps(reg_seq, std_ref, rev_strand):
+    # loop over regions without valid sequence (non-ACGT)
+    reg_ref_means, reg_ref_sds = (
+        np.empty(len(reg_seq) - std_ref.kmer_width + 1),
+        np.empty(len(reg_seq) - std_ref.kmer_width + 1))
+    reg_ref_means[:] = np.NAN
+    reg_ref_sds[:] = np.NAN
+    prev_ibr_end = 0
+    for inv_base_run_m in th.INVALID_BASE_RUNS.finditer(reg_seq):
+        ibr_start, ibr_end = inv_base_run_m.start(), inv_base_run_m.end()
+        # if valid region is too short continue
+        if ibr_start - prev_ibr_end < std_ref.kmer_width:
+            prev_ibr_end = ibr_end
+            continue
+        subreg_ref_means, subreg_ref_sds, _, _ = get_ref_from_seq(
+            reg_seq[prev_ibr_end:ibr_start], std_ref)
+        reg_ref_means[prev_ibr_end:
+                      ibr_start - std_ref.kmer_width + 1] = subreg_ref_means
+        reg_ref_sds[prev_ibr_end:
+                    ibr_start - std_ref.kmer_width + 1] = subreg_ref_sds
+        prev_ibr_end = ibr_end
+
+    # if there is valid sequence at the end of a region include it here
+    if prev_ibr_end <= len(reg_seq) - std_ref.kmer_width:
+        subreg_ref_means, subreg_ref_sds, _, _ = get_ref_from_seq(
+            reg_seq[prev_ibr_end:], std_ref)
+        reg_ref_means[prev_ibr_end:] = subreg_ref_means
+        reg_ref_sds[prev_ibr_end:] = subreg_ref_sds
+
+    if rev_strand:
+        reg_ref_means = reg_ref_means[::-1]
+        reg_ref_sds = reg_ref_sds[::-1]
+
+    return reg_ref_means, reg_ref_sds

 def calc_med_sd(vals):
-    """
-    Helper function to compute median and standard deviation from a numpy array
+    """Helper function to compute median and standard deviation from a numpy array
     """
     return np.median(vals), np.std(vals)
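A quick sketch of ``get_ref_from_seq`` on a toy sequence, illustrating the trimming noted in its docstring (the model file path is hypothetical)::

    from tombo import tombo_stats as ts

    std_ref = ts.TomboModel(ref_fn='path/to/standard.model')
    seq = 'ACGTACGTACGT'
    ref_means, ref_sds, _, _ = ts.get_ref_from_seq(seq, std_ref)
    # one expected level per k-mer window
    assert ref_means.shape[0] == len(seq) - std_ref.kmer_width + 1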

-def get_region_kmer_levels(
-        reg_reads, cov_thresh, upstrm_bases, dnstrm_bases,
-        cs_cov_thresh, est_mean, region_size, reg_start, strand):
-    """
-    Compute mean or median and standard deviation for each k-mer
+def get_region_kmer_levels(reg_data, cov_thresh, upstrm_bases, dnstrm_bases,
+                           cs_cov_thresh, est_mean, region_size):
+    """Compute mean or median and standard deviation for each k-mer
     """
     if cs_cov_thresh is not None:
         # sample reads until requested mean depth of coverage is achieved
         cs_num_bases_thresh = region_size * cs_cov_thresh
-        np.random.shuffle(reg_reads)
-        cumm_num_bases = np.cumsum([r_data.end - r_data.start
-                                    for r_data in reg_reads])
+        np.random.shuffle(reg_data.reads)
+        cumm_num_bases = np.cumsum([
+            max(r_data.end, reg_data.end) - min(r_data.start, reg_data.start)
+            for r_data in reg_data.reads])
         try:
             cs_num_reads = next((i for i, v in enumerate(cumm_num_bases)
                                  if v >= cs_num_bases_thresh))
-            reg_reads = reg_reads[:cs_num_reads]
+            reg_data.update(reads=reg_data.reads[:cs_num_reads])
         except StopIteration:
             # if threshold is not met use all reads from region
             pass

-    base_events = th.get_reads_events(reg_reads)
+    # TODO convert this to the intervalData method get_base_levels function
+    # involves a bit of logical refactoring below (which should be much simpler)
+    base_events = th.get_reads_events(reg_data.reads)
     if len(base_events) == 0:
         return

     # get intervals within the region where coverage is high enough
     # for model estimation
-    reg_cov = np.array([len(base_events[pos]) if pos in base_events else 0
-                        for pos in range(reg_start, reg_start + region_size)])
+    reg_cov = np.array([
+        len(base_events[pos]) if pos in base_events else 0
+        for pos in range(reg_data.start, reg_data.end)])
     cov_intervals = np.where(np.diff(np.concatenate(
         [[False], reg_cov > cov_thresh])))[0]
     if reg_cov[-1] > cov_thresh:
@@ -693,32 +1047,34 @@ def get_region_kmer_levels(
         for kmer in product(DNA_BASES, repeat=kmer_width))
     # upstream and downstream changes the sequence selection
     # depending on the strand
-    bb, ab = (upstrm_bases, dnstrm_bases) if strand == '+' else \
+    bb, ab = (upstrm_bases, dnstrm_bases) if reg_data.strand == '+' else \
        (dnstrm_bases, upstrm_bases)
-    for int_start, int_end in cov_intervals:
-        int_seq = th.get_seq_from_reads(
-            reg_start + int_start - bb, reg_start + int_end + ab, reg_reads)
-        if strand == '-':
+    for cov_start, cov_end in cov_intervals:
+        # get region sequence from expanded region to include k-mer lookups
+        int_seq = reg_data.copy().update(
+            start=reg_data.start + cov_start - bb,
+            end=reg_data.start + cov_end + ab).add_seq().seq
+        if reg_data.strand == '-':
             int_seq = th.comp_seq(int_seq)
-        int_len = int_end - int_start
+        int_len = cov_end - cov_start
         for pos in range(int_len):
-            pos_kmer = int_seq[pos:pos+kmer_width]
-            if strand == '-':
+            pos_kmer = int_seq[pos:pos + kmer_width]
+            if reg_data.strand == '-':
                 pos_kmer = pos_kmer[::-1]
             try:
                 if est_mean:
                     reg_kmer_levels[pos_kmer].append(c_mean_std(
-                        base_events[reg_start+pos+int_start]))
+                        base_events[reg_data.start + pos + cov_start]))
                 else:
                     reg_kmer_levels[pos_kmer].append(calc_med_sd(
-                        base_events[reg_start+pos+int_start]))
+                        base_events[reg_data.start + pos + cov_start]))
             except KeyError:
                 continue

     return reg_kmer_levels

 def _est_kmer_model_worker(
-        region_q, kmer_level_q, progress_q, raw_read_coverage, cov_thresh,
+        region_q, kmer_level_q, progress_q, reads_index, cov_thresh,
         upstrm_bases, dnstrm_bases, cs_cov_thresh, est_mean, region_size):
     while not region_q.empty():
         try:
@@ -729,16 +1085,16 @@ def _est_kmer_model_worker(
             continue
         break

-        reg_reads = [r_data for r_data in raw_read_coverage[(chrm, strand)]
-                     if not (r_data.start >= reg_start + region_size or
-                             r_data.end <= reg_start)]
-        if len(reg_reads) == 0:
+        reg_data = th.intervalData(
+            chrm=chrm, start=reg_start, end=reg_start + region_size,
+            strand=strand).add_reads(reads_index)
+        if len(reg_data.reads) == 0:
             progress_q.put(1)
             continue

         reg_kmer_levels = get_region_kmer_levels(
-            reg_reads, cov_thresh, upstrm_bases, dnstrm_bases,
-            cs_cov_thresh, est_mean, region_size, reg_start, strand)
+            reg_data, cov_thresh, upstrm_bases, dnstrm_bases,
+            cs_cov_thresh, est_mean, region_size)
         if reg_kmer_levels is not None:
             kmer_level_q.put(reg_kmer_levels)
         progress_q.put(1)
@@ -754,17 +1110,17 @@ def _est_kmer_model_worker(*args):
     return

 def extract_kmer_levels(
-        raw_read_coverage, region_size, cov_thresh, upstrm_bases, dnstrm_bases,
-        cs_cov_thresh, est_mean, num_processes):
-    chrm_sizes = th.get_chrm_sizes(raw_read_coverage)
+        reads_index, region_size, cov_thresh, upstrm_bases, dnstrm_bases,
+        cs_cov_thresh, est_mean=False, num_processes=1):
+    chrm_sizes = th.get_chrm_sizes(reads_index)

     region_q = Queue()
     kmer_level_q = Queue()
     progress_q = Queue()
     num_regions = 0
     for chrm, chrm_len in chrm_sizes.items():
-        plus_covered = (chrm, '+') in raw_read_coverage
-        minus_covered = (chrm, '-') in raw_read_coverage
+        plus_covered = (chrm, '+') in reads_index
+        minus_covered = (chrm, '-') in reads_index
         for reg_start in range(0, chrm_len, region_size):
             if plus_covered:
                 region_q.put((chrm, '+', reg_start))
@@ -774,7 +1130,7 @@ def extract_kmer_levels(
                 num_regions +=1

     est_args = (
-        region_q, kmer_level_q, progress_q, raw_read_coverage, cov_thresh,
+        region_q, kmer_level_q, progress_q, reads_index, cov_thresh,
         upstrm_bases, dnstrm_bases, cs_cov_thresh, est_mean, region_size)
     est_ps = []
     for p_id in range(num_processes):
@@ -783,7 +1139,7 @@ def extract_kmer_levels(
         est_ps.append(p)

     if VERBOSE:
-        th._status_message('Extracting average k-mer levels.')
+        th.status_message('Extracting average k-mer levels.')
         bar = tqdm(total=num_regions, smoothing=0)
     all_reg_kmer_levels = []
     while any(p.is_alive() for p in est_ps):
@@ -800,22 +1156,27 @@ def extract_kmer_levels(
     while not kmer_level_q.empty():
         reg_kmer_levels = kmer_level_q.get(block=False)
         all_reg_kmer_levels.append(reg_kmer_levels)
-    if VERBOSE: bar.close()
+    if VERBOSE:
+        while not progress_q.empty():
+            iter_proc = progress_q.get(block=False)
+            if VERBOSE: bar.update(iter_proc)
+        bar.close()

     if len(all_reg_kmer_levels) == 0:
-        th._error_message_and_exit(
+        th.error_message_and_exit(
             'No genomic positions contain --minimum-test-reads. Consider ' +
             'setting this option to a lower value.')

     return all_reg_kmer_levels

-def tabulate_kmer_levels(kmer_width, all_reg_kmer_levels, min_kmer_obs):
-    if VERBOSE: th._status_message('Tabulating k-mer model statistics.')
+def tabulate_kmer_levels(all_reg_kmer_levels, min_kmer_obs):
+    if VERBOSE: th.status_message('Tabulating k-mer model statistics.')
     all_kmer_mean_sds = []
     if _DEBUG_EST_STD:
         kmer_dens = []
         save_x = np.linspace(KERNEL_DENSITY_RANGE[0], KERNEL_DENSITY_RANGE[1],
                              _DEBUG_EST_NUM_KMER_SAVE)
+    kmer_width = len(next(iter(all_reg_kmer_levels[0].keys())))
     for kmer in product(DNA_BASES, repeat=kmer_width):
         kmer = ''.join(kmer)
         try:
@@ -823,7 +1184,7 @@ def tabulate_kmer_levels(kmer_width, all_reg_kmer_levels, min_kmer_obs):
             reg_kmer_levels[kmer] for reg_kmer_levels in all_reg_kmer_levels
             if len(reg_kmer_levels[kmer]) > 0])
         except ValueError:
-            th._error_message_and_exit(
+            th.error_message_and_exit(
                 'At least one k-mer is not covered at any poitions by ' +
                 '--minimum-test-reads.\n\t\tConsider fitting to a smaller ' +
                 'k-mer via the --upstream-bases and --downstream-bases, ' +
@@ -834,7 +1195,7 @@ def tabulate_kmer_levels(kmer_width, all_reg_kmer_levels, min_kmer_obs):
             sum(len(reg_levs[''.join(kmer)])
                 for reg_levs in all_reg_kmer_levels)
             for kmer in product(DNA_BASES, repeat=kmer_width))
-        th._error_message_and_exit(
+        th.error_message_and_exit(
             'K-mers represeneted in fewer observations than ' +
            'requested in the provided reads. Consider a shorter ' +
            'k-mer or providing more reads.\n\t' + unicode(min_obs) +
@@ -856,20 +1217,138 @@ def tabulate_kmer_levels(kmer_width, all_reg_kmer_levels, min_kmer_obs):
                     for x, y in zip(save_x, dens_i)) + '\n')

     return all_kmer_mean_sds

+# methods needed for (re-squiggle) segmentation are also needed here for
+# RNA event-based scaling (for model scale centering)
+def load_resquiggle_parameters(
+        seq_samp_type, sig_aln_params=None, seg_params=None,
+        use_save_bandwidth=False):
+    """Load parameters for re-squiggle algorithm
+
+    Args:
+        seq_samp_type (:class:`tombo.tombo_helper.seqSampleType`): sequencing sample type
+        sig_aln_params (tuple): signal alignment parameters (optional; default: load seq_samp_type defaults)
+        seg_params (tuple): segmentation parameters (optional; default: load seq_samp_type defaults)
+        use_save_bandwidth (bool): load larger "save" bandwidth
+
+    Returns:
+        :class:`tombo.tombo_helper.resquiggleParams`
+    """
+    if sig_aln_params is None:
+        (match_evalue, skip_pen, bandwidth, save_bandwidth, max_half_z_score,
+         band_bound_thresh, start_bw, start_save_bw,
+         start_n_bases) = ALGN_PARAMS_TABLE[seq_samp_type.name]
+    else:
+        # unpack signal alignment parameters
+        (match_evalue, skip_pen, bandwidth, save_bandwidth,
+         max_half_z_score, band_bound_thresh, start_bw, start_save_bw,
+         start_n_bases) = sig_aln_params
+        bandwidth = int(bandwidth)
+        save_bandwidth = int(save_bandwidth)
+        band_bound_thresh = int(band_bound_thresh)
+        start_bw = int(start_bw)
+        start_save_bw = int(start_save_bw)
+        start_n_bases = int(start_n_bases)
+
+    if use_save_bandwidth:
+        bandwidth = save_bandwidth
+
+    if seg_params is None:
+        (running_stat_width, min_obs_per_base,
+         mean_obs_per_event) = SEG_PARAMS_TABLE[seq_samp_type.name]
+    else:
+        (running_stat_width, min_obs_per_base, mean_obs_per_event) = seg_params
+
+    z_shift, stay_pen = get_dynamic_prog_params(match_evalue)
+
+    rsqgl_params = th.resquiggleParams(
+        match_evalue, skip_pen, bandwidth, max_half_z_score,
+        running_stat_width, min_obs_per_base, mean_obs_per_event,
+        z_shift, stay_pen, seq_samp_type.name == RNA_SAMP_TYPE,
+        band_bound_thresh, start_bw, start_save_bw,
+        start_n_bases)
+
+    return rsqgl_params
+
+def compute_num_events(
+        signal_len, seq_len, mean_obs_per_event,
+        min_event_to_seq_ratio=MIN_EVENT_TO_SEQ_RATIO):
+    """Compute number of events to find for this read
+
+    Args:
+        signal_len (int): length of raw signal
+        seq_len (int): length of sequence
+        mean_obs_per_event (int): mean raw observations per genome base
+        min_event_to_seq_ratio (float): minimum event to sequence ratio (optional)
+
+    Returns:
+        Number of events to find for this read
+    """
+    return max(signal_len // mean_obs_per_event,
+               int(seq_len * min_event_to_seq_ratio))
+
+def remove_stall_cpts(stall_ints, valid_cpts):
+    if len(stall_ints) == 0:
+        return valid_cpts
+
+    # RNA data contains stall regions that can cause problems for
+    # banded dynamic programming so they are removed here
+    stall_int_iter = iter(stall_ints)
+    curr_stall_int = next(stall_int_iter)
+    non_stall_cpts = []
+    # loop over valid cpts
+    for i, cpt in enumerate(valid_cpts):
+        # iterate through stall intervals until the current interval end
+        # is greater than the cpt to check against
+        while cpt > curr_stall_int[1]:
+            try:
+                curr_stall_int = next(stall_int_iter)
+            except StopIteration:
+                break
+        if not (curr_stall_int[0] < cpt < curr_stall_int[1]):
+            non_stall_cpts.append(i)
+
+    return valid_cpts[non_stall_cpts]
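As a concrete check of the ``compute_num_events`` formula above (all numbers invented): a 45,000-observation signal, a 500-base sequence, ~7 observations per event and a minimum event-to-sequence ratio of 2 give::

    num_events = max(45000 // 7, int(500 * 2))
    assert num_events == 6428  # the signal-derived count dominates here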

 def center_model_to_median_norm(
-        raw_read_coverage, init_ref, max_reads=NUM_READS_TO_ADJUST_MODEL):
+        reads_index, init_ref, max_reads=NUM_READS_TO_ADJUST_MODEL):
     upstrm_bases = init_ref.central_pos
     dnstrm_bases = init_ref.kmer_width - init_ref.central_pos - 1

+    def get_event_scale_values(all_raw_signal, r_len):
+        rsqgl_params = load_resquiggle_parameters(
+            th.seqSampleType(RNA_SAMP_TYPE, True))
+
+        num_events = compute_num_events(
+            all_raw_signal.shape[0], r_len,
+            rsqgl_params.mean_obs_per_event, MIN_EVENT_TO_SEQ_RATIO)
+        valid_cpts = th.valid_cpts_w_cap_t_test(
+            all_raw_signal.astype(np.float64), rsqgl_params.min_obs_per_base,
+            rsqgl_params.running_stat_width, num_events)
+        if COLLAPSE_RNA_STALLS:
+            valid_cpts = remove_stall_cpts(
+                identify_stalls(all_raw_signal, DEFAULT_STALL_PARAMS),
+                valid_cpts)
+        scale_values = get_scale_values_from_events(
+            all_raw_signal, valid_cpts, OUTLIER_THRESH,
+            num_events=RNA_SCALE_NUM_EVENTS,
+            max_frac_events=RNA_SCALE_MAX_FRAC_EVENTS)
+        return normalize_raw_signal(
+            all_raw_signal, scale_values=scale_values)
+
     def get_read_corr_factors(r_data):
         with h5py.File(r_data.fn, 'r+') as fast5_data:
-            all_raw_signal = th.get_raw_read_slot(fast5_data)['Signal'].value
+            all_raw_signal = th.get_raw_read_slot(fast5_data)['Signal'][:]
             event_starts, r_seq = th.get_multiple_slots_read_centric(
                 fast5_data, ('start', 'base'), r_data.corr_group)

             if r_data.rna:
                 all_raw_signal = all_raw_signal[::-1]
-            norm_signal, scale_values = normalize_raw_signal(
-                all_raw_signal, 0, all_raw_signal.shape[0], 'median',
-                None, None)
+                if USE_RNA_EVENT_SCALE:
+                    norm_signal, scale_values = get_event_scale_values(
+                        all_raw_signal, r_data.end - r_data.start)
+                else:
+                    # use raw signal median normalization
+                    norm_signal, scale_values = normalize_raw_signal(
+                        all_raw_signal)
+            else:
+                norm_signal, scale_values = normalize_raw_signal(all_raw_signal)

             event_starts = event_starts.astype(np.int64)
             rsrtr = r_data.read_start_rel_to_raw + event_starts[upstrm_bases]
@@ -893,35 +1372,33 @@ def get_read_corr_factors(r_data):
             (_, _, shift_corr_factor,
              scale_corr_factor) = calc_kmer_fitted_shift_scale(
                  scale_values.shift, scale_values.scale,
-                 c_new_means(norm_signal, event_starts), r_ref_means,
+                 compute_base_means(norm_signal, event_starts), r_ref_means,
                  method='theil_sen')

         return shift_corr_factor, scale_corr_factor

+    th.status_message('Centering model to normalized signal')
     all_shift_corr_factors, all_scale_corr_factors = [], []
-    all_reads = [r_data for cs_reads in raw_read_coverage.values()
-                 for r_data in cs_reads]
-    random.shuffle(all_reads)
-    not_enough_reads = True
+    all_reads = list(reads_index.iter_reads())
+    np.random.shuffle(all_reads)
     for r_data in all_reads:
         try:
             r_shift_corr_factor, r_scale_corr_factor = get_read_corr_factors(
                 r_data)
             all_shift_corr_factors.append(r_shift_corr_factor)
             all_scale_corr_factors.append(r_scale_corr_factor)
-            if len(all_scale_corr_factors) >= max_reads:
-                not_enough_reads = False
+            if len(all_shift_corr_factors) >= max_reads:
                 break
         except:
             continue

-    if not_enough_reads:
+    if len(all_shift_corr_factors) < max_reads:
         if len(all_shift_corr_factors) == 0:
-            th._error_message_and_exit(
+            th.error_message_and_exit(
                 'No reads succcessfully processed for sequence-based ' +
                 'normalization parameter re-fitting.')
-        th._warning_message(
+        th.warning_message(
             'Fewer reads succcessfully processed for sequence-based ' +
            'normalization parameter re-fitting than requested.')
@@ -931,39 +1408,44 @@ def get_read_corr_factors(r_data):

     med_shift_corr_factor = np.median(all_shift_corr_factors)
     med_scale_corr_factor = np.median(all_scale_corr_factors)

-    th._status_message('Shift and scale adjustments to match model to ' +
+    th.status_message('Shift and scale adjustments to match model to ' +
                       'median normalization: ' + str(med_shift_corr_factor) +
                       " " + str(med_scale_corr_factor))
-    init_ref.center_model(med_shift_corr_factor, med_scale_corr_factor)
+    init_ref._center_model(med_shift_corr_factor, med_scale_corr_factor)

     return init_ref

+if _PROFILE_CENTER_REF:
+    center_model_to_median_norm_wrapper = center_model_to_median_norm
+    def center_model_to_median_norm(*args):
+        import cProfile
+        cProfile.runctx(
+            'center_model_to_median_norm_wrapper(*args)', globals(), locals(),
+            filename='center_kmer_model.prof')
+        return
+
 def estimate_kmer_model(
-        f5_dirs, corrected_group, basecall_subgroups,
+        fast5s_dirs, corr_grp, bc_subgrps,
         kmer_ref_fn, cov_thresh, upstrm_bases, dnstrm_bases, min_kmer_obs,
         kmer_specific_sd, cs_cov_thresh, est_mean, region_size, num_processes):
+    """Estimate a standard tombo k-mer model
     """
-    Estimate a standard tombo k-mer model
-    """
-    raw_read_coverage = th.parse_fast5s(
-        f5_dirs, corrected_group, basecall_subgroups)
+    reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps)
     all_reg_kmer_levels = extract_kmer_levels(
-        raw_read_coverage, region_size, cov_thresh, upstrm_bases, dnstrm_bases,
+        reads_index, region_size, cov_thresh, upstrm_bases, dnstrm_bases,
         cs_cov_thresh, est_mean, num_processes)

-    all_kmer_mean_sds = tabulate_kmer_levels(
-        upstrm_bases + dnstrm_bases + 1, all_reg_kmer_levels, min_kmer_obs)
+    all_kmer_mean_sds = tabulate_kmer_levels(all_reg_kmer_levels, min_kmer_obs)

     # adjust model to match median normalization best via Theil-Sen optimizer fit
     # this will increase the accuracy of median normalized re-squiggle results
     # and should reduce the need for (or number of) iterative re-squiggle runs
-    init_ref = TomboModel(
-        ref_fn=None, kmer_ref=all_kmer_mean_sds, central_pos=upstrm_bases)
+    init_ref = TomboModel(kmer_ref=all_kmer_mean_sds, central_pos=upstrm_bases)

-    centered_ref = center_model_to_median_norm(raw_read_coverage, init_ref)
+    centered_ref = center_model_to_median_norm(reads_index, init_ref)

     if not kmer_specific_sd:
-        centered_ref.make_constant_sd()
+        centered_ref._make_constant_sd()
     centered_ref.write_model(kmer_ref_fn)

     return
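A sketch of driving this estimation pipeline directly from the Python API (all argument values below are illustrative placeholders, not recommended settings)::

    from tombo import tombo_stats as ts

    ts.estimate_kmer_model(
        ['path/to/fast5s/'], 'RawGenomeCorrected_000', ['BaseCalled_template'],
        'new_model.tombo.model', cov_thresh=10, upstrm_bases=1, dnstrm_bases=4,
        min_kmer_obs=5, kmer_specific_sd=False, cs_cov_thresh=None,
        est_mean=False, region_size=10000, num_processes=4)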
@@ -1099,13 +1581,13 @@ def parse_base_levels(
             all_kmer_levels.values())
         if fewest_kmer_obs < kmer_obs_thresh:
             if fewest_kmer_obs < min_kmer_obs_to_est:
-                th._error_message_and_exit(
+                th.error_message_and_exit(
                     'Too few minimal k-mer observations to continue to ' +
                     'alternative estimation. Minimal k-mer has ' +
                     unicode(fewest_kmer_obs) + ' total observations and ' +
                     unicode(min_kmer_obs_to_est) +
                     ' observations per k-mer are required.')
-            th._warning_message(
+            th.warning_message(
                 'Requested minimal k-mer observations not found in all reads. ' +
                 'Continuing to estimation using a k-mer with ' +
                 unicode(fewest_kmer_obs) + ' total observations')
@@ -1135,24 +1617,23 @@ def parse_kmer_densities_file(dens_fn):
     for kmer, dens_i in kmer_dens_raw.items():
         if first_len is None: first_len = len(dens_i)
         if len(dens_i) != first_len:
-            th._error_message_and_exit('Density file is valid.')
+            th.error_message_and_exit('Density file is invalid.')
         kmer_dens[kmer] = np.array(dens_i)

     return kmer_dens

 def est_kernel_density(
-        raw_read_coverage, std_ref, kmer_obs_thresh,
-        density_basename, save_x, kernel_dens_bw, num_processes,
-        alt_or_stnd_name='alt', parse_levels_batch_size=ALT_EST_BATCH,
-        max_kmer_obs=MAX_KMER_OBS, min_kmer_obs_to_est=MIN_KMER_OBS_TO_EST):
-    all_reads = [r_data for cs_reads in raw_read_coverage.values()
-                 for r_data in cs_reads]
+        reads_index, std_ref, kmer_obs_thresh, density_basename, save_x,
+        kernel_dens_bw, num_processes, alt_or_stnd_name='alt',
+        parse_levels_batch_size=ALT_EST_BATCH, max_kmer_obs=MAX_KMER_OBS,
+        min_kmer_obs_to_est=MIN_KMER_OBS_TO_EST):
+    all_reads = list(reads_index.iter_reads())
     np.random.shuffle(all_reads)
     base_levels = parse_base_levels(
         all_reads, std_ref, parse_levels_batch_size, kmer_obs_thresh,
         max_kmer_obs, min_kmer_obs_to_est, num_processes)

-    if VERBOSE: th._status_message('Fitting kernel densities for k-mer levels.')
+    if VERBOSE: th.status_message('Fitting kernel densities for k-mer levels.')
     kmer_dens = {}
     for kmer, norm_levels in base_levels.items():
         norm_levels = np.array(norm_levels)
@@ -1169,54 +1650,47 @@ def est_kernel_density(

     return kmer_dens
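The per-k-mer density fit elided inside the hunk above follows the standard kernel density recipe; roughly (this sketch uses ``scipy.stats.gaussian_kde`` as a stand-in for the internal implementation)::

    import numpy as np
    from scipy.stats import gaussian_kde

    norm_levels = np.random.normal(0.0, 1.0, 500)    # levels observed for one k-mer
    save_x = np.linspace(-5, 5, 500)                 # grid akin to KERNEL_DENSITY_RANGE
    kde = gaussian_kde(norm_levels, bw_method=0.05)  # bw analogous to kernel_dens_bw
    dens_i = kde.evaluate(save_x)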

 def estimate_kmer_densities(
-        f5_dirs, control_dirs, corrected_group, basecall_subgroups,
-        standard_ref_fn, bio_samp_type, kmer_obs_thresh, density_basename,
+        fast5s_dirs, ctrl_fast5s_dirs, corr_grp, bc_subgrps,
+        standard_ref_fn, seq_samp_type, kmer_obs_thresh, density_basename,
         kernel_dens_bw, save_x, num_processes):
-    raw_read_coverage = th.parse_fast5s(
-        f5_dirs, corrected_group, basecall_subgroups)
-    cntrl_read_coverage = th.parse_fast5s(
-        control_dirs, corrected_group, basecall_subgroups)
-
-    if VERBOSE: th._status_message('Parsing standard model file.')
-    if standard_ref_fn is None:
-        standard_ref_fn, bio_samp_type = get_default_standard_ref(
-            raw_read_coverage, bio_samp_type)
-    std_ref = TomboModel(standard_ref_fn)
-
-    if VERBOSE: th._status_message('Parsing base levels from alternative reads.')
+    reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps)
+    ctrl_reads_index = th.TomboReads(ctrl_fast5s_dirs, corr_grp, bc_subgrps)
+
+    if VERBOSE: th.status_message('Parsing standard model file.')
+    std_ref = TomboModel(ref_fn=standard_ref_fn, seq_samp_type=seq_samp_type,
+                         reads_index=reads_index)
+
+    if VERBOSE: th.status_message('Parsing base levels from alternative reads.')
     alt_dens = est_kernel_density(
-        raw_read_coverage, std_ref, kmer_obs_thresh, density_basename,
+        reads_index, std_ref, kmer_obs_thresh, density_basename,
         save_x, kernel_dens_bw, num_processes, 'alternate')

-    if VERBOSE: th._status_message('Parsing base levels from standard reads.')
+    if VERBOSE: th.status_message('Parsing base levels from standard reads.')
     std_dens = est_kernel_density(
-        cntrl_read_coverage, std_ref, kmer_obs_thresh, density_basename,
+        ctrl_reads_index, std_ref, kmer_obs_thresh, density_basename,
         save_x, kernel_dens_bw, num_processes, 'control')

     return alt_dens, std_dens, std_ref

 def load_kmer_densities(
-        alt_dens_fn, std_dens_fn, f5_dirs, corrected_group,
-        basecall_subgroups, std_ref_fn, bio_samp_type):
-    if VERBOSE: th._status_message('Parsing standard model file.')
-    if std_ref_fn is None:
-        if f5_dirs is None and bio_samp_type is None:
-            th._error_message_and_exit(
+        alt_dens_fn, std_dens_fn, fast5s_dirs, corr_grp, bc_subgrps,
+        std_ref_fn, seq_samp_type):
+    if VERBOSE: th.status_message('Parsing standard model file.')
+    reads_index = None
+    if std_ref_fn is None and seq_samp_type is None:
+        if fast5s_dirs is None:
+            th.error_message_and_exit(
                 'Must provide a FAST5s directory, a canonical model ' +
                 'file or spcify the biological sample type.')
-    raw_read_coverage = None
-    if f5_dirs is not None:
-        raw_read_coverage = th.parse_fast5s(
-            f5_dirs, corrected_group, basecall_subgroups)
-        std_ref_fn, bio_samp_type = get_default_standard_ref(
-            raw_read_coverage, bio_samp_type)
-    std_ref = TomboModel(std_ref_fn)
-
-    if VERBOSE: th._status_message('Parsing density files.')
+        reads_index = th.TomboReads(fast5s_dirs, corr_grp, bc_subgrps)
+    std_ref = TomboModel(ref_fn=std_ref_fn, seq_samp_type=seq_samp_type,
+                         reads_index=reads_index)
+
+    if VERBOSE: th.status_message('Parsing density files.')
     alt_dens = parse_kmer_densities_file(alt_dens_fn)
     std_dens = parse_kmer_densities_file(std_dens_fn)

     num_dens_points = next(v for v in alt_dens.values()).shape[0]
     if num_dens_points != next(v for v in std_dens.values()).shape[0]:
-        th._error_message_and_exit(
+        th.error_message_and_exit(
             'Alternative and standard density ' +
             'estimates do not correspond.')
@@ -1272,11 +1746,11 @@ def get_peak_frac(kmer_std_dens, kmer_alt_dens):
     std_frac = np.percentile([
         get_peak_frac(std_dens[kmer], shifted_alt_dens[kmer])
         for kmer in std_dens if kmer.count(alt_base) == 1], alt_frac_pctl)
-    if VERBOSE: th._status_message(
+    if VERBOSE: th.status_message(
         'Alternative base incorporation rate estimate: ' +
         unicode(1 - std_frac))
     if std_frac >= 1:
-        th._warning_message(
+        th.warning_message(
             'Alternative base incorporation rate ' +
             'estimate is approximately 0. Consider lowering ' +
             '--alt-fraction-percentile.')
@@ -1301,34 +1775,31 @@ def get_peak_frac(kmer_std_dens, kmer_alt_dens):
             alt_level = np.average(save_x, weights=diff_dens)
             alt_ref.append((kmer, alt_level, model_sd))

-    alt_ref = TomboModel(
-        ref_fn=None, kmer_ref=alt_ref, central_pos=std_ref.central_pos,
-        minimal_startup=True)
+    alt_ref = TomboModel(kmer_ref=alt_ref, central_pos=std_ref.central_pos)

     return alt_ref

 def estimate_alt_model(
-        f5_dirs, control_dirs, corrected_group, basecall_subgroups,
-        std_ref_fn, bio_samp_type, alt_base, alt_frac_pctl,
+        fast5s_dirs, ctrl_fast5s_dirs, corr_grp, bc_subgrps,
+        std_ref_fn, seq_samp_type, alt_base, alt_frac_pctl,
         kmer_obs_thresh, density_basename, kernel_dens_bw, alt_dens_fn,
         std_dens_fn, num_processes, num_dens_points=NUM_DENS_POINTS):
-    """
-    Estimate an alternative model from a sample with a single,
+    """Estimate an alternative model from a sample with a single,
     known, randomly-incorporated alternative base
     """
     if alt_dens_fn is None or std_dens_fn is None:
         save_x = np.linspace(KERNEL_DENSITY_RANGE[0], KERNEL_DENSITY_RANGE[1],
                              num_dens_points)
         alt_dens, std_dens, std_ref = estimate_kmer_densities(
-            f5_dirs, control_dirs, corrected_group, basecall_subgroups,
-            std_ref_fn, bio_samp_type, kmer_obs_thresh, density_basename,
+            fast5s_dirs, ctrl_fast5s_dirs, corr_grp, bc_subgrps,
+            std_ref_fn, seq_samp_type, kmer_obs_thresh, density_basename,
             kernel_dens_bw, save_x, num_processes)
     else:
         alt_dens, std_dens, std_ref, save_x = load_kmer_densities(
-            alt_dens_fn, std_dens_fn, f5_dirs, corrected_group,
-            basecall_subgroups, std_ref_fn, bio_samp_type)
+            alt_dens_fn, std_dens_fn, fast5s_dirs, corr_grp, bc_subgrps,
+            std_ref_fn, seq_samp_type)

-    if VERBOSE: th._status_message('Isolating alternative base distribtuions.')
+    if VERBOSE: th.status_message('Isolating alternative base distributions.')
     # perform alternative density isolation algorithm
     alt_ref = isolate_alt_density(
         alt_dens, std_dens, alt_base, alt_frac_pctl, std_ref, save_x)
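The isolated alternative level is simply the density-weighted mean over the signal grid (``np.average(save_x, weights=diff_dens)`` above); a toy illustration with invented numbers::

    import numpy as np

    save_x = np.array([-1.0, 0.0, 1.0, 2.0])
    diff_dens = np.array([0.0, 0.2, 0.6, 0.2])  # excess alternative density
    alt_level = np.average(save_x, weights=diff_dens)
    assert np.isclose(alt_level, 1.0)  # (0*0.2 + 1*0.6 + 2*0.2) / 1.0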
@@ -1407,9 +1878,8 @@ def calc_window_fishers_method(pvals, lag):
     assert lag > 0, 'Invalid p-value window provided.'
     width = (lag * 2) + 1
     if pvals.shape[-1] < width:
-        raise NotImplementedError(
-            "P-values vector too short for Fisher's Method " +
-            "window compuation.")
+        raise th.TomboError("P-values vector too short for Fisher's Method " +
+                            "window computation.")
     with np.errstate(invalid='ignore'):
         pvals = np.maximum(pvals, SMALLEST_PVAL)
     log_sums = np.lib.stride_tricks.as_strided(
@@ -1463,11 +1933,65 @@ def calc_mann_whitney_z_score(samp1, samp2):

     return z

-def compute_accuracy_rates(stat_has_mod, num_plot_points=ROC_PLOT_POINTS):
+def get_read_seg_score(r_means, r_ref_means, r_ref_sds):
+    """Compute expected to observed signal matching score
+
+    Args:
+        r_means (`np.array::np.float64`): observed base signal levels
+        r_ref_means (`np.array::np.float64`): expected base signal levels
+        r_ref_sds (`np.array::np.float64`): expected base signal level sds
+
+    Returns:
+        Mean half z-score for observed versus expected signal levels
     """
-    Given a list or numpy array of true/false values, function returns
-    num_plot_point evenly spaced values along the true positive, false
-    positive and presicion arrays
+    return np.mean([
+        np.abs((b_m - b_ref_m) / b_ref_s)
+        for b_m, b_ref_m, b_ref_s in zip(r_means, r_ref_means, r_ref_sds)])
+
+def score_valid_bases(read_tb, event_means, r_ref_means, r_ref_sds):
+    """Compute expected to observed signal matching score for bases not deleted in dynamic programming
+
+    Args:
+        read_tb (`np.array::np.int32`): event changepoints
+        event_means (`np.array::np.float64`): observed base signal levels
+        r_ref_means (`np.array::np.float64`): expected base signal levels
+        r_ref_sds (`np.array::np.float64`): expected base signal level sds
+
+    Returns:
+        Mean half z-score for observed versus expected signal levels (for valid bases)
+    """
+    valid_bases = np.where(np.diff(read_tb) != 0)[0]
+    if valid_bases.shape[0] == 0:
+        raise th.TomboError('Invalid path through read start')
+    valid_ref_means, valid_ref_sds = (
+        r_ref_means[valid_bases], r_ref_sds[valid_bases])
+    base_means = np.array([event_means[b_start:b_end].mean()
+                           for b_start, b_end in zip(read_tb[:-1], read_tb[1:])
+                           if b_start != b_end])
+    return get_read_seg_score(base_means, valid_ref_means, valid_ref_sds)
+
+def get_dynamic_prog_params(match_evalue):
+    """Compute dynamic programming shift parameters from an expected match value
+    """
+    z_shift = HALF_NORM_EXPECTED_VAL + match_evalue
+    stay_pen = match_evalue
+    return z_shift, stay_pen
+
+
+##########################
+##### Statistics I/O #####
+##########################
+
+def compute_auc(tp_rate, fp_rate):
+    return np.sum(tp_rate[:-1] * (fp_rate[1:] - fp_rate[:-1]))
+
+def compute_mean_avg_precison(tp_rate, precision):
+    return np.mean(np.cumsum((tp_rate[1:] - tp_rate[:-1]) * precision[1:]))
+
+def compute_accuracy_rates(stat_has_mod, num_plot_points=ROC_PLOT_POINTS):
+    """Given a list or numpy array of true/false values, function returns num_plot_points evenly spaced values along the true positive, false positive and precision arrays
+    """
     tp_cumsum = np.cumsum(stat_has_mod)
     tp_rate = tp_cumsum / tp_cumsum[-1]
@@ -1486,266 +2010,315 @@ def compute_accuracy_rates(stat_has_mod, num_plot_points=ROC_PLOT_POINTS):

     return tp_rate, fp_rate, precision
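``get_read_seg_score`` above is just the mean absolute z-score of observed versus expected levels; a quick numeric check with invented values::

    import numpy as np

    r_means = np.array([0.5, -0.2])
    r_ref_means = np.array([0.0, 0.0])
    r_ref_sds = np.array([0.5, 0.2])
    # |0.5|/0.5 = 1.0 and |-0.2|/0.2 = 1.0, so the mean half z-score is 1.0
    assert np.isclose(np.mean(np.abs((r_means - r_ref_means) / r_ref_sds)), 1.0)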

-def get_motif_stats(motif, stats, genome_index):
-    stat_has_mod = []
-    for stat_seq in stats.iter_stat_seqs(
-            genome_index, motif.mod_pos - 1,
-            motif.motif_len - motif.mod_pos):
-        if motif.motif_pat.match(stat_seq) is not None:
-            stat_has_mod.append(True)
-        # don't include sites that aren't at the base of interest
-        elif stat_seq[motif.mod_pos - 1] == motif.mod_base:
-            stat_has_mod.append(False)
-
-    return compute_accuracy_rates(stat_has_mod)
-
-
-#########################################
-##### Local Model-based Re-squiggle #####
-#########################################
-
-def get_dynamic_prog_params(match_evalue):
-    """
-    Compute dynamic programming shift parameters from an expected match
-    expected value
-    """
-    z_shift = HALF_NORM_EXPECTED_VAL + match_evalue
-    stay_pen = match_evalue
-    return z_shift, stay_pen
-
-def get_begin_nan(arr):
-    """
-    Find the index of the first NAN value
-    """
-    tot=0
-    for val in iter(arr):
-        if not np.isnan(val): break
-        tot += 1
-    return tot
-
-def get_read_signif_shift_regions(
-        z_scores, z_thresh, context_bases, signif_shift_len_thresh=None):
-    """
-    Identify regions along a read that do not match well with the genomic
-    reference tombo model
-    """
-    # extend NANs by context_bases to avoid regions extending outside of
-    # valid regions over which statistics were computed
-    start_nans = get_begin_nan(z_scores) + context_bases
-    z_scores[:start_nans] = np.NAN
-    end_nans = get_begin_nan(z_scores[::-1]) + context_bases
-    if end_nans > 0:
-        z_scores[-end_nans:] = np.NAN
-    # suppress NAN errors form less than compare
-    with np.errstate(invalid='ignore'):
-        signif_shift_locs = z_scores > z_thresh
-    # find the edges of significantly shifted signal regions
-    signif_shift_chngpnts = np.where(np.diff(signif_shift_locs) != 0)[0] + 1
-
-    signif_shift_regs = zip(signif_shift_chngpnts[:-1:2],
-                            signif_shift_chngpnts[1::2])
-    signif_shift_cntxt_regs = []
-    curr_start, curr_end = next(signif_shift_regs)
-    for reg_start, reg_end in signif_shift_regs:
-        # if next region overlaps the current region with context
-        if reg_start - (context_bases * 2) <= curr_end:
-            # extend the current region to cover both regions
-            curr_end = reg_end
+def _compute_motif_stats(
+        stats, motif_descs, genome_index, pos_stat_name='damp_frac',
+        stats_per_block=None, total_stats_limit=None):
+    all_motif_stats = dict(
+        (mod_name, []) for mod_name in list(zip(*motif_descs))[1])
+    before_bases = max((
+        motif.mod_pos for motif in list(zip(*motif_descs))[0])) - 1
+    after_bases = max((motif.motif_len - motif.mod_pos
+                       for motif in list(zip(*motif_descs))[0]))
+    total_num_stats = 0
+    for chrm, strand, start, end, block_stats in stats:
+        if strand == '+':
+            seq_start = max(start - before_bases, 0)
+            seq_end = end + after_bases
         else:
-            # else check that the region is long enough
-            if (signif_shift_len_thresh is None or
-                curr_end - curr_start >= signif_shift_len_thresh):
-                # and add it to the list of regions to model re-squiggle
-                signif_shift_cntxt_regs.append((curr_start - context_bases,
-                                                curr_end + context_bases))
-            # and set the next region to be the current one
-            curr_start, curr_end = reg_start, reg_end
-
-    # add last region
-    if (signif_shift_len_thresh is None or
-        curr_end - curr_start >= signif_shift_len_thresh):
-        signif_shift_cntxt_regs.append((curr_start - context_bases,
-                                        curr_end + context_bases))
-
-    return signif_shift_cntxt_regs
+            seq_start = max(start - after_bases, 0)
+            seq_end = end + before_bases
+
+        reg_seq = genome_index.get_seq(
+            chrm, seq_start, seq_end, error_end=False)
+        # TODO potentially keep all mod sites when they are extremely rare
+        # randomly sub-sample per-read stats here
+        if (stats_per_block is not None and
+                block_stats.shape[0] > stats_per_block):
+            block_stats = block_stats[np.random.choice(
+                block_stats.shape[0], stats_per_block, replace=False)]
+        total_num_stats += block_stats.shape[0]
+        for r_pos_stat in block_stats:
+            # extract position sequence
+            if strand == '+':
+                r_pos_seq = reg_seq[
+                    r_pos_stat['pos'] - seq_start - before_bases:
+                    r_pos_stat['pos'] - seq_start + after_bases + 1]
+            else:
+                r_pos_seq = th.rev_comp(reg_seq[
+                    r_pos_stat['pos'] - seq_start - after_bases:
+                    r_pos_stat['pos'] - seq_start + before_bases + 1])

+            # add statistic and whether the sequence matches each motif
+            for motif, mod_name in motif_descs:
+                if r_pos_seq[before_bases] != motif.mod_base: continue
+                all_motif_stats[mod_name].append((
+                    r_pos_stat[pos_stat_name],
+                    bool(motif.motif_pat.match(
+                        r_pos_seq[before_bases - motif.mod_pos + 1:]))))

+        if (total_stats_limit is not None and
+                total_num_stats >= total_stats_limit):
+            break
+
+    return all_motif_stats

-##########################
-##### Statistics I/O #####
-##########################

-def write_stats(
-        all_reg_stats, stats_bsnm, stat_type, min_test_vals, alt_name=None):
-    """
-    Write a tombo statistics file
-    """
-    if VERBOSE: th._status_message(
-        'Saving signal shift significance testing results.')
-    def convert_reg_stats(reg_stats):
-        # get all unique fasta record names to store in HDF5 attributes and
-        # encode as integers in the stats numpy table
-        chrms_lookup = dict(zip(
-            sorted(set(map(itemgetter(2), reg_stats))), count()))
-
-        np_stats = []
-        for (reg_frac_standard_base, reg_poss, chrm, strand,
-             reg_cov, ctrl_cov, valid_cov) in reg_stats:
-            np_stats.append(np.array(
-                [pos_stats for pos_stats in zip(
-                    reg_frac_standard_base, reg_poss,
-                    repeat(chrms_lookup[chrm]), repeat(strand),
-                    reg_cov, ctrl_cov, valid_cov)
-                 if not np.isnan(pos_stats[0])],
-                dtype=[
-                    (str('frac'), 'f8'), (str('pos'), 'u4'), (str('chrm'), 'u4'),
-                    (str('strand'), 'S1'), (str('cov'), 'u4'),
-                    (str('control_cov'), 'u4'), (str('valid_cov'), 'u4')]))
-
-        np_stats = np.concatenate(np_stats)
-
-        np_stats = np_stats[np.greater_equal(
-            np_stats['valid_cov'], min_test_vals)]
-
-        return np_stats, chrms_lookup
-
-    def write_stats_data(stats_fp, stats, stat_type, chrms_lookup):
-        stats_fp.create_dataset('stats', data=stats, compression="gzip")
-        stats_fp.attrs['stat_type'] = stat_type
-
-        chrms_subgrp = stats_fp.create_group('chromosome_values')
-        for chrm, chrm_val in chrms_lookup.items():
-            chrms_subgrp.attrs[chrm] = chrm_val
-
-        return
-
-    stats_fn = stats_bsnm + '.tombo.stats' if alt_name is None else \
-        stats_bsnm + '.' + alt_name + '.tombo.stats'
-    all_reg_stats, chrms_lookup = convert_reg_stats(all_reg_stats)
-    with h5py.File(stats_fn, 'w') as stats_fp:
-        write_stats_data(stats_fp, all_reg_stats, stat_type, chrms_lookup)
-
-    return

+def calc_damp_fraction(cov_damp_counts, fracs, valid_cov):
+    """Compute dampened fraction of un-modified reads using provided modified and un-modified pseudo-counts from cov_damp_counts
+
+    See https://nanoporetech.github.io/tombo/text_output.html?highlight=dampened#text-output-browser-files for more details
+    """
+    damp_fracs = np.empty(fracs.shape[0])
+    damp_fracs[:] = np.nan
+    non_mod_counts = np.round(fracs * valid_cov)
+    # compute dampened fraction of modified reads by adding pseudo-counts
+    # to the modified and un-modified counts (equivalent to a beta prior
+    # on the fraction estimation as a binomial variable)
+    damp_fracs = (non_mod_counts + cov_damp_counts['unmod']) / (
+        valid_cov + sum(list(cov_damp_counts.values())))
+
+    return damp_fracs
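The pseudo-count arithmetic in ``calc_damp_fraction`` is easy to verify by hand; a toy example (damp counts invented)::

    import numpy as np

    cov_damp_counts = {'unmod': 0.5, 'mod': 2.5}
    fracs = np.array([1.0, 0.5])    # raw fraction of un-modified reads
    valid_cov = np.array([10, 10])  # valid coverage at each position

    non_mod_counts = np.round(fracs * valid_cov)  # 10 and 5 un-modified reads
    damp_fracs = (non_mod_counts + 0.5) / (valid_cov + 3.0)
    # a fully un-modified site is dampened away from 1.0: 10.5 / 13 ~= 0.81
    assert np.allclose(damp_fracs, [10.5 / 13, 5.5 / 13])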

+# TODO write BaseStats class since many operations are quite similar for
+# TomboStats and PerReadStats
 class TomboStats(object):
+    """Parse and retrieve relevant information from a standard (per-genomic base) Tombo statistics file.
+
+    .. automethod:: __init__
     """
-    Parse and retrieve relevant information from a standard (not per-read) Tombo
-    statistics file.
-    """
+    # TODO add attributes
     def _parse_stats(self):
         if self.stats_fn is None or not os.path.isfile(self.stats_fn):
-            th._error_message_and_exit(
+            th.error_message_and_exit(
                 'Statistics file not provided or provided file does not exist.')

+        self._fp = h5py.File(self.stats_fn, 'r')
+        self.stat_type = self._fp.attrs.get('stat_type')
+        self.region_size = self._fp.attrs.get('block_size')
+        self.stat_blocks = self._fp[STAT_BLOCKS_H5_NAME]
+        self.num_blocks = 0
+        blocks_index = defaultdict(dict)
+        for block_name, block_data in self.stat_blocks.items():
+            blocks_index[
+                (block_data.attrs.get('chrm'), block_data.attrs.get('strand'))][
+                    block_data.attrs.get('start')] = block_name
+            self.num_blocks += 1
+        self.blocks_index = dict(blocks_index)
+
+        self.cov_thresh = self._fp.attrs.get(COV_THRESH_H5_NAME)
+        most_signif_grp = self._fp[MOST_SIGNIF_H5_NAME]
+        # read full most significant array into memory
+        self.most_signif_stats = most_signif_grp[MOST_SIGNIF_H5_NAME][:]
+        self.most_signif_chrm_map = dict(
+            (v, k) for k, v in most_signif_grp['chrm_ids'].attrs.items())
+        self.cov_damp_counts = dict(self._fp[
+            COV_DAMP_COUNTS_H5_NAME].attrs.items())
+
+        return
+
+    def _create_new_stats_file(self):
+        # try to remove file for overwriting old results
         try:
-            with h5py.File(self.stats_fn, 'r') as stats_fp:
-                self.stats = stats_fp['stats'].value
-                self.stat_type = stats_fp.attrs['stat_type']
-                try:
-                    self.chrms_lookup = dict(
-                        (chrm_val, chrm_name) for chrm_name, chrm_val in
-                        stats_fp['chromosome_values'].attrs.items())
-                    self.has_chrm_lookup = True
-                except:
-                    self.has_chrm_lookup = False
-                    th._warning_message(
-                        'Old version of Tombo used to create statistics ' +
-                        'file. Upgrading to the current version suggested ' +
-                        'for best results.')
+            os.remove(self.stats_fn)
         except:
-            th._error_message_and_exit(
-                'Attempt to load statistics file failed. May be an old ' +
-                'version of statistics file. Try deleting statistics ' +
-                'file and re-calculating using current tombo version.')
+            pass
+        # open file for writing
+        self._fp = h5py.File(self.stats_fn, 'w')
+
+        # save attributes to file and open stats blocks group
+        self._fp.attrs['stat_type'] = self.stat_type
+        self._fp.attrs['block_size'] = self.region_size
+        self.stat_blocks = self._fp.create_group(STAT_BLOCKS_H5_NAME)
+
+        # save coverage damp counts and threshold attributes
+        self._fp.attrs[COV_THRESH_H5_NAME] = self.cov_thresh
+        self.cov_damp_counts_grp = self._fp.create_group(COV_DAMP_COUNTS_H5_NAME)
+        self.cov_damp_counts_grp.attrs[
+            'unmod'] = self.cov_damp_counts['unmod']
+        self.cov_damp_counts_grp.attrs[
+            'mod'] = self.cov_damp_counts['mod']
+
+        # storage for most significant stats
+        self.most_signif_sites = self._fp.create_group(MOST_SIGNIF_H5_NAME)
+        self.running_most_signif_sites = np.empty(
+            shape=(self.num_most_signif,),
+            dtype=[(str('damp_frac'), 'f8'), (str('frac'), 'f8'),
+                   (str('pos'), 'u4'), (str('cov'), 'u4'),
+                   (str('control_cov'), 'u4'), (str('valid_cov'), 'u4'),
+                   (str('chrm'), 'u4'), (str('strand'), 'S1')])
+        self.running_most_signif_sites[:] = np.NAN
+        # store a queue of completed stat batches to be concatenated and stored
+        # as a group to avoid too many array copy and sorting ops
+        self.queued_stat_batches = []
+        # store chromosome names in dict for storing most signif array
+        self.curr_chrm_id = 0
+        self.chrm_names = {}
+        self.chrm_id_grp = self.most_signif_sites.create_group('chrm_ids')
+
+        self.is_empty = True

         return

-    def __init__(self, stats_fn):
-        """
-        Parse a standard Tombo statistics file.
+    def __init__(self, stats_fn, stat_type=None, region_size=None,
+                 cov_damp_counts=None, cov_thresh=None, num_most_signif=None,
+                 most_signif_num_batches=MOST_SIGNIF_NUM_BATCHES_DEFAULT):
+        """Parse or open for writing a standard (per-genomic base) Tombo statistics file.
+
+        Example::
+
+            stats = tombo_stats.TomboStats('path/to/stats.file')
+            for chrm, strand, pos, frac, damp_frac, valid_cov in stats.iter_most_signif_sites():
+                # do stuff
+
+        Args:
+            stats_fn (str): filename for previously saved tombo stats
+            stat_type (str): type of statistic (model_compare, de_novo, or sample_compare); only applicable for new file writing
+            region_size (int): size of chunked storage blocks; only applicable for new file writing
+            cov_damp_counts (tuple): pseudo-counts for modified and un-modified reads to compute ``damp_frac``
+            cov_thresh (int): only sites with coverage greater than or equal to this value will be stored
+            num_most_signif (int): number of most significant sites to be stored for faster access
+            most_signif_num_batches (int): number of region batches to store before re-computing the most significant array (default: 10)
+
+        Warning:
+
+            If all arguments are provided the current file's contents will be deleted.
+
+            Intended to open a fresh ``TomboStats`` file for writing.
         """
         self.stats_fn = stats_fn
-        self.has_damp_frac = False
-        self.cov_damp_counts = None
-        self.has_stat_dict = False
-        self.stat_dict = None
-        self._parse_stats()
-        return
+        if any(arg is None for arg in (stat_type, region_size, cov_damp_counts,
+                                       cov_thresh, num_most_signif)):
+            self.open_for_writing = False
+            # open file for reading
+            try:
+                self._parse_stats()
+            except:
+                raise th.TomboError(
+                    'Invalid statistics file provided. Try running ' +
+                    'tombo/scripts/convert_stats.py if this stats file ' +
+                    'was created before Tombo v1.3.1')
+        else:
+            self.open_for_writing = True
+            # set class attributes
+            self.stat_type = stat_type
+            self.region_size = region_size
+            self.curr_block_num = 0
+            self.cov_damp_counts = dict(zip(('unmod', 'mod'), cov_damp_counts))
+            self.cov_thresh = cov_thresh
+            self.num_most_signif = num_most_signif
+            self.most_signif_num_batches = most_signif_num_batches
+            # open file for writing
+            self._create_new_stats_file()

-    def filter_coverage(self, min_reads):
-        """
-        Filter statistics at locations with less than `min_reads` coverage.
-        """
-        self.stats = self.stats[np.logical_and(
-            self.stats['valid_cov'] >= min_reads,
-            np.logical_or(self.stat_type != SAMP_COMP_TXT,
-                          self.stats['control_cov'] >= min_reads))]
+        return

+    def _update_most_signif(self):
+        tmp_most_signif = np.concatenate(
+            [self.running_most_signif_sites,] + self.queued_stat_batches)
+        tmp_most_signif.sort(kind='mergesort', order=str('damp_frac'))
+        self.running_most_signif_sites = tmp_most_signif[:self.num_most_signif]
+        self.queued_stat_batches = []
         return

-    def is_empty(self):
-        """
-        Is the statistics table empty?
-        """
-        return self.stats.shape[0] == 0
+    def _add_to_most_signif(self, reg_stats, chrm, strand):
+        if chrm not in self.chrm_names:
+            self.chrm_names[chrm] = self.curr_chrm_id
+            self.curr_chrm_id += 1

-    def calc_damp_fraction(self, cov_damp_counts):
-        """
-        Compute dampened fraction of unmodified reads using provided
-        un-modified and modified pseudo-counts from cov_damp_counts
-        """
-        self.has_damp_frac = True
-        self.cov_damp_counts = cov_damp_counts
-        damp_frac = np.empty(self.stats.shape[0])
-        damp_frac[:] = np.nan
-        non_mod_counts = np.round(self.stats['frac'] * self.stats['valid_cov'])
-        # compute dampened fraction of modified reads by adding psuedo-counts
-        # to the modified and un-modified counts (equivalent to a beta prior
-        # on the fraction estimation as a binomial variable)
-        damp_frac = (non_mod_counts + cov_damp_counts[0]) / (
-            self.stats['valid_cov'] + sum(cov_damp_counts))
-        damp_name = 'damp_frac' if sys.version_info[0] > 2 else b'damp_frac'
-        self.stats = append_fields(self.stats, damp_name, damp_frac)
+        self.queued_stat_batches.append(append_fields(
+            base=reg_stats, names=(str('chrm'), str('strand')),
+            data=(list(repeat(self.chrm_names[chrm], reg_stats.shape[0])),
+                  list(repeat(strand, reg_stats.shape[0]))),
+            dtypes=('u4', 'S1')))
+        if len(self.queued_stat_batches) >= self.most_signif_num_batches:
+            self._update_most_signif()

         return

-    def order_by_frac(self, cov_damp_counts=None):
+    def _write_stat_block(self, reg_stats):
+        """Write region statistics block to file.
""" - Order statistics table via fraction of unmodified reads + try: + block_data = self.stat_blocks.create_group( + 'Block_' + unicode(self.curr_block_num)) + self.curr_block_num += 1 + except: + th.warning_message('Statistics file not opened for writing.') + return - If cov_damp_counts is provided or has been previously provided - fractions will be dampened accordingly - """ - if cov_damp_counts is None and not self.has_damp_frac: - self.stats.sort(order=str('frac')) - else: - self.calc_damp_fraction(cov_damp_counts) - self.stats.sort(order=str('damp_frac')) + block_data.attrs['chrm'] = reg_stats.chrm + block_data.attrs['strand'] = reg_stats.strand + block_data.attrs['start'] = reg_stats.start + + damp_frac = calc_damp_fraction( + self.cov_damp_counts, reg_stats.reg_frac_standard_base, + reg_stats.valid_cov) + reg_stats_arr = np.array( + [pos_stats for pos_stats in zip( + damp_frac, reg_stats.reg_frac_standard_base, + reg_stats.reg_poss, reg_stats.reg_cov, + reg_stats.ctrl_cov, reg_stats.valid_cov) + if not np.isnan(pos_stats[0])], + dtype=[ + (str('damp_frac'), 'f8'), (str('frac'), 'f8'), + (str('pos'), 'u4'), (str('cov'), 'u4'), + (str('control_cov'), 'u4'), (str('valid_cov'), 'u4')]) + block_data.create_dataset( + 'block_stats', data=reg_stats_arr, compression="gzip") + + self._add_to_most_signif(reg_stats_arr, reg_stats.chrm, reg_stats.strand) + + #self._fp.flush() + self.is_empty = False return - def order_by_pos(self): - """ - Order statistics table by chrmomosome, then strand and then position + def _close_write(self): + # process any remaining batches + if len(self.queued_stat_batches) >= 1: + self._update_most_signif() + # trim the array if necessary + if np.isnan(self.running_most_signif_sites['damp_frac'][-1]): + # not as many signif sites were stored as requested so trim array + first_nan = np.where(np.isnan( + self.running_most_signif_sites['damp_frac']))[0][0] + self.running_most_signif_sites = self.running_most_signif_sites[ + :first_nan,] + # add dataset to file + self.most_signif_sites.create_dataset( + MOST_SIGNIF_H5_NAME, data=self.running_most_signif_sites, + compression="gzip") + # and add chrm ids map to file (store in reverse order of useful dict, + # since int's can't be hdf5 keys + for chrm_name, chrm_id in self.chrm_names.items(): + self.chrm_id_grp.attrs[chrm_name] = chrm_id + + return + + def close(self): + """Close open HDF5 file and write most significant sites if open for writing """ - self.stats.sort(order=['chrm', 'strand', 'pos']) + if self.open_for_writing: + self._close_write() + self._fp.close() return + + # Reading functions def _get_chrm_name(self, pos_stat): - if self.has_chrm_lookup: - return self.chrms_lookup[pos_stat['chrm']] - return pos_stat['chrm'].decode() + return self.most_signif_chrm_map[pos_stat['chrm']] def iter_stat_seqs(self, genome_index, before_bases, after_bases, - include_pos=False): - """ - Iterate through statistics table in current order returning the genomic - sequence surrounding each position. - - `include_position` option will yeild (pos_seq, chrm, strand, start, end) - for each record. + include_pos=True): + """Iterate through most significant genomic sites returning the genomic sequence surrounding each position. 
+
+        Args:
+            genome_index (:class:`tombo.tombo_helper.Fasta`): genome index object
+            before_bases (int): number of sequence bases before positions to include
+            after_bases (int): number of sequence bases after positions to include
+            include_pos (bool): yield (pos_seq, chrm, strand, start, end) for each site (default: True)
         """
-        for pos_stat in self.stats:
+        for pos_stat in self.most_signif_stats:
             chrm, strand, pos = (self._get_chrm_name(pos_stat),
                                  pos_stat['strand'].decode(),
                                  pos_stat['pos'])
@@ -1764,30 +2337,33 @@ def iter_stat_seqs(self, genome_index, before_bases, after_bases,

         return

-    def iter_fracs(self):
-        """
-        Iterate through statistics table yeilding
-        (chrm, strand, pos, frac, damp_frac).
+    def iter_most_signif_sites(self):
+        """Iterate through statistics table yielding (chrm, strand, pos, frac, damp_frac, valid_cov).
         """
-        for pos_stat in self.stats:
+        for pos_stat in self.most_signif_stats:
             yield (
                 self._get_chrm_name(pos_stat), pos_stat['strand'].decode(),
-                pos_stat['pos'], pos_stat['frac'],
-                pos_stat['damp_frac'] if self.has_damp_frac else None,
+                pos_stat['pos'], pos_stat['frac'], pos_stat['damp_frac'],
                 pos_stat['valid_cov'])

         return

     def get_most_signif_regions(self, num_bases, num_regions, unique_pos=True,
-                                cov_damp_counts=None):
-        """
-        Select regions centered on locations with the largest fraction
-        of modified bases
+                                prepend_loc_to_text=False):
+        """Select regions centered on locations with the largest fraction of modified bases
+
+        Args:
+            num_bases (int): number of bases to output
+            num_regions (int): number of regions to output
+            unique_pos (bool): return only unique positions (optional; default: True); intervals may overlap, but each identified significant position falls outside all other returned intervals
+            prepend_loc_to_text (bool): prepend the most significant location to the region text (may be turned off for intervals near the start/end of sequence records)
+
+        Returns:
+            A list of :class:`tombo.tombo_helper.intervalData` objects
         """
-        self.order_by_frac(cov_damp_counts)
         selected_regs = []
         used_intervals = defaultdict(set)
-        for i, pos_stat in enumerate(self.stats):
+        for i, pos_stat in enumerate(self.most_signif_stats):
             int_start = max(0, pos_stat['pos'] - int(num_bases / 2.0))
             chrm = self._get_chrm_name(pos_stat)
             strand = pos_stat['strand'].decode()
@@ -1795,71 +2371,94 @@ def get_most_signif_regions(num_bases, num_regions, unique_pos=True,
                 pos_stat['pos'] not in used_intervals[(chrm, strand)]):
                 used_intervals[(chrm, strand)].update(
                     range(int_start, int_start + num_bases))
-                int_text = 'Est. Frac. Alternate: {0:.2g} Coverage: {1}'.format(
-                    1 - pos_stat[str('damp_frac')], pos_stat[str('valid_cov')]) \
-                    if self.has_damp_frac else \
-                    'Frac. Alternate: {0:.2g} Coverage: {1}'.format(
-                        1 - pos_stat[str('frac')], pos_stat[str('valid_cov')])
+                int_text = 'Est. Frac. Alternate: {0:.2g}'.format(
+                    1 - pos_stat[str('damp_frac')])
+                if prepend_loc_to_text:
+                    int_text = '{0}:{1:d}:{2}'.format(
+                        chrm, pos_stat['pos'] + 1, strand) + " " + int_text
                 selected_regs.append(th.intervalData(
-                    '{:03d}'.format(i), chrm, int_start,
-                    int_start + num_bases, strand, int_text))
+                    chrm=chrm, start=int_start, end=int_start + num_bases,
+                    strand=strand, reg_id='{:03d}'.format(i), reg_text=int_text))
                 if len(selected_regs) >= num_regions:
                     break

         if len(selected_regs) == 0:
-            th._error_message_and_exit(
+            th.error_message_and_exit(
                 'No locations identified.
Most likely an empty statistics file.') if len(selected_regs) < num_regions: - th._warning_message( + th.warning_message( 'Fewer unique significant locations more than [--num-bases]/2 ' + 'apart were identified. Continuing with ' + - str(len(selected_regs)) + ' unique locations.') + str(len(selected_regs)) + ' unique locations. Must raise ' + + '--num-most-significant-stored in order to see more most ' + + 'significant stats.') return selected_regs - def create_stat_dict(self, dict_batch_size=10000): + def compute_motif_stats( + self, motif_descs, genome_index, + stats_per_block=None, total_stats_limit=None): + """Compute lists of statistic values and whether this site represents a match to the provided motifs + + Args: + motif_descs (list; see :class:`tombo.tombo_helper.parse_motif_descs`): containing tuples with :class:`tombo.tombo_helper.TomboMotif` and motif/modification names + genome_index (:class:`tombo.tombo_helper.Fasta`): genome index + stats_per_block (int): statistics to include in calculations per-block (`--multiprocess-region-size`) + total_stats_limit (int): maximum total statistics to include in computation (Default: include all stats) + + Returns: + Dictionary with (key) motif/modification name and (value) list of tuples containing statistic value and boolean motif match """ - Create random access to fraction modified values by position + return _compute_motif_stats( + self, motif_descs, genome_index, 'damp_frac', + stats_per_block=stats_per_block, total_stats_limit=total_stats_limit) - Fraction will be dampened if cov_damp_counts was previously provided - Access dictionary will be stored in the stat_dict slot + def __iter__(self): + """Iterator over all statistics blocks, yeilding chrm, strand, start, end, block_stats """ - self.has_stat_dict = True - self.dict_batch_size = dict_batch_size - s_stats = np.sort(self.stats, order=['chrm', 'strand', 'pos']) - self.stat_dict = {} - # split at chromosome/strand switches - for cs_stats in np.split(s_stats, np.where(np.logical_or( - s_stats['strand'][:-1] != s_stats['strand'][1:], - np.diff(s_stats['chrm']) != 0))[0] + 1): - for batch_stats in np.split(cs_stats, np.where(np.diff( - np.floor_divide( - cs_stats['pos'], dict_batch_size)) != 0)[0] + 1): - batch_fracs = 1 - ( - batch_stats[str('damp_frac')] if self.has_damp_frac else - batch_stats[str('frac')]) - self.stat_dict[( - self._get_chrm_name(batch_stats[0]), - batch_stats[0]['strand'].decode(), - np.floor_divide(batch_stats[0]['pos'], - dict_batch_size))] = ( - batch_fracs, batch_stats['pos']) + self.iter_all_cs = iter(sorted(self.blocks_index)) + self.iter_curr_cs = next(self.iter_all_cs) + self.iter_curr_cs_blocks = iter( + self.blocks_index[self.iter_curr_cs].items()) + return self - return + def __next__(self): + try: + next_start, next_block_name = next(self.iter_curr_cs_blocks) + except StopIteration: + # move to next chromosome and strand + # this will raise a second StopIteration + # when the end of the blocks is hit + self.iter_curr_cs = next(self.iter_all_cs) + self.iter_curr_cs_blocks = iter(sorted( + self.blocks_index[self.iter_curr_cs].items())) + next_start, next_block_name = next(self.iter_curr_cs_blocks) - def get_pos_frac(self, chrm, strand, pos, missing_value=None): + chrm, strand = self.iter_curr_cs + return (chrm, strand, next_start, next_start + self.region_size, + self.stat_blocks[next_block_name]['block_stats'][:]) + + # for python2 compatibility + def next(self): + """Return next statistics block from file including (chrm, strand, block start, block 
end and statistics table ``numpy structured array``) """ - Obtain statistic value from the requested genomic position + return self.__next__() + + def get_pos_frac(self, chrm, strand, pos, missing_value=None): + """Extract statistic value from the requested genomic position. """ - # TODO: Add a get_reg_pos function and only get the reg values + # TODO: Add a get_reg_fracs and only get the reg values # once. Just need to handle edge of batch cases - if not self.has_stat_dict: - self.create_stat_dict() try: - reg_fracs, reg_poss = self.stat_dict[( - chrm, strand, np.floor_divide(pos, self.dict_batch_size))] - pos_index = np.where(reg_poss == pos)[0] + pos_block_start = np.floor_divide( + pos, self.region_size) * self.region_size + # TODO: blocks may have missing data (consider full sized blocks + # for random disk access to single or range of elements) + #block_pos = np.remainder(pos, self.region_size) + block_name = self.blocks_index[(chrm, strand)][pos_block_start] + block_data = self.stat_blocks[block_name]['block_stats'][:] + pos_index = np.where(block_data['pos'] == pos)[0] if len(pos_index) != 1: raise KeyError - pos_frac = reg_fracs[pos_index[0]] + pos_frac = 1 - block_data['damp_frac'][pos_index[0]] except KeyError: pos_frac = missing_value @@ -1867,71 +2466,96 @@ def get_pos_frac(self, chrm, strand, pos, missing_value=None): class PerReadStats(object): + """Store and accses per-read modified base testing statistics + + .. automethod:: __init__ + """ + # TODO add attributes + def _parse_per_read_stats(self): + self._fp = h5py.File(self.per_read_stats_fn, 'r') + self.stat_type = self._fp.attrs.get('stat_type') + self.region_size = self._fp.attrs.get('block_size') + self.per_read_blocks = self._fp[STAT_BLOCKS_H5_NAME] + self.num_blocks = 0 + blocks_index = defaultdict(dict) + for block_name, block_data in self.per_read_blocks.items(): + blocks_index[ + (block_data.attrs.get('chrm'), block_data.attrs.get('strand'))][ + block_data.attrs.get('start')] = block_name + self.num_blocks += 1 + self.blocks_index = dict(blocks_index) + + return + + def _create_new_per_read_stats_file(self): + # try to remove file for overwriting old results + try: + os.remove(self.per_read_stats_fn) + except: + pass + # open file for writing + self._fp = h5py.File(self.per_read_stats_fn, 'w') + + # save attributes to file and open stats blocks group + self.curr_block_num = 0 + self._fp.attrs['stat_type'] = self.stat_type + self._fp.attrs['block_size'] = self.region_size + self.per_read_blocks = self._fp.create_group(STAT_BLOCKS_H5_NAME) + + return + def __init__(self, per_read_stats_fn, stat_type=None, region_size=None): - """ - Open per-read statistics file. If stat_type and region_size are provided - the file is opened for writing, else it is opened for random access. + """Open per-read statistics file. + + Examples:: - WARNING: If stat_type and region_size are provided the current file's - contents will be deleted. 
+ per_read_stats = tombo_stats.PerReadStats('path/to/sample.tombo.per_read_stats') + int_data = tombo_helper.intervalData( + chrm='chr20', start=10000, end=10100, strand='+') + reg_per_read_stats = per_read_stats.get_region_per_read_stats( + int_data, num_reads=10) + + Args: + + per_read_stats_fn (str): filename containing (or to write) per-read Tombo statistics + stat_type (str): type of statistic (model_compare, de_novo, or sample_compare); only applicable for new file writing + region_size (int): size of chunked storage blocks; only applicable for new file writing + + Warning: + + If ``stat_type`` and ``region_size`` are provided the current file's contents will be deleted. + + Intended to open a fresh ``PerReadStats`` file for writing. """ + self.per_read_stats_fn = per_read_stats_fn if stat_type is None or region_size is None: # open file for reading try: - self._fp = h5py.File(per_read_stats_fn, 'r') - self.stat_type = self._fp.attrs['stat_type'] - self.region_size = self._fp.attrs['block_size'] - self.per_read_blocks = self._fp['Statistic_Blocks'] - blocks_index = defaultdict(list) - for block_name, block_data in self.per_read_blocks.items(): - blocks_index[ - (block_data.attrs['chrm'], - block_data.attrs['strand'])].append(( - block_data.attrs['start'], - block_data.attrs['start'] + self.region_size, - block_name)) - self.blocks_index = dict(blocks_index) - self.iter_all_cs = iter(list(self.blocks_index)) - self.iter_curr_cs = next(self.iter_all_cs) - self.iter_curr_cs_blocks = iter( - self.blocks_index[self.iter_curr_cs]) + self._parse_per_read_stats() except: - th._error_message_and_exit( + th.error_message_and_exit( 'Non-existent or invalid per-read statistics file provided.') else: # set class attributes self.stat_type = stat_type self.region_size = region_size - self.curr_block_num = 0 - # try to remove file for overwriting old results - try: - os.remove(per_read_stats_fn) - except: - pass - # open file for writing - self._fp = h5py.File(per_read_stats_fn, 'w') - - # save attributes to file and open stats blocks group - self._fp.attrs['stat_type'] = stat_type - self._fp.attrs['block_size'] = region_size - self.per_read_blocks = self._fp.create_group('Statistic_Blocks') + self._create_new_per_read_stats_file() self.are_pvals = self.stat_type != ALT_MODEL_TXT return - def write_per_read_block( + def _write_per_read_block( self, per_read_block, read_id_lookup, chrm, strand, start): - """ - Write region statistics block to file. + """Write region statistics block to file. """ try: block_data = self.per_read_blocks.create_group( 'Block_' + unicode(self.curr_block_num)) self.curr_block_num += 1 except: - th._warning_message( + th.warning_message( 'Per-read statistics file not opened for writing.') return @@ -1951,9 +2575,15 @@ def write_per_read_block( return def get_region_per_read_stats(self, interval_data, num_reads=None): - """ - Get per-read statistics from the specifed interval for a random selection - of num_reads. + """Extract per-read statistics over the specifed interval. 
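+
+        Example (an illustrative sketch, continuing from the class
+        example above; position 10050 is a placeholder within the
+        requested interval)::
+
+            reg_stats = per_read_stats.get_region_per_read_stats(
+                int_data, num_reads=10)
+            # structured array with 'pos', 'stat' and 'read_id' fields
+            pos_read_stats = reg_stats['stat'][reg_stats['pos'] == 10050]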
+ + Args: + + interval_data (:class:`tombo.tombo_helper.intervalData`): genomic interval + num_reads (int): randomly select this many reads (default: inlcude all reads) + + Returns: + `np.array` structured array containing ``pos``, ``stat`` and ``read_id`` for per-read stats over requested interval """ try: cs_blocks = self.blocks_index[( @@ -1962,18 +2592,17 @@ def get_region_per_read_stats(self, interval_data, num_reads=None): return int_block_stats = [] - for block_data in cs_blocks: - if (interval_data.end < block_data[0] or - interval_data.start > block_data[1]): continue - block_stats = self.per_read_blocks[ - block_data[2]]['block_stats'].value + for block_start, block_name in cs_blocks.items(): + if (interval_data.end < block_start or + interval_data.start > block_start + self.region_size): continue + # extract stats from FAST5 + block_stats = self.per_read_blocks[block_name]['block_stats'][:] reg_poss = block_stats['pos'] reg_read_stats = block_stats['stat'] - # convert read_ids back into strings + # extract and convert read_ids back into strings block_read_id_lookup = dict([ (read_id_val, read_id) for read_id, read_id_val in - self.per_read_blocks[block_data[2]][ - 'read_ids'].attrs.items()]) + self.per_read_blocks[block_name]['read_ids'].attrs.items()]) reg_read_ids = [ block_read_id_lookup[r_id] for r_id in block_stats['read_id']] int_block_stats.append(np.array( @@ -2002,6 +2631,24 @@ def get_region_per_read_stats(self, interval_data, num_reads=None): return all_int_stats + def compute_motif_stats( + self, motif_descs, genome_index, + stats_per_block=None, total_stats_limit=None): + """Compute lists of statistic values and whether this site represents a match to the provided motifs + + Args: + motif_descs (list; see :class:`tombo.tombo_helper.parse_motif_descs`): containing tuples with :class:`tombo.tombo_helper.TomboMotif` and motif/modification names + genome_index (:class:`tombo.tombo_helper.Fasta`): genome index + stats_per_block (int): statistics to include in calculations per-block (`--multiprocess-region-size`) + total_stats_limit (int): maximum total statistics to include in computation (Default: include all stats) + + Returns: + Dictionary with (key) motif/modification name and (value) list of tuples containing statistic value and boolean motif match + """ + return _compute_motif_stats( + self, motif_descs, genome_index, 'stat', + stats_per_block=stats_per_block, total_stats_limit=total_stats_limit) + def __iter__(self): """ Iterator over all statistics blocks, yeilding chrm, strand, @@ -2010,29 +2657,34 @@ def __iter__(self): self.iter_all_cs = iter(list(self.blocks_index)) self.iter_curr_cs = next(self.iter_all_cs) self.iter_curr_cs_blocks = iter( - self.blocks_index[self.iter_curr_cs]) + self.blocks_index[self.iter_curr_cs].items()) return self def __next__(self): try: - next_start, next_end, next_block_name = next( - self.iter_curr_cs_blocks) + next_start, next_block_name = next(self.iter_curr_cs_blocks) except StopIteration: # move to next chromosome and strand # this will raise a second StopIteration # when the end of the blocks is hit self.iter_curr_cs = next(self.iter_all_cs) - self.iter_curr_cs_blocks = iter( - self.blocks_index[self.iter_curr_cs]) - next_start, next_end, next_block_name = next( - self.iter_curr_cs_blocks) + self.iter_curr_cs_blocks = iter(sorted( + self.blocks_index[self.iter_curr_cs].items())) + next_start, next_block_name = next(self.iter_curr_cs_blocks) chrm, strand = self.iter_curr_cs - next_block_stats = self.per_read_blocks[ - 
next_block_name]['block_stats'].value - return chrm, strand, next_start, next_end, next_block_stats + return (chrm, strand, next_start, next_start + self.region_size, + self.per_read_blocks[next_block_name]['block_stats'][:]) + + # for python2 compatibility + def next(self): + """Return next per-read statistics block from file including (chrm, strand, block start, block end and per-read statistics table ``numpy structured array``) + """ + return self.__next__() def close(self): + """Close HDF5 file + """ self._fp.close() return @@ -2041,99 +2693,130 @@ def close(self): ##### Base-by-base Testing ##### ################################ -def apply_per_read_thresh( - pr_stats_fn, single_read_thresh, min_test_vals, lower_thresh): - if VERBOSE: th._status_message( - 'Loading and aggregating per-read statistics.') - all_reg_stats = [] - pr_stats = PerReadStats(pr_stats_fn) - for chrm, strand, start, end, block_stats in pr_stats: - block_stats.sort(order=str('pos')) - reg_base_stats = np.split( - block_stats['stat'], np.where(np.concatenate( - [[0,], np.diff(block_stats['pos'])]) > 0)[0]) - reg_poss = np.unique(block_stats['pos']) - - reg_cov = [base_stats.shape[0] for base_stats in reg_base_stats] - if lower_thresh is not None: - # filter base statistics that fall between the upper and lower - # stat threshold for the log likelihood statistic - reg_base_stats = [ - base_stats[np.logical_or( - base_stats <= lower_thresh, - base_stats >= single_read_thresh)] - for base_stats in reg_base_stats] - valid_cov = [base_stats.shape[0] for base_stats in reg_base_stats] - elif pr_stats.stat_type == ALT_MODEL_TXT: - # filter base statistics that fall between the upper and lower - # stat threshold for the log likelihood statistic - reg_base_stats = [ - base_stats[np.abs(base_stats) >= single_read_thresh] - for base_stats in reg_base_stats] - valid_cov = [base_stats.shape[0] for base_stats in reg_base_stats] +def compute_posterior_samp_dists( + ctrl_means, ctrl_sds, ctrl_cov, ctrl_reg_data, std_ref, + prior_weights, min_test_reads, fm_offset): + dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 + gnm_begin_lag = ( + std_ref.central_pos if ctrl_reg_data.strand == '+' else dnstrm_bases) + gnm_end_lag = ( + dnstrm_bases if ctrl_reg_data.strand == '+' else std_ref.central_pos) + reg_seq = ctrl_reg_data.copy().update( + start=ctrl_reg_data.start - gnm_begin_lag - fm_offset, + end=ctrl_reg_data.end + gnm_end_lag + fm_offset).add_seq().seq + if ctrl_reg_data.strand == '-': + reg_seq = th.rev_comp(reg_seq) + + reg_ref_means, reg_ref_sds = get_ref_from_seq_with_gaps( + reg_seq, std_ref, ctrl_reg_data.strand == '-') + + # compute vectorized weighted means for new mean and sd estimates + post_ref_means = (( + (prior_weights[0] * reg_ref_means) + (ctrl_cov * ctrl_means)) / + (prior_weights[0] + ctrl_cov)) + post_ref_sds = (( + (prior_weights[1] * reg_ref_sds) + (ctrl_cov * ctrl_sds)) / + (prior_weights[1] + ctrl_cov)) + + # This bit should work, but the SD estimates seem to be incorrect + # and the computation is likely far too much for likely very similar + # results from a weighted average with the prior SD. 
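+    # (Illustrative check of the weighted averages above, which act like a
+    # conjugate normal prior: with prior weight 5, an expected level of
+    # 100.0 and 20 control reads averaging 98.0, the posterior mean is
+    # (5 * 100.0 + 20 * 98.0) / (5 + 20) = 98.4, so observed coverage
+    # increasingly dominates the prior; these numbers are hypothetical.)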
+    """
+    # compute posterior sds; see example formula here:
+    # https://www.statlect.com/fundamentals-of-statistics/\
+    # normal-distribution-Bayesian-estimation
+    # optimizations to matrix ops applied here
+    def compute_mean_diff_factor(event_means, ref_mean):
+        valid_indices = ~np.isnan(event_means)
+        num_valid_indices = sum(valid_indices)
+        if num_valid_indices < min_test_reads:
+            return np.NAN
+        n = float(num_valid_indices + prior_weights[0])
+        c1 = (n - 1) / n
+        c2 = -1 / n
+        mean_diffs = event_means[valid_indices] - ref_mean
+        mds_sum = mean_diffs.sum()
+        return sum(((i_md * c1) + (mds_sum - i_md) * c2) * i_md
+                   for i, i_md in enumerate(mean_diffs))
+    mean_diff_factors = np.array([
+        compute_mean_diff_factor(b_events, ref_mean)
+        for b_events, ref_mean in zip(ctrl_base_events, reg_ref_means)])
+    post_ref_sds = np.sqrt((
+        mean_diff_factors + (prior_weights[1] * np.square(reg_ref_sds))) / (
+            ctrl_cov + prior_weights[1]))
+    """
+
+    return post_ref_means, post_ref_sds
+
+def get_reads_ref(
+        ctrl_reg_data, min_test_reads, fm_offset, std_ref=None,
+        prior_weights=None, est_mean=False):
+    """Get mean and standard deviation of levels from a sample across the genome
+    """
+    # expand region to include fm_offset
+    ctrl_base_events = ctrl_reg_data.copy().update(
+        start=ctrl_reg_data.start - fm_offset,
+        end=ctrl_reg_data.end + fm_offset).get_base_levels()
+    # taking means over all-NaN positions raises warnings, so suppress them here
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=RuntimeWarning)
+        if est_mean:
+            ctrl_means = np.apply_along_axis(np.nanmean, 1, ctrl_base_events)
         else:
-            valid_cov = reg_cov
+            ctrl_means = np.apply_along_axis(np.nanmedian, 1, ctrl_base_events)
+        ctrl_sds = np.apply_along_axis(
+            lambda x: max(np.nanstd(x), MIN_POSITION_SD), 1,
+            ctrl_base_events)
+        ctrl_cov = np.apply_along_axis(
+            lambda x: sum(~np.isnan(x)), 1, ctrl_base_events)
+    # set means and sds with cov below min_test_reads to NAN
+    ctrl_means[ctrl_cov < min_test_reads] = np.NAN
+    ctrl_sds[ctrl_cov < min_test_reads] = np.NAN
+
+    if std_ref is not None:
+        if prior_weights is None:
+            prior_weights = (MEAN_PRIOR_CONST, SD_PRIOR_CONST)
+        ctrl_means, ctrl_sds = compute_posterior_samp_dists(
+            ctrl_means, ctrl_sds, ctrl_cov, ctrl_reg_data, std_ref,
+            prior_weights, min_test_reads, fm_offset)
+
+    # convert coverage to a dict for later lookup
+    ctrl_cov = dict(zip(range(ctrl_reg_data.start - fm_offset,
+                              ctrl_reg_data.end + fm_offset), ctrl_cov))

-        ctrl_cov = repeat(0)
+    return ctrl_means, ctrl_sds, ctrl_cov

-        reg_frac_standard_base = np.array([
-            np.greater_equal(
-                base_stats, single_read_thresh).sum() / base_stats.shape[0]
-            if base_stats.shape[0] > 0 else np.NAN
-            for base_stats in reg_base_stats])
+def compute_sample_compare_read_stats(
+        r_data, ctrl_means, ctrl_sds, fm_offset=FM_OFFSET_DEFAULT,
+        reg_data=None):
+    """Compute significance statistics using the comparison of two sequencing samples method for a single read within a specified genomic region.
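+
+    Example (a minimal sketch; ``ctrl_means`` and ``ctrl_sds`` as returned
+    by ``get_reads_ref`` over the same interval as ``reg_data``, and
+    ``r_data`` a re-squiggled :class:`tombo.tombo_helper.readData`)::
+
+        r_pvals, r_poss, read_id = compute_sample_compare_read_stats(
+            r_data, ctrl_means, ctrl_sds, fm_offset=1, reg_data=reg_data)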
- reg_stats = (reg_frac_standard_base, reg_poss, chrm, strand, - reg_cov, ctrl_cov, valid_cov) - all_reg_stats.append(reg_stats) + Args: - if len(all_reg_stats) == 0: - th._error_message_and_exit( - 'No genomic positions contain --minimum-test-reads.') + r_data (:class:`tombo.tombo_helper.readData`): read data + ctrl_means (`np.array::np.float64`): mean level values from control set of reads + ctrl_sds (`np.array::np.float64`): level SD values from control set of reads + fm_offset (int): Fisher's Method offset for computing locally combined p-values (optional; default: 1) + reg_data (:class:`tombo.tombo_helper.intervalData`): region to test (default: test whole read) - return all_reg_stats, pr_stats.stat_type + Returns: + Read testing results, positions tested and the read_id -def get_reads_ref(ctrl_reg_reads, reg_start, region_size, - min_test_vals, fm_offset): + 1) r_pvals (`np.array::np.float64`): p-values for testing over specified region + 2) r_poss (`np.array::np.int64`): genomic positions for returned p-values + 3) read_id (str): read identifier """ - Get mean and standard deviation of levels from a sample across the genome - """ - ctrl_base_events = th.get_reads_events(ctrl_reg_reads) - if ctrl_base_events is None: - raise NotImplementedError - arr_size = region_size + (fm_offset * 2) - ctrl_means, ctrl_sds = np.empty(arr_size), np.empty(arr_size) - ctrl_means[:] = np.NAN - ctrl_sds[:] = np.NAN - ctrl_cov = {} - for pos, pos_events in sorted(ctrl_base_events.items()): - # if position is past the end of the region return - if pos - fm_offset >= reg_start + region_size: - break - if pos + fm_offset < reg_start: - continue - ctrl_cov[pos] = len(pos_events) - if ctrl_cov[pos] < min_test_vals: - continue - ctrl_mean, ctrl_sd = c_mean_std(pos_events) - ctrl_sd = max(ctrl_sd, MIN_POSITION_SD) - ctrl_means[pos - reg_start + fm_offset] = ctrl_mean - ctrl_sds[pos - reg_start + fm_offset] = ctrl_sd - - return ctrl_means, ctrl_sds, ctrl_cov + reg_start = reg_data.start if reg_data is not None else r_data.start + reg_size = (reg_data.end - reg_data.start if reg_data is not None + else r_data.end - r_data.start) -def compute_sample_compare_read_stats( - r_data, ctrl_means, ctrl_sds, fm_offset, reg_start, region_size): - """ - Compute signficance statistics using comparison of two sequenceing samples - method for a single read within a specified genomic region. 
-    """
     def comp_clip_and_flip():
         with h5py.File(r_data.fn, 'r') as fast5_data:
             r_means = th.get_single_slot_read_centric(
                 fast5_data, 'norm_mean', r_data.corr_group)
-            read_id = th.get_raw_read_slot(fast5_data).attrs['read_id']
+            read_id = th.get_raw_read_slot(fast5_data).attrs.get('read_id')
         if r_means is None:
-            raise NotImplementedError(
+            raise th.TomboError(
                 'Read does not contain re-squiggled level means.')

         read_start, read_end = r_data.start, r_data.end
@@ -2144,9 +2827,9 @@ def comp_clip_and_flip():
                 r_means = r_means[num_start_clip:]
             else:
                 r_means = r_means[:-num_start_clip]
-        if read_end - fm_offset > reg_start + region_size:
-            num_end_clip = (read_end - fm_offset) - (reg_start + region_size)
-            read_end = reg_start + region_size + fm_offset
+        if read_end - fm_offset > reg_start + reg_size:
+            num_end_clip = (read_end - fm_offset) - (reg_start + reg_size)
+            read_end = reg_start + reg_size + fm_offset
             if r_data.strand == '+':
                 r_means = r_means[:-num_end_clip]
             else:
@@ -2181,7 +2864,7 @@ def get_pvals(r_z_scores):

     r_z_scores = get_read_comp_z_score(r_means, read_start, read_end)
     if np.sum(np.logical_not(np.isnan(r_z_scores))) == 0:
-        raise NotImplementedError('No valid z-scores in read.')
+        raise th.TomboError('No valid z-scores in read.')
     r_pvals, r_poss = get_pvals(r_z_scores)
     if fm_offset > 0:
         r_pvals = calc_window_fishers_method(r_pvals, fm_offset)
@@ -2195,20 +2878,44 @@ def get_pvals(r_z_scores):

     return r_pvals, r_poss, read_id

 def compute_de_novo_read_stats(
-        r_data, gnm_begin_lag, gnm_end_lag, fm_offset, reg_start,
-        region_size, std_ref):
-    """
-    Compute signficance statistics using de novo comparison to a canonical model
-    method for a single read within a specified genomic region.
-    """
+        r_data, std_ref, fm_offset=FM_OFFSET_DEFAULT, reg_data=None,
+        gnm_begin_lag=None, gnm_end_lag=None):
+    """Compute significance statistics using the de novo comparison to a canonical model method for a single read within a specified genomic region.
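+
+    Example (a minimal sketch; ``std_ref`` is a loaded canonical
+    :class:`tombo.tombo_stats.TomboModel` and ``r_data`` a re-squiggled
+    :class:`tombo.tombo_helper.readData`)::
+
+        r_stats, r_poss, read_id = compute_de_novo_read_stats(r_data, std_ref)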
+ + Args: + + r_data (:class:`tombo.tombo_helper.readData`): read data + std_ref (:class:`tombo.tombo_stats.TomboModel`): canonical expected signal level model + fm_offset (int): Fisher's Method offset for computing locally combined p-values (optional; default: 1) + reg_data (:class:`tombo.tombo_helper.intervalData`): region to test (default: test whole read) + gnm_begin_lag (int): upstream genomic overhang required for k-mer lookup (optional; default compute from read strand and `std_ref`) + gnm_end_lag (int): downstream genomic overhang required for k-mer lookup (optional; default compute from read strand and `std_ref`) + + Returns: + Read testing results, positions tested and the read_id + + 1) r_pvals (`np.array::np.float64`): p-values for testing over specified region + 2) r_poss (`np.array::np.int64`): genomic positions for returned p-values + 3) read_id (str): read identifier + """ + reg_start = reg_data.start if reg_data is not None else r_data.start + reg_size = (reg_data.end - reg_data.start if reg_data is not None + else r_data.end - r_data.start) + if gnm_begin_lag is None or gnm_end_lag is None: + dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 + gnm_begin_lag = (std_ref.central_pos if r_data.strand == '+' else + dnstrm_bases) + gnm_end_lag = (dnstrm_bases if r_data.strand == '+' else + std_ref.central_pos) + def de_novo_clip_and_flip(): with h5py.File(r_data.fn, 'r') as fast5_data: r_means, r_seq = th.get_multiple_slots_read_centric( fast5_data, ['norm_mean', 'base'], r_data.corr_group) - read_id = th.get_raw_read_slot(fast5_data).attrs['read_id'] + read_id = th.get_raw_read_slot(fast5_data).attrs.get('read_id') if r_means is None or r_seq is None: - raise NotImplementedError( + raise th.TomboError( 'Read does not contain valid re-squiggled data.') r_seq = b''.join(r_seq).decode() @@ -2216,7 +2923,8 @@ def de_novo_clip_and_flip(): # clip read if it extends outside the current genomic region, so # stats are only computed within this region if read_start + gnm_begin_lag + fm_offset < reg_start: - num_start_clip = reg_start - (read_start + gnm_begin_lag + fm_offset) + num_start_clip = reg_start - ( + read_start + gnm_begin_lag + fm_offset) read_start = reg_start - gnm_begin_lag - fm_offset if r_data.strand == '+': r_means = r_means[num_start_clip:] @@ -2224,10 +2932,10 @@ def de_novo_clip_and_flip(): else: r_means = r_means[:-num_start_clip] r_seq = r_seq[:-num_start_clip] - if read_end - gnm_end_lag - fm_offset > reg_start + region_size: + if read_end - gnm_end_lag - fm_offset > reg_start + reg_size: num_end_clip = (read_end - gnm_end_lag - fm_offset) - ( - reg_start + region_size) - read_end = reg_start + region_size + gnm_end_lag + fm_offset + reg_start + reg_size) + read_end = reg_start + reg_size + gnm_end_lag + fm_offset if r_data.strand == '+': r_means = r_means[:-num_end_clip] r_seq = r_seq[:-num_end_clip] @@ -2238,7 +2946,7 @@ def de_novo_clip_and_flip(): # if this read does not cover enough of this region for stat # computation raise an error to be handled below if len(r_seq) < std_ref.kmer_width: - raise NotImplementedError( + raise th.TomboError( 'Read does not contain information in this region.') r_ref_means, r_ref_sds, _, _ = get_ref_from_seq( @@ -2275,11 +2983,7 @@ def de_novo_clip_and_flip(): def calc_llh_ratio(reg_means, reg_ref_means, reg_ref_vars, reg_alt_means, reg_alt_vars): - """ - Compute log likelihood ratio - - This is about 10X slower than the cython version in tombo.c_helper, but - has been kept for debugging purposes + """Compute log 
likelihood ratio. This is about 10X slower than the cython version in tombo._c_helper, but has been kept for debugging purposes. """ # compute log likelihood ratio # positive value means standard base fits data better @@ -2290,40 +2994,63 @@ def calc_llh_ratio(reg_means, reg_ref_means, reg_ref_vars, np.sum(np.log(reg_ref_vars))) def compute_alt_model_read_stats( - r_data, gnm_begin_lag, gnm_end_lag, reg_start, region_size, - std_ref, alt_ref, use_standard_llhr): - """ - Compute signficance statistics using comparison of read signal to canonical - and alternative models method for a single read within a specified genomic - region. - """ - motif_width = gnm_begin_lag + gnm_end_lag + 1 + r_data, std_ref, alt_ref, use_standard_llhr=False, reg_data=None, + gnm_begin_lag=None, gnm_end_lag=None): + """Compute signficance statistics using comparison of read signal to canonical and alternative models method for a single read within a specified genomic region. + + Args: + r_data (:class:`tombo.tombo_helper.readData`): read data + std_ref (:class:`tombo.tombo_stats.TomboModel`): canonical expected signal level model + alt_ref (:class:`tombo.tombo_stats.TomboModel`): alternative expected signal level model + use_standard_llhr (bool): compute standard likelihood ratio; for details see https://nanoporetech.github.io/tombo/modified_base_detection.html#alternative-model-method (optional; default: False) + reg_data (:class:`tombo.tombo_helper.intervalData`): region to test (default: test whole read) + gnm_begin_lag (int): upstream genomic overhang required for k-mer lookup (optional; default compute from read strand and `std_ref`) + gnm_end_lag (int): downstream genomic overhang required for k-mer lookup (optional; default compute from read strand and `std_ref`) + + Returns: + Read testing results, positions tested and the read_id + + 1) r_llhrs (`np.array::np.float64`): log-likelihood ratios (or psuedo-llhrs) for testing over specified region + 2) r_poss (`np.array::np.int64`): genomic positions for returned p-values + 3) read_id (str): read identifier + """ + reg_start = reg_data.start if reg_data is not None else r_data.start + reg_size = (reg_data.end - reg_data.start if reg_data is not None + else r_data.end - r_data.start) + if gnm_begin_lag is None or gnm_end_lag is None: + dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 + gnm_begin_lag = (std_ref.central_pos if r_data.strand == '+' else + dnstrm_bases) + gnm_end_lag = (dnstrm_bases if r_data.strand == '+' else + std_ref.central_pos) + + std_ref.kmer_width = gnm_begin_lag + gnm_end_lag + 1 def alt_clip_and_flip(): with h5py.File(r_data.fn, 'r') as fast5_data: r_means, r_seq = th.get_multiple_slots_read_centric( fast5_data, ['norm_mean', 'base'], r_data.corr_group) - read_id = th.get_raw_read_slot(fast5_data).attrs['read_id'] + read_id = th.get_raw_read_slot(fast5_data).attrs.get('read_id') if r_means is None or r_seq is None: - raise NotImplementedError( + raise th.TomboError( 'Read does not contain valid re-squiggled data.') r_seq = b''.join(r_seq).decode() read_start = r_data.start # clip read if it extends outside the current genomic region, so # stats are only computed within this region - if read_start + motif_width - 1 < reg_start: - num_start_clip = reg_start - (read_start + motif_width - 1) - read_start = reg_start - (motif_width - 1) + if read_start + std_ref.kmer_width - 1 < reg_start: + num_start_clip = reg_start - (read_start + std_ref.kmer_width - 1) + read_start = reg_start - (std_ref.kmer_width - 1) if r_data.strand == 
'+':
                 r_means = r_means[num_start_clip:]
                 r_seq = r_seq[num_start_clip:]
             else:
                 r_means = r_means[:-num_start_clip]
                 r_seq = r_seq[:-num_start_clip]
-        if r_data.end - (motif_width - 1) > reg_start + region_size:
-            num_end_clip = (r_data.end - (motif_width - 1)) - (
-                reg_start + region_size)
+        if r_data.end - (std_ref.kmer_width - 1) > reg_start + reg_size:
+            num_end_clip = (r_data.end - (std_ref.kmer_width - 1)) - (
+                reg_start + reg_size)
             if r_data.strand == '+':
                 r_means = r_means[:-num_end_clip]
                 r_seq = r_seq[:-num_end_clip]
@@ -2334,7 +3061,7 @@ def alt_clip_and_flip():
         # if this read does not cover enough of this region for stat
         # computation raise an error to be handled below
         if len(r_seq) < std_ref.kmer_width:
-            raise NotImplementedError(
+            raise th.TomboError(
                 'Read does not contain information in this region.')

         r_ref_means, r_ref_sds, r_alt_means, r_alt_sds = get_ref_from_seq(
@@ -2348,8 +3075,8 @@ def alt_clip_and_flip():
         r_means = r_means[gnm_begin_lag:-gnm_end_lag]
         # trim seq to positions with valid llh ratio test results
         # this is shorter than the means and model
-        r_seq = r_seq[(motif_width - 1):-(motif_width - 1)]
-        read_start += motif_width - 1
+        r_seq = r_seq[(std_ref.kmer_width - 1):-(std_ref.kmer_width - 1)]
+        read_start += std_ref.kmer_width - 1

         return (r_means, r_seq, r_ref_means, r_ref_sds, read_start,
                 r_alt_means, r_alt_sds, read_id)
@@ -2367,67 +3094,105 @@ def alt_clip_and_flip():
     for alt_base_pos in re.finditer(alt_ref.alt_base, r_seq):
         alt_pos = alt_base_pos.start()
         alt_base_poss.append(alt_pos + read_start)
+        pos_args = [r_means[alt_pos:alt_pos + std_ref.kmer_width],
+                    r_ref_means[alt_pos:alt_pos + std_ref.kmer_width],
+                    r_alt_means[alt_pos:alt_pos + std_ref.kmer_width]]
         if CONST_SD_MODEL:
             const_var = r_ref_vars[alt_pos]
             if use_standard_llhr:
                 pos_lh_ratio = c_calc_llh_ratio_const_var(
-                    r_means[alt_pos:alt_pos + motif_width],
-                    r_ref_means[alt_pos:alt_pos + motif_width],
-                    r_alt_means[alt_pos:alt_pos + motif_width],
-                    const_var)
+                    *(pos_args + [const_var]))
             else:
                 pos_lh_ratio = c_calc_scaled_llh_ratio_const_var(
-                    r_means[alt_pos:alt_pos + motif_width],
-                    r_ref_means[alt_pos:alt_pos + motif_width],
-                    r_alt_means[alt_pos:alt_pos + motif_width],
-                    const_var, OCLLHR_SCALE, OCLLHR_HEIGHT, OCLLHR_POWER)
+                    *(pos_args + [const_var, OCLLHR_SCALE,
+                                  OCLLHR_HEIGHT, OCLLHR_POWER]))
         else:
             if use_standard_llhr:
                 pos_lh_ratio = c_calc_llh_ratio(
-                    r_means[alt_pos:alt_pos + motif_width],
-                    r_ref_means[alt_pos:alt_pos + motif_width],
-                    r_ref_vars[alt_pos:alt_pos + motif_width],
-                    r_alt_means[alt_pos:alt_pos + motif_width],
-                    r_alt_vars[alt_pos:alt_pos + motif_width])
+                    *(pos_args + [
+                        r_ref_vars[alt_pos:alt_pos + std_ref.kmer_width],
+                        r_alt_vars[alt_pos:alt_pos + std_ref.kmer_width]]))
             else:
-                raise NotImplementedError(
+                raise th.TomboError(
                     'Variable SD scaled likelihood ratio not implemented.')
         log_lh_ratios.append(pos_lh_ratio)

     return np.array(log_lh_ratios), np.array(alt_base_poss), read_id

-def compute_read_stats(
-        chrm, strand, reg_start, reg_reads, fm_offset, min_test_vals,
-        region_size, single_read_thresh, lower_thresh, ctrl_reg_reads, std_ref,
-        alt_ref, use_standard_llhr, per_read_q, stat_type):
+def apply_per_read_thresh(
+        reg_base_stats, single_read_thresh, lower_thresh, stat_type,
+        reg_poss, ctrl_cov=None):
+    reg_cov = np.array([base_stats.shape[0] for base_stats in reg_base_stats])
+
+    if lower_thresh is not None:
+        # filter base statistics that fall between the upper and lower
+        # stat threshold for the log likelihood statistic
+        reg_base_stats = [
base_stats[np.logical_or(base_stats <= lower_thresh, + base_stats >= single_read_thresh)] + for base_stats in reg_base_stats] + valid_cov = np.array([base_stats.shape[0] + for base_stats in reg_base_stats]) + elif stat_type == ALT_MODEL_TXT: + # filter base statistics that fall between the upper and lower + # stat threshold for the log likelihood statistic + reg_base_stats = [base_stats[np.abs(base_stats) >= single_read_thresh] + for base_stats in reg_base_stats] + valid_cov = np.array([base_stats.shape[0] + for base_stats in reg_base_stats]) + else: + valid_cov = reg_cov + + if stat_type == SAMP_COMP_TXT: + ctrl_cov = [ctrl_cov[pos] if pos in ctrl_cov else 0 + for pos in reg_poss] + else: + # convert to list since python2 repeat objects can't be pickled + ctrl_cov = list(repeat(0, reg_poss.shape[0])) + + reg_frac_std_base = np.array([ + np.greater_equal( + base_stats, single_read_thresh).sum() / base_stats.shape[0] + if base_stats.shape[0] > 0 else np.NAN + for base_stats in reg_base_stats]) + + return reg_frac_std_base, reg_cov, ctrl_cov, valid_cov + +def compute_reg_stats( + reg_data, fm_offset, min_test_reads, + single_read_thresh, lower_thresh, ctrl_reg_data, std_ref, + alt_ref, use_standard_llhr, per_read_q, stat_type, prior_weights): if stat_type == SAMP_COMP_TXT: ctrl_means, ctrl_sds, ctrl_cov = get_reads_ref( - ctrl_reg_reads, reg_start, region_size, - min_test_vals, fm_offset) + ctrl_reg_data, min_test_reads, fm_offset, std_ref, prior_weights) else: + # TODO get region sequence and expected levels/sds here + # instead of for each read + # after that add per-read stat computation to API ctrl_cov = None # compute begin and end lag wrt the genome from upstream and downstream # which are wrt to the read dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 - gnm_begin_lag = std_ref.central_pos if strand == '+' else dnstrm_bases - gnm_end_lag = dnstrm_bases if strand == '+' else std_ref.central_pos + gnm_begin_lag = ( + std_ref.central_pos if reg_data.strand == '+' else dnstrm_bases) + gnm_end_lag = ( + dnstrm_bases if reg_data.strand == '+' else std_ref.central_pos) reg_read_stats, reg_poss, reg_ids = [], [], [] - for r_data in reg_reads: + for r_data in reg_data.reads: try: if stat_type == SAMP_COMP_TXT: r_stats, r_poss, read_id = compute_sample_compare_read_stats( - r_data, ctrl_means, ctrl_sds, fm_offset, reg_start, - region_size) + r_data, ctrl_means, ctrl_sds, fm_offset, reg_data) elif stat_type == DE_NOVO_TXT: r_stats, r_poss, read_id = compute_de_novo_read_stats( - r_data, gnm_begin_lag, gnm_end_lag, fm_offset, - reg_start, region_size, std_ref) + r_data, std_ref, fm_offset, reg_data, + gnm_begin_lag, gnm_end_lag) else: r_stats, r_poss, read_id = compute_alt_model_read_stats( - r_data, gnm_begin_lag, gnm_end_lag, reg_start, region_size, - std_ref, alt_ref, use_standard_llhr) - except NotImplementedError: + r_data, std_ref, alt_ref, use_standard_llhr, + reg_data, gnm_begin_lag, gnm_end_lag) + except th.TomboError: continue if r_stats is None: continue reg_read_stats.append(r_stats) @@ -2435,7 +3200,7 @@ def compute_read_stats( reg_ids.append(read_id) if len(reg_read_stats) == 0: - raise NotImplementedError + raise th.TomboError('Read contains no statistics in this region.') if per_read_q is not None: # compile read_ids vector for per-read output @@ -2467,7 +3232,8 @@ def compute_read_stats( dtype=[(str('pos'), 'u4'), (str('stat'), 'f8'), (str('read_id'), 'u4')]) per_read_q.put(( - per_read_block, read_id_lookup, chrm, strand, reg_start)) + per_read_block, read_id_lookup, 
reg_data.chrm, + reg_data.strand, reg_data.start)) # get order of all bases from position array as_reg_poss = np.argsort(reg_poss) @@ -2476,102 +3242,53 @@ def compute_read_stats( # get unique tested genomic positions across all reads us_reg_poss = np.unique(reg_poss) + if reg_poss.shape[0] == 0: + raise th.TomboError('No valid positions in this region.') + # then sort the stats array by genomic position and # split into stats by genomic base position reg_base_stats = np.split( reg_read_stats[as_reg_poss], np.where(np.concatenate([[0,], np.diff(reg_poss)]) > 0)[0]) - reg_cov = [base_stats.shape[0] for base_stats in reg_base_stats] - - if lower_thresh is not None: - # filter base statistics that fall between the upper and lower - # stat threshold for the log likelihood statistic - reg_base_stats = [ - base_stats[np.logical_or(base_stats <= lower_thresh, - base_stats >= single_read_thresh)] - for base_stats in reg_base_stats] - valid_cov = [base_stats.shape[0] for base_stats in reg_base_stats] - elif stat_type == ALT_MODEL_TXT: - # filter base statistics that fall between the upper and lower - # stat threshold for the log likelihood statistic - reg_base_stats = [base_stats[np.abs(base_stats) >= single_read_thresh] - for base_stats in reg_base_stats] - valid_cov = [base_stats.shape[0] for base_stats in reg_base_stats] - else: - valid_cov = reg_cov - - if stat_type == SAMP_COMP_TXT: - ctrl_cov = [ctrl_cov[pos] if pos in ctrl_cov else 0 - for pos in reg_poss] - else: - # convert to list since python2 repeat objects can't be pickled - ctrl_cov = list(repeat(0, reg_poss.shape[0])) - - return reg_base_stats, us_reg_poss, reg_cov, ctrl_cov, valid_cov - -def get_region_stats( - chrm, strand, reg_start, reg_reads, fm_offset, min_test_vals, - region_size, single_read_thresh, lower_thresh, ctrl_reg_reads, std_ref, - alt_ref, use_standard_llhr, per_read_q, stat_type): - """ - Compute requested statistics for a specific region of the genome - """ - try: - (reg_base_stats, reg_poss, - reg_cov, ctrl_cov, valid_cov) = compute_read_stats( - chrm, strand, reg_start, reg_reads, fm_offset, min_test_vals, - region_size, single_read_thresh, lower_thresh, ctrl_reg_reads, - std_ref, alt_ref, use_standard_llhr, per_read_q, stat_type) - except NotImplementedError: - return None - - if reg_poss.shape[0] == 0: - return None - - reg_frac_standard_base = np.array([ - np.greater_equal( - base_stats, single_read_thresh).sum() / base_stats.shape[0] - if base_stats.shape[0] > 0 else np.NAN - for base_stats in reg_base_stats]) - - reg_stats = (reg_frac_standard_base, reg_poss, chrm, strand, - reg_cov, ctrl_cov, valid_cov) + (reg_frac_std_base, reg_cov, ctrl_cov, valid_cov) = apply_per_read_thresh( + reg_base_stats, single_read_thresh, lower_thresh, + stat_type, reg_poss, ctrl_cov) - return reg_stats + return reg_frac_std_base, us_reg_poss, reg_cov, ctrl_cov, valid_cov def _test_signif_worker( - region_q, stats_q, progress_q, per_read_q, raw_read_coverage, fm_offset, - min_test_vals, single_read_thresh, lower_thresh, region_size, - ctrl_read_coverage, std_ref, alt_ref, use_standard_llhr, stat_type): - ctrl_reg_reads = None + region_q, stats_q, progress_q, per_read_q, reads_index, fm_offset, + min_test_reads, single_read_thresh, lower_thresh, ctrl_reads_index, + std_ref, alt_ref, use_standard_llhr, stat_type, prior_weights): + ctrl_reg_data = None while not region_q.empty(): try: - chrm, strand, reg_start = region_q.get(block=False) + reg_data = region_q.get(block=False) except queue.Empty: # sometimes throws false empty error 
with get(block=False) if not region_q.empty(): continue break - reg_reads = [r_data for r_data in raw_read_coverage[(chrm, strand)] - if not (r_data.start >= reg_start + region_size or - r_data.end <= reg_start)] - if len(reg_reads) == 0: + if ctrl_reads_index is not None: + ctrl_reg_data = reg_data.copy().add_reads(ctrl_reads_index) + reg_data.add_reads(reads_index) + if len(reg_data.reads) == 0: progress_q.put(1) continue - if ctrl_read_coverage is not None: - ctrl_reg_reads = [ - r_data for r_data in ctrl_read_coverage[(chrm, strand)] - if not (r_data.start >= reg_start + region_size or - r_data.end <= reg_start)] - reg_stats = get_region_stats( - chrm, strand, reg_start, reg_reads, fm_offset, min_test_vals, - region_size, single_read_thresh, lower_thresh, ctrl_reg_reads, - std_ref, alt_ref, use_standard_llhr, per_read_q, stat_type) - if reg_stats is not None: - stats_q.put(reg_stats) + try: + (reg_frac_std_base, reg_poss, + reg_cov, ctrl_cov, valid_cov) = compute_reg_stats( + reg_data, fm_offset, min_test_reads, single_read_thresh, + lower_thresh, ctrl_reg_data, std_ref, alt_ref, + use_standard_llhr, per_read_q, stat_type, prior_weights) + stats_q.put(th.regionStats( + reg_frac_std_base, reg_poss, reg_data.chrm, reg_data.strand, + reg_data.start, reg_cov, ctrl_cov, valid_cov)) + except th.TomboError: + pass progress_q.put(1) return @@ -2585,42 +3302,145 @@ def _test_signif_worker(*args): return -############################################## -########## Testing Multi-processing ########## -############################################## +################################################ +########## Aggregate Multi-processing ########## +################################################ -def _get_progress_queue(progress_q, prog_conn, num_regions): - th._status_message( - 'Performing modified base detection across genomic regions.') - bar = tqdm(total=num_regions, smoothing=0) - - tot_num_rec_proc = 0 +def _write_stats(stats_q, stats_fn, stat_type, region_size, cov_damp_counts, + min_test_reads, num_most_signif, num_blocks, num_processes): + all_stats = TomboStats( + stats_fn, stat_type=stat_type, region_size=region_size, + cov_damp_counts=cov_damp_counts, cov_thresh=min_test_reads, + num_most_signif=num_most_signif) + if VERBOSE: + bar = tqdm(total=num_blocks, smoothing=0) + num_agg_ps_finished = 0 while True: try: - iter_val = progress_q.get(block=False) - tot_num_rec_proc += iter_val - bar.update(iter_val) + agg_stats = stats_q.get(block=False) + if agg_stats is None: + num_agg_ps_finished += 1 + if num_agg_ps_finished >= num_processes: break + continue + + if VERBOSE: bar.update(1) + ((reg_frac_std_base, reg_cov, ctrl_cov, valid_cov), + chrm, strand, start, us_reg_poss) = agg_stats + all_stats._write_stat_block( + th.regionStats(reg_frac_std_base, us_reg_poss, chrm, strand, + start, reg_cov, ctrl_cov, valid_cov)) except queue.Empty: - if prog_conn.poll(): + sleep(0.1) + + if VERBOSE: bar.close() + all_stats.close() + if all_stats.is_empty: + th.error_message_and_exit( + 'No genomic positions contain --minimum-test-reads.') + + return + +def _agg_stats_worker( + pr_stats_q, stats_q, stat_type, single_read_thresh, lower_thresh): + while True: + try: + block_pr_stats = pr_stats_q.get(block=False) + # None value indicates that per-reads blocks have been exhausted + if block_pr_stats is None: + stats_q.put(None) break + chrm, strand, start, end, block_stats = block_pr_stats + + block_stats.sort(order=str('pos')) + reg_poss = block_stats['pos'] + us_reg_poss = np.unique(reg_poss) + + 
reg_base_stats = np.split( + block_stats['stat'], np.where(np.concatenate( + [[0,], np.diff(reg_poss)]) > 0)[0]) + + reg_stats = apply_per_read_thresh( + reg_base_stats, single_read_thresh, lower_thresh, + stat_type, reg_poss) + stats_q.put((reg_stats, chrm, strand, start, us_reg_poss)) + except queue.Empty: sleep(0.1) - continue - bar.close() - prog_conn.send(tot_num_rec_proc) + return + +def _load_stats_batches(pr_stats_fn, pr_stats_q, num_processes): + pr_stats = PerReadStats(pr_stats_fn) + for pr_block in pr_stats: + pr_stats_q.put(pr_block) + for _ in range(num_processes): + pr_stats_q.put(None) return +def aggregate_per_read_stats( + pr_stats_fn, single_read_thresh, lower_thresh, stats_fn, + cov_damp_counts, min_test_reads, num_most_signif, num_processes): + if VERBOSE: th.status_message( + 'Loading and aggregating per-read statistics.') + + # pre-load per-read stats queue + pr_stats = PerReadStats(pr_stats_fn) + stat_type, num_blocks, region_size = ( + pr_stats.stat_type, pr_stats.num_blocks, pr_stats.region_size) + pr_stats.close() + + pr_stats_q = Queue(STAT_BLOCKS_QUEUE_LIMIT) + stats_q = Queue(STAT_BLOCKS_QUEUE_LIMIT) + write_stats_q = Queue(STAT_BLOCKS_QUEUE_LIMIT) + + load_stats_p = Process(target=_load_stats_batches, args=( + pr_stats_fn, pr_stats_q, num_processes)) + load_stats_p.daemon = True + load_stats_p.start() + + agg_stats_ps = [] + for p_id in range(num_processes): + agg_p = Process(target=_agg_stats_worker, args=( + pr_stats_q, stats_q, stat_type, single_read_thresh, lower_thresh)) + agg_p.daemon = True + agg_p.start() + agg_stats_ps.append(agg_p) + + write_stats_p = Process(target=_write_stats, args=( + stats_q, stats_fn, stat_type, region_size, cov_damp_counts, + min_test_reads, num_most_signif, num_blocks, num_processes)) + write_stats_p.daemon = True + write_stats_p.start() + + # wait for processes to complete + load_stats_p.join() + for agg_p in agg_stats_ps: + agg_p.join() + write_stats_p.join() + + return + + +############################################## +########## Testing Multi-processing ########## +############################################## + def _get_stats_queue(stats_q, stats_conn, min_test_reads, stats_file_bn, - alt_name, stat_type): - # TODO convert to a TomboStats class that writes each batch to file as - # they are received - all_reg_stats = [] + alt_name, stat_type, reg_size, cov_damp_counts, + num_most_signif): + stats_fn = stats_file_bn + '.tombo.stats' if alt_name is None else \ + stats_file_bn + '.' 
+ alt_name + '.tombo.stats' + all_stats = TomboStats( + stats_fn, stat_type=stat_type, region_size=reg_size, + cov_damp_counts=cov_damp_counts, cov_thresh=min_test_reads, + num_most_signif=num_most_signif) while True: try: reg_stats = stats_q.get(block=False) - all_reg_stats.append(reg_stats) + all_stats._write_stat_block(reg_stats) except queue.Empty: + # wait for main process to send indicator that all regions + # have been processed if stats_conn.poll(): sleep(0.1) break @@ -2630,14 +3450,13 @@ def _get_stats_queue(stats_q, stats_conn, min_test_reads, stats_file_bn, # Clear leftover values from queues while not stats_q.empty(): reg_stats = stats_q.get(block=False) - all_reg_stats.append(reg_stats) + all_stats._write_stat_block(reg_stats) - if len(all_reg_stats) == 0: - th._error_message_and_exit( + if all_stats.is_empty: + th.error_message_and_exit( 'No genomic positions contain --minimum-test-reads.') - write_stats(all_reg_stats, stats_file_bn, - stat_type, min_test_reads, alt_name) + all_stats.close() stats_conn.send(True) return @@ -2649,7 +3468,7 @@ def _get_per_read_queue( while True: try: per_read_block = per_read_q.get(block=False) - per_read_stats.write_per_read_block(*per_read_block) + per_read_stats._write_per_read_block(*per_read_block) del per_read_block except queue.Empty: if per_read_conn.poll(): @@ -2661,7 +3480,7 @@ def _get_per_read_queue( # Clear leftover values from queues while not per_read_q.empty(): per_read_block = per_read_q.get(block=False) - per_read_stats.write_per_read_block(*per_read_block) + per_read_stats._write_per_read_block(*per_read_block) del per_read_block per_read_stats.close() @@ -2670,44 +3489,69 @@ def _get_per_read_queue( return +def _get_progress_queue(progress_q, prog_conn, num_regions): + th.status_message( + 'Performing modified base detection across genomic regions.') + bar = tqdm(total=num_regions, smoothing=0) + + tot_num_rec_proc = 0 + while True: + try: + iter_val = progress_q.get(block=False) + tot_num_rec_proc += iter_val + bar.update(iter_val) + except queue.Empty: + if prog_conn.poll(): + break + sleep(0.1) + continue + + bar.close() + prog_conn.send(tot_num_rec_proc) + + return + def test_significance( - raw_read_coverage, min_test_vals, fm_offset, single_read_thresh, - lower_thresh, region_size, num_processes, per_read_bn, stat_type, - min_test_reads, stats_file_bn, - ctrl_read_coverage=None, std_ref=None, alt_ref=None, - use_standard_llhr=False, alt_name=None): - """ - Test for significant shifted signal in mutliprocessed batches + reads_index, stat_type, per_read_bn, stats_file_bn, + single_read_thresh, lower_thresh, region_size, num_processes, + min_test_reads, cov_damp_counts, num_most_signif, + fm_offset=None, ctrl_reads_index=None, std_ref=None, alt_ref=None, + use_standard_llhr=False, alt_name=None, prior_weights=None): + """Test for significant shifted signal in mutliprocessed batches """ region_q = Queue() - stats_q = Queue() + stats_q = Queue(STAT_BLOCKS_QUEUE_LIMIT) progress_q = Queue() - per_read_q = Queue(PER_READ_BLOCKS_QUEUE_LIMIT) \ + per_read_q = Queue(STAT_BLOCKS_QUEUE_LIMIT) \ if per_read_bn else None # split chromosomes into separate regions to process independently - chrm_sizes = th.get_chrm_sizes(raw_read_coverage, ctrl_read_coverage) + chrm_sizes = th.get_chrm_sizes(reads_index, ctrl_reads_index) num_regions = 0 for chrm, chrm_len in chrm_sizes.items(): # only process regions covered by both samples if control # reads are provided plus_covered = ( - (chrm, '+') in raw_read_coverage and - (ctrl_read_coverage 
 def test_significance(
-        raw_read_coverage, min_test_vals, fm_offset, single_read_thresh,
-        lower_thresh, region_size, num_processes, per_read_bn, stat_type,
-        min_test_reads, stats_file_bn,
-        ctrl_read_coverage=None, std_ref=None, alt_ref=None,
-        use_standard_llhr=False, alt_name=None):
-    """
-    Test for significant shifted signal in mutliprocessed batches
+        reads_index, stat_type, per_read_bn, stats_file_bn,
+        single_read_thresh, lower_thresh, region_size, num_processes,
+        min_test_reads, cov_damp_counts, num_most_signif,
+        fm_offset=None, ctrl_reads_index=None, std_ref=None, alt_ref=None,
+        use_standard_llhr=False, alt_name=None, prior_weights=None):
+    """Test for significant shifted signal in multiprocessed batches
     """
     region_q = Queue()
-    stats_q = Queue()
+    stats_q = Queue(STAT_BLOCKS_QUEUE_LIMIT)
     progress_q = Queue()
-    per_read_q = Queue(PER_READ_BLOCKS_QUEUE_LIMIT) \
+    per_read_q = Queue(STAT_BLOCKS_QUEUE_LIMIT) \
         if per_read_bn else None

     # split chromosomes into separate regions to process independently
-    chrm_sizes = th.get_chrm_sizes(raw_read_coverage, ctrl_read_coverage)
+    chrm_sizes = th.get_chrm_sizes(reads_index, ctrl_reads_index)
     num_regions = 0
     for chrm, chrm_len in chrm_sizes.items():
         # only process regions covered by both samples if control
         # reads are provided
         plus_covered = (
-            (chrm, '+') in raw_read_coverage and
-            (ctrl_read_coverage is None or (chrm, '+') in ctrl_read_coverage))
+            (chrm, '+') in reads_index and
+            (ctrl_reads_index is None or (chrm, '+') in ctrl_reads_index))
         minus_covered = (
-            (chrm, '-') in raw_read_coverage and
-            (ctrl_read_coverage is None or (chrm, '-') in ctrl_read_coverage))
+            (chrm, '-') in reads_index and
+            (ctrl_reads_index is None or (chrm, '-') in ctrl_reads_index))
         for reg_start in range(0, chrm_len, region_size):
             if plus_covered:
-                region_q.put((chrm, '+', reg_start))
+                region_q.put(th.intervalData(
+                    chrm=chrm, start=reg_start, end=reg_start + region_size,
+                    strand='+'))
                 num_regions += 1
             if minus_covered:
-                region_q.put((chrm, '-', reg_start))
+                region_q.put(th.intervalData(
+                    chrm=chrm, start=reg_start, end=reg_start + region_size,
+                    strand='-'))
                 num_regions += 1

     test_args = (
-        region_q, stats_q, progress_q, per_read_q, raw_read_coverage, fm_offset,
-        min_test_vals, single_read_thresh, lower_thresh, region_size,
-        ctrl_read_coverage, std_ref, alt_ref, use_standard_llhr, stat_type)
+        region_q, stats_q, progress_q, per_read_q, reads_index, fm_offset,
+        min_test_reads, single_read_thresh, lower_thresh, ctrl_reads_index,
+        std_ref, alt_ref, use_standard_llhr, stat_type, prior_weights)
     test_ps = []
     for p_id in range(num_processes):
         p = Process(target=_test_signif_worker, args=test_args)
@@ -2725,7 +3569,8 @@ def test_significance(
     # main region stats queue getter
     main_stats_conn, stats_conn = Pipe()
     stats_p = Process(target=_get_stats_queue, args=(
-        stats_q, stats_conn, min_test_reads, stats_file_bn, alt_name, stat_type))
+        stats_q, stats_conn, min_test_reads, stats_file_bn, alt_name, stat_type,
+        region_size, cov_damp_counts, num_most_signif))
     stats_p.daemon = True
     stats_p.start()
@@ -2769,91 +3614,90 @@ def test_significance(
 ##########################

 def _test_shifts_de_novo_main(
-        args, lower_thresh, single_read_thresh, bio_samp_type, raw_read_coverage):
-    tb_model_fn = args.tombo_model_filename
-    if bio_samp_type is None:
-        bio_samp_type = 'RNA' if th.is_rna(raw_read_coverage) else 'DNA'
-    if tb_model_fn is None:
-        tb_model_fn, bio_samp_type = get_default_standard_ref(
-            raw_read_coverage, bio_samp_type)
-    std_ref = TomboModel(tb_model_fn)
+        args, lower_thresh, single_read_thresh, seq_samp_type, reads_index):
+    if seq_samp_type is None:
+        seq_samp_type = th.get_seq_sample_type(reads_index=reads_index)
+    std_ref = TomboModel(
+        ref_fn=args.tombo_model_filename, seq_samp_type=seq_samp_type,
+        reads_index=reads_index)

     stat_type = DE_NOVO_TXT
     lower_thresh, single_read_thresh = (
         (lower_thresh, single_read_thresh) if single_read_thresh
-        is not None else DE_NOVO_THRESH[bio_samp_type])
-    if VERBOSE: th._status_message(
+        is not None else DE_NOVO_THRESH[seq_samp_type.name])
+    if VERBOSE: th.status_message(
         'Performing de novo model testing against canonical model.')
     test_significance(
-        raw_read_coverage, args.minimum_test_reads,
-        args.fishers_method_context, single_read_thresh, lower_thresh,
+        reads_index, stat_type, args.per_read_statistics_basename,
+        args.statistics_file_basename, single_read_thresh, lower_thresh,
         args.multiprocess_region_size, args.processes,
-        args.per_read_statistics_basename, stat_type,
-        args.minimum_test_reads, args.statistics_file_basename,
-        std_ref=std_ref)
+        args.minimum_test_reads, args.coverage_dampen_counts,
+        args.num_most_significant_stored,
+        fm_offset=args.fishers_method_context, std_ref=std_ref)

     return
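Worth noting in the hunk above: regions are now enqueued as ``th.intervalData`` objects rather than bare ``(chrm, strand, start)`` tuples, which keeps the interval arithmetic in one place. The tiling itself is straightforward; a standalone sketch with a plain namedtuple standing in for ``th.intervalData`` (the real class carries more attributes, and per-strand coverage filtering is omitted here)::

    from collections import namedtuple

    # stand-in for th.intervalData, for illustration only
    intervalData = namedtuple('intervalData', ('chrm', 'start', 'end', 'strand'))

    def tile_regions(chrm_sizes, region_size):
        # tile each chromosome with fixed-width intervals on both strands;
        # the last interval may run past the chromosome end, which is
        # harmless when downstream code only tests covered positions
        for chrm, chrm_len in chrm_sizes.items():
            for reg_start in range(0, chrm_len, region_size):
                for strand in ('+', '-'):
                    yield intervalData(
                        chrm=chrm, start=reg_start,
                        end=reg_start + region_size, strand=strand)

    regions = list(tile_regions({'chr1': 2500}, 1000))
    assert len(regions) == 6  # three 1kb tiles per strand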
 def _test_shifts_alt_main(
-        args, lower_thresh, single_read_thresh, bio_samp_type, raw_read_coverage):
-    tb_model_fn = args.tombo_model_filename
-    if bio_samp_type is None:
-        bio_samp_type = 'RNA' if th.is_rna(raw_read_coverage) else 'DNA'
-    if tb_model_fn is None:
-        tb_model_fn, bio_samp_type = get_default_standard_ref(
-            raw_read_coverage, bio_samp_type)
-    std_ref = TomboModel(tb_model_fn)
+        args, lower_thresh, single_read_thresh, seq_samp_type, reads_index):
+    if seq_samp_type is None:
+        seq_samp_type = th.get_seq_sample_type(reads_index=reads_index)
+    std_ref = TomboModel(
+        ref_fn=args.tombo_model_filename, seq_samp_type=seq_samp_type,
+        reads_index=reads_index)

     stat_type = ALT_MODEL_TXT
     lower_thresh, single_read_thresh = (
         (lower_thresh, single_read_thresh) if single_read_thresh
-        is not None else LLR_THRESH[bio_samp_type])
-    if VERBOSE: th._status_message('Performing alternative model testing.')
-    if args.alternate_model_filenames is not None:
-        alt_refs = parse_tombo_models(
-            args.alternate_model_filenames, std_ref)
-    else:
-        alt_refs = load_alt_refs(
-            args.alternate_bases, raw_read_coverage,
-            std_ref, bio_samp_type)
+        is not None else LLR_THRESH[seq_samp_type.name])
+    if VERBOSE: th.status_message('Performing alternative model testing.')
+    alt_refs = load_alt_refs(
+        args.alternate_model_filenames, args.alternate_bases,
+        reads_index, std_ref, seq_samp_type)

     if len(alt_refs) == 0:
-        th._error_message_and_exit('No alternative models successfully loaded.')
+        th.error_message_and_exit('No alternative models successfully loaded.')

     for alt_name, alt_ref in alt_refs.items():
-        if VERBOSE: th._status_message(
+        if VERBOSE: th.status_message(
             'Performing alternative model testing against ' + alt_name +
             ' model.')
         test_significance(
-            raw_read_coverage, args.minimum_test_reads, 0,
-            single_read_thresh, lower_thresh,
+            reads_index, stat_type, args.per_read_statistics_basename,
+            args.statistics_file_basename, single_read_thresh, lower_thresh,
             args.multiprocess_region_size, args.processes,
-            args.per_read_statistics_basename, stat_type,
-            args.minimum_test_reads, args.statistics_file_basename,
-            std_ref=std_ref, alt_ref=alt_ref,
-            use_standard_llhr=args.standard_log_likelihood_ratio,
-            alt_name=alt_name)
+            args.minimum_test_reads, args.coverage_dampen_counts,
+            args.num_most_significant_stored,
+            std_ref=std_ref, alt_ref=alt_ref, alt_name=alt_name,
+            use_standard_llhr=args.standard_log_likelihood_ratio)

     return

 def _test_shifts_samp_comp_main(
-        args, lower_thresh, single_read_thresh, bio_samp_type, raw_read_coverage):
+        args, lower_thresh, single_read_thresh, seq_samp_type, reads_index):
     stat_type = SAMP_COMP_TXT
     if single_read_thresh is None:
-        if bio_samp_type is None:
-            bio_samp_type = 'RNA' if th.is_rna(raw_read_coverage) else 'DNA'
-        lower_thresh, single_read_thresh = SAMP_COMP_THRESH[bio_samp_type]
-    if VERBOSE: th._status_message(
+        if seq_samp_type is None:
+            seq_samp_type = th.get_seq_sample_type(reads_index=reads_index)
+        lower_thresh, single_read_thresh = SAMP_COMP_THRESH[seq_samp_type.name]
+    if VERBOSE: th.status_message(
         'Performing two-sample comparison significance testing.')
-    ctrl_read_coverage = th.parse_fast5s(
+    ctrl_reads_index = th.TomboReads(
         args.control_fast5_basedirs, args.corrected_group,
         args.basecall_subgroups)
+
+    # load expected levels ref for posterior computation
+    std_ref = None if args.sample_only_estimates else TomboModel(
+        ref_fn=args.tombo_model_filename, seq_samp_type=seq_samp_type,
+        reads_index=reads_index)
+
     test_significance(
-        raw_read_coverage, args.minimum_test_reads,
-        args.fishers_method_context, single_read_thresh, lower_thresh,
+        reads_index, stat_type, args.per_read_statistics_basename,
+        args.statistics_file_basename, single_read_thresh, lower_thresh,
         args.multiprocess_region_size, args.processes,
-        args.per_read_statistics_basename, stat_type,
-        args.minimum_test_reads, args.statistics_file_basename,
-        ctrl_read_coverage=ctrl_read_coverage)
+        args.minimum_test_reads, args.coverage_dampen_counts,
+        args.num_most_significant_stored,
+        fm_offset=args.fishers_method_context,
+        ctrl_reads_index=ctrl_reads_index, std_ref=std_ref,
+        prior_weights=args.model_prior_weights)

     return
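All three entry points resolve per-read thresholds the same way: explicit ``--single-read-threshold`` values win, otherwise defaults are looked up by sample type name from the relevant table (``DE_NOVO_THRESH``, ``LLR_THRESH`` or ``SAMP_COMP_THRESH``). A sketch of that resolution, with made-up threshold values rather than Tombo's actual defaults::

    # illustrative values only; the real defaults live in the Tombo tables
    EXAMPLE_THRESH = {'DNA': (None, 0.5), 'RNA': (None, 0.7)}

    def resolve_thresholds(lower_thresh, single_read_thresh, samp_type_name):
        # explicit user-supplied thresholds take precedence over defaults
        if single_read_thresh is not None:
            return lower_thresh, single_read_thresh
        return EXAMPLE_THRESH[samp_type_name]

    assert resolve_thresholds(0.1, 0.9, 'DNA') == (0.1, 0.9)
    assert resolve_thresholds(None, None, 'RNA') == (None, 0.7)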
@@ -2868,13 +3712,13 @@ def _test_shifts_main(args):
         _print_alt_models()
         sys.exit()

     if args.fast5_basedirs is None or args.statistics_file_basename is None:
-        th._error_message_and_exit(
+        th.error_message_and_exit(
             'Must provide both a set of FAST5 read files ' +
             '(--fast5-basedirs) and an output file basename ' +
             '(--statistics-file-basename).')
     if (args.alternate_model_filenames is None and
         args.alternate_bases is None):
-        th._error_message_and_exit(
+        th.error_message_and_exit(
             'Must provide an alternative model against which to test.\n\t' +
             'Run with --print-available-models option to see possible ' +
             'values for the --alternate-bases option.')
@@ -2887,35 +3731,37 @@ def _test_shifts_main(args):
         lower_thresh = None
     else:
         if len(args.single_read_threshold) > 2:
-            th._warning_message(
+            th.warning_message(
                 'Only 1 or 2 values may be passed as single-read ' +
                 'thresholds. Only using the first 2 options provided.')
         lower_thresh = args.single_read_threshold[0]
         single_read_thresh = args.single_read_threshold[1]

     try:
-        # sample compare does not have bio_sample_type in the namespace
-        bio_samp_type = args.bio_sample_type
+        if args.seq_sample_type is None:
+            seq_samp_type = None
+        else:
+            # sample compare does not have seq_sample_type in the namespace
+            seq_samp_type = th.seqSampleType(DNA_SAMP_TYPE, False) \
+                if args.seq_sample_type == DNA_SAMP_TYPE else \
+                th.seqSampleType(RNA_SAMP_TYPE, True)
     except AttributeError:
-        bio_samp_type = None
+        seq_samp_type = None

-    raw_read_coverage = th.parse_fast5s(
+    reads_index = th.TomboReads(
         args.fast5_basedirs, args.corrected_group, args.basecall_subgroups)

     if args.action_command == 'de_novo':
         _test_shifts_de_novo_main(
-            args, lower_thresh, single_read_thresh, bio_samp_type,
-            raw_read_coverage)
+            args, lower_thresh, single_read_thresh, seq_samp_type, reads_index)
     elif args.action_command == 'alternative_model':
         _test_shifts_alt_main(
-            args, lower_thresh, single_read_thresh, bio_samp_type,
-            raw_read_coverage)
+            args, lower_thresh, single_read_thresh, seq_samp_type, reads_index)
     elif args.action_command == 'sample_compare':
         _test_shifts_samp_comp_main(
-            args, lower_thresh, single_read_thresh, bio_samp_type,
-            raw_read_coverage)
+            args, lower_thresh, single_read_thresh, seq_samp_type, reads_index)
     else:
-        th._error_message_and_exit('Invalid Tombo detect_modifications command.')
+        th.error_message_and_exit('Invalid Tombo detect_modifications command.')

     return
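The ``th.seqSampleType(DNA_SAMP_TYPE, False)`` / ``th.seqSampleType(RNA_SAMP_TYPE, True)`` construction above reads as a (name, reversed-signal) record: direct RNA passes the pore 3' end first, so its signal runs antisense to the sequence. A minimal sketch of that mapping; the field names are my reading of the diff, not a verified Tombo definition::

    from collections import namedtuple

    # assumed field names; see tombo.tombo_helper for the real definition
    seqSampleType = namedtuple('seqSampleType', ('name', 'rev_sig'))

    DNA_SAMP_TYPE, RNA_SAMP_TYPE = 'DNA', 'RNA'

    def samp_type_from_arg(seq_sample_type_arg):
        # mirrors the try-block above: None means "auto-detect later"
        if seq_sample_type_arg is None:
            return None
        return seqSampleType(DNA_SAMP_TYPE, False) \
            if seq_sample_type_arg == DNA_SAMP_TYPE else \
            seqSampleType(RNA_SAMP_TYPE, True)

    assert samp_type_from_arg('RNA').rev_sig is True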
                 'thresholds. Only using the first 2 options provided.')
         lower_thresh = args.single_read_threshold[0]
         single_read_thresh = args.single_read_threshold[1]

-    all_reg_stats, stat_type = apply_per_read_thresh(
-        args.per_read_statistics_filename, single_read_thresh,
-        args.minimum_test_reads, lower_thresh)
-
-    write_stats(all_reg_stats, args.statistics_file_basename, stat_type,
-                args.minimum_test_reads)
+    aggregate_per_read_stats(
+        args.per_read_statistics_filename, single_read_thresh, lower_thresh,
+        args.statistics_filename, args.coverage_dampen_counts,
+        args.minimum_test_reads, args.num_most_significant_stored,
+        args.processes)

     return
@@ -2950,7 +3794,7 @@ def _est_ref_main(args):
     th.VERBOSE = VERBOSE

     if min(args.upstream_bases, args.downstream_bases) == 0:
-        th._error_message_and_exit(
+        th.error_message_and_exit(
             'Context upstream and downstream must be greater ' +
             'than 0 for model estimation.')
@@ -2972,15 +3816,16 @@ def _est_alt_ref_main(args):
     alt_ref = estimate_alt_model(
         args.fast5_basedirs, args.control_fast5_basedirs,
         args.corrected_group, args.basecall_subgroups,
-        args.tombo_model_filename, args.bio_sample_type,
+        args.tombo_model_filename, args.seq_sample_type,
         args.alternate_model_base, args.alt_fraction_percentile,
         args.minimum_kmer_observations, args.save_density_basename,
         args.kernel_density_bandwidth, args.alternate_density_filename,
         args.control_density_filename, args.processes)
     # returns None when profiling method
     if alt_ref is None: return
-    alt_ref.write_model(args.alternate_model_filename,
-                        args.alternate_model_base, args.alternate_model_name)
+    alt_ref.alt_name = args.alternate_model_name
+    alt_ref.alt_base = args.alternate_model_base
+    alt_ref.write_model(args.alternate_model_filename)

     return
@@ -2989,31 +3834,31 @@ def _estimate_scale_main(args):
     VERBOSE = not args.quiet
     th.VERBOSE = VERBOSE

-    if VERBOSE: th._status_message('Getting files list.')
+    if VERBOSE: th.status_message('Getting files list.')
     try:
-        if not os.path.isdir(args.fast5_basedir):
-            th._error_message_and_exit(
+        if not os.path.isdir(args.fast5s_basedir):
+            th.error_message_and_exit(
                 'Provided [fast5-basedir] is not a directory.')
-        fast5_basedir = (
-            args.fast5_basedir if args.fast5_basedir.endswith('/') else
-            args.fast5_basedir + '/')
-        fast5_fns = th.get_files_list(fast5_basedir)
+        fast5s_basedir = (
+            args.fast5s_basedir if args.fast5s_basedir.endswith('/') else
+            args.fast5s_basedir + '/')
+        fast5_fns = th.get_files_list(fast5s_basedir)
     except OSError:
-        th._error_message_and_exit(
+        th.error_message_and_exit(
             'Reads base directory, a sub-directory or an old (hidden) ' +
             'index file does not appear to be accessible. Check ' +
             'directory permissions.')
     if len(fast5_fns) < 1:
-        th._error_message_and_exit(
+        th.error_message_and_exit(
             'No files identified in the specified ' +
             'directory or within immediate subdirectories.')

-    th._status_message('Global scaling estimate: ' +
+    th.status_message('Global scaling estimate: ' +
                        unicode(estimate_global_scale(fast5_fns)))

     return


 if __name__ == '__main__':
-    raise NotImplementedError(
-        'This is a module. See commands with `tombo -h`')
+    sys.stderr.write('This is a module. See commands with `tombo -h`\n')
+    sys.exit(1)
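Finally, the ``coverage_dampen_counts`` values threaded through ``aggregate_per_read_stats`` above feed the "dampened fraction" output used in the Quick Start. The idea is standard pseudo-count shrinkage: low-coverage sites are pulled toward the prior so that one modified read out of two does not report a 50% modified fraction. A rough worked example; the formula is the generic shrinkage form and the dampen counts are illustrative, so consult the Tombo documentation and command help for the exact statistic and defaults::

    def dampened_fraction(n_mod, coverage, damp_unmod=2.0, damp_mod=0.5):
        # add pseudo-counts to each category before taking the fraction
        return (n_mod + damp_mod) / (coverage + damp_mod + damp_unmod)

    # coverage 2, 1 modified read: raw 0.50 -> dampened ~0.33
    print(round(dampened_fraction(1, 2), 2))     # 0.33
    # coverage 100, 50 modified: raw 0.50 -> dampened ~0.49
    print(round(dampened_fraction(50, 100), 2))  # 0.49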