From 4dacd765fed79c7df48b77a6c935709c4d2e2529 Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 2 Jun 2023 12:08:06 +0100 Subject: [PATCH 1/3] Add wobble to correlation() --- README.md | 15 ++++++++++ docs/source/usage.md | 11 +++++++ pyproject.toml | 6 ++-- streq/distance.py | 71 +++++++++++++++++++++++++++++++------------- streq/seqtools.py | 3 +- streq/utils.py | 2 ++ 6 files changed, 85 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index f5ade88..0cbc0de 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # 🧬 streq +![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/scbirlab/streq/python-publish.yml) +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/streq) +![PyPI](https://img.shields.io/pypi/v/streq) + Python utilities for working with nucleotide sequence strings. ## Installation @@ -85,6 +89,17 @@ Get autocorrelation (rough indicator for secondary structure). 1.9238095238095236 ``` +Wobble base-pairing can be taken into account. + +```python +>>> correlation('GGGTTT') +0.0 +>>> correlation('GGGTTT', wobble=True) +2.3 +>>> correlation('GGGUUU', wobble=True) +2.3 +``` + Provide a second sequence to get correlation between sequences. ```python diff --git a/docs/source/usage.md b/docs/source/usage.md index 4b52f92..8cb7372 100644 --- a/docs/source/usage.md +++ b/docs/source/usage.md @@ -63,6 +63,17 @@ Get autocorrelation (rough indicator for secondary structure). 1.9238095238095236 ``` +Wobble base-pairing can be taken into account. + +```python +>>> correlation('GGGTTT') +0.0 +>>> correlation('GGGTTT', wobble=True) +2.3 +>>> correlation('GGGUUU', wobble=True) +2.3 +``` + Provide a second sequence to get correlation between sequences. ```python diff --git a/pyproject.toml b/pyproject.toml index 439455f..db06c76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,12 @@ [project] name = "streq" -version = "0.0.1" +version = "0.0.2" authors = [ { name="Eachan Johnson", email="eachan.johnson@crick.ac.uk" }, ] description = "Basic utilities for working with nucleotide sequence strings." readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.8" license = {file = "LICENSE"} keywords = ["nucleotide", "sequence", "science"] @@ -20,6 +20,8 @@ classifiers = [ "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3 :: Only", diff --git a/streq/distance.py b/streq/distance.py index 454ae3d..e5d9093 100644 --- a/streq/distance.py +++ b/streq/distance.py @@ -1,4 +1,10 @@ +"""Functions for calculating distances and similarities between sequences.""" + +from __future__ import annotations + +from collections.abc import Callable from difflib import SequenceMatcher +from functools import partial from .seqtools import reverse_complement from .utils import _normalize_case @@ -69,14 +75,13 @@ def levenshtein(x: str, y: str) -> int: @_normalize_case(nargs=2) -def hamming(x: str, y: str, - wobble: bool = False) -> int: +def hamming(x: str, y: str) -> int: """Calculate the Hamming distance between two sequences. - The Hamming distance is the number of mismatches. - - Note: the `wobble` parameter is not yet implemented. + The Hamming distance is the number of mismatches for two sequences + of identical length. This function truncates the longer sequence + to the shortest length. Parameters ---------- @@ -84,9 +89,6 @@ def hamming(x: str, y: str, Sequence. y : str, optional Second sequence for correlation with x. - wobble : bool, optional - Whether to calulate correlations taking into account G.U wobble. - Not yet implemented. Returns ------- @@ -103,7 +105,7 @@ def hamming(x: str, y: str, 3 """ - + return sum(a != b for a, b in zip(x, y)) @@ -147,19 +149,35 @@ def ratcliff_obershelp(x: str, return sum(code[0] != 'equal' for code in sm) +def mismatch_fun(x, y, n, wobble): + + wobbles = sum(wobble and ((a == 'G' and b == 'A') or (a in 'TU' and b == 'C')) for a, b in zip(x[n:], y)) + return hamming(x[n:], y) - wobbles + + +def _mismatch_fun(wobble: bool = False) -> Callable[[str, str, int], int]: + + return partial(mismatch_fun, wobble=wobble) + @_normalize_case(nargs=2) def correlation(x: str, y: str = '', wobble: bool = False) -> float: - """Get autocorrelation of a single sequence or + """Calculate autocorrelation of a single sequence or correlation between two sequences. - If a single sequence is provided, it's autocorrelation - is calculated, which might be an indicator of secondary structure. + If a single sequence is provided, its correlation with its + reverse complement is calculated, which might be an indicator + of secondary structure. - Note: the `wobble` parameter is not yet implemented. + If two sequences are provided, then the correlation between the + first sequence and the reverse complement of the second sequence + is calculated. This might be an indicator of binding affinity. + + If wobble is True, then the G.U basepairing is also taken into + account. Parameters ---------- @@ -169,7 +187,6 @@ def correlation(x: str, Second sequence for correlation with x. wobble : bool, optional Whether to calulate correlations taking into account G.U wobble. - Not yet implemented. Returns ------- @@ -187,16 +204,30 @@ def correlation(x: str, >>> correlation('AAACTTT') 1.9238095238095236 >>> correlation('AAA', 'TTT') - 0.0 + 3.0 >>> correlation('AAA', 'AAA') + 0.0 + >>> correlation('GGGTTT') + 0.0 + >>> correlation('GGGTTT', wobble=True) + 2.3 + >>> correlation('GGGUUU', wobble=True) + 2.3 + >>> correlation('GGG', 'UUU') + 0.0 + >>> correlation('GGG', 'UUU', wobble=True) 3.0 """ x = x - y = y or reverse_complement(x) + y = reverse_complement(y or x) - max_len = min(len(x), len(y)) - - return sum(((max_len - n) - hamming(x[n:], y, wobble)) / (max_len - n) - for n in range(len(x))) \ No newline at end of file + len_x = len(x) + + max_len = min(len_x, len(y)) + + mismatch_fun = _mismatch_fun(wobble=wobble) + + return sum(((max_len - n) - mismatch_fun(x, y, n)) / (max_len - n) + for n in range(len_x)) \ No newline at end of file diff --git a/streq/seqtools.py b/streq/seqtools.py index 4a9d6c8..5652992 100644 --- a/streq/seqtools.py +++ b/streq/seqtools.py @@ -5,8 +5,9 @@ """ +from __future__ import annotations + from collections.abc import Generator, Sequence -from difflib import SequenceMatcher import re from .utils import (sequences, diff --git a/streq/utils.py b/streq/utils.py index 3fc5671..be7caf4 100644 --- a/streq/utils.py +++ b/streq/utils.py @@ -1,5 +1,7 @@ """Miscellaneous utilities used in streq.""" +from __future__ import annotations + from collections import namedtuple from collections.abc import Callable, Sequence from functools import wraps From 34c250de660f2568124aa74d90d409803370c319 Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 2 Jun 2023 12:08:30 +0100 Subject: [PATCH 2/3] Update .gitignore --- .gitignore | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index b59571f..32ccb7a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,25 +2,18 @@ *.so *.egg-info *.whl -/build/lib -/build/bazel* -/dist/ -.ipynb_checkpoints -/bazel-* -.jax_configure.bazelrc -/tensorflow .DS_Store .mypy_cache/ .pytype/ -/docs/build -*_pb2.py -/docs/notebooks/.ipynb_checkpoints/ -/docs/_autosummary +docs/build +docs/_autosummary .idea .vscode .envrc -jax.iml -.bazelrc.user +/*/__pycache__/ +/test/*.xml +.coverage +.pytest_cache # virtualenv/venv directories /venv/ From 54cffa421a00e4d428d2704926523f8a8044dd91 Mon Sep 17 00:00:00 2001 From: Eachan Johnson Date: Fri, 2 Jun 2023 12:08:52 +0100 Subject: [PATCH 3/3] Improve testing workflow --- .github/workflows/python-package.yml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 841f2e3..aab947b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -3,11 +3,7 @@ name: Python package -on: - push: - branches: [ $default-branch ] - pull_request: - branches: [ $default-branch ] +on: [push] jobs: build: @@ -29,6 +25,7 @@ jobs: python -m pip install --upgrade pip python -m pip install flake8 pytest pytest-cov if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install -e . - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names @@ -37,4 +34,11 @@ jobs: flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest run: | - pytest streq --doctest-modules --junitxml=tests/test-results.xml --cov=com --cov-report=xml --cov-report=html \ No newline at end of file + pytest streq --doctest-modules --junitxml=test/test-results.xml --cov=com --cov-report=xml --cov-report=html + - name: Upload pytest test results + uses: actions/upload-artifact@v3 + with: + name: pytest-results-${{ matrix.python-version }} + path: junit/test-results-${{ matrix.python-version }}.xml + # Use always() to always run this step to publish test results when there are test failures + if: ${{ always() }} \ No newline at end of file