From 4dacd765fed79c7df48b77a6c935709c4d2e2529 Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 2 Jun 2023 12:08:06 +0100
Subject: [PATCH 1/3] Add wobble to correlation()

---
 README.md            | 15 ++++++++++
 docs/source/usage.md | 11 +++++++
 pyproject.toml       |  6 ++--
 streq/distance.py    | 71 +++++++++++++++++++++++++++++++-------------
 streq/seqtools.py    |  3 +-
 streq/utils.py       |  2 ++
 6 files changed, 85 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index f5ade88..0cbc0de 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,9 @@
 # 🧬 streq
 
+![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/scbirlab/streq/python-publish.yml)
+![PyPI - Python Version](https://img.shields.io/pypi/pyversions/streq)
+![PyPI](https://img.shields.io/pypi/v/streq)
+
 Python utilities for working with nucleotide sequence strings.
 
 ## Installation
@@ -85,6 +89,17 @@ Get autocorrelation (rough indicator for secondary structure).
 1.9238095238095236
 ```
 
+Wobble base-pairing can be taken into account.
+
+```python
+>>> correlation('GGGTTT')
+0.0
+>>> correlation('GGGTTT', wobble=True)
+2.3
+>>> correlation('GGGUUU', wobble=True)
+2.3
+```
+
 Provide a second sequence to get correlation between sequences.
 
 ```python
diff --git a/docs/source/usage.md b/docs/source/usage.md
index 4b52f92..8cb7372 100644
--- a/docs/source/usage.md
+++ b/docs/source/usage.md
@@ -63,6 +63,17 @@ Get autocorrelation (rough indicator for secondary structure).
 1.9238095238095236
 ```
 
+Wobble base-pairing can be taken into account.
+
+```python
+>>> correlation('GGGTTT')
+0.0
+>>> correlation('GGGTTT', wobble=True)
+2.3
+>>> correlation('GGGUUU', wobble=True)
+2.3
+```
+
 Provide a second sequence to get correlation between sequences.
 
 ```python
diff --git a/pyproject.toml b/pyproject.toml
index 439455f..db06c76 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,12 +1,12 @@
 [project]
 name = "streq"
-version = "0.0.1"
+version = "0.0.2"
 authors = [
   { name="Eachan Johnson", email="eachan.johnson@crick.ac.uk" },
 ]
 description = "Basic utilities for working with nucleotide sequence strings."
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = ">=3.8"
 license = {file = "LICENSE"}
 keywords = ["nucleotide", "sequence", "science"]
 
@@ -20,6 +20,8 @@ classifiers = [
 
   "License :: OSI Approved :: MIT License",
 
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3 :: Only",
diff --git a/streq/distance.py b/streq/distance.py
index 454ae3d..e5d9093 100644
--- a/streq/distance.py
+++ b/streq/distance.py
@@ -1,4 +1,10 @@
+"""Functions for calculating distances and similarities between sequences."""
+
+from __future__ import annotations
+
+from collections.abc import Callable
 from difflib import SequenceMatcher
+from functools import partial
 
 from .seqtools import reverse_complement
 from .utils import _normalize_case
@@ -69,14 +75,13 @@ def levenshtein(x: str, y: str) -> int:
 
 
 @_normalize_case(nargs=2)
-def hamming(x: str, y: str, 
-	        wobble: bool = False) -> int:
+def hamming(x: str, y: str) -> int:
     
     """Calculate the Hamming distance between two sequences.
     
-    The Hamming distance is the number of mismatches.
-
-    Note: the `wobble` parameter is not yet implemented.
+    The Hamming distance is the number of mismatches for two sequences 
+    of identical length. This function truncates the longer sequence
+    to the shortest length.
 
     Parameters
     ----------
@@ -84,9 +89,6 @@ def hamming(x: str, y: str,
         Sequence.
     y : str, optional
         Second sequence for correlation with x.
-    wobble : bool, optional
-        Whether to calulate correlations taking into account G.U wobble.
-        Not yet implemented.
 
     Returns
     -------
@@ -103,7 +105,7 @@ def hamming(x: str, y: str,
     3
 
     """
-
+    
     return sum(a != b for a, b in zip(x, y))
 
 
@@ -147,19 +149,35 @@ def ratcliff_obershelp(x: str,
     
 	return sum(code[0] != 'equal' for code in sm)
 
+def mismatch_fun(x, y, n, wobble):
+    """Return hamming(x[n:], y) less, if wobble is truthy, the positions pairing 'G' with 'A' or 'T'/'U' with 'C'."""
+    wobbles = sum(wobble and ((a == 'G' and b == 'A') or (a in 'TU' and b == 'C')) for a, b in zip(x[n:], y))
+    return hamming(x[n:], y) - wobbles
+
+
+def _mismatch_fun(wobble: bool = False) -> Callable[[str, str, int], int]:
+    """Return `mismatch_fun` with its `wobble` argument pre-bound via `functools.partial`."""
+    return  partial(mismatch_fun, wobble=wobble)
+
 
 @_normalize_case(nargs=2)
 def correlation(x: str, 
                 y: str = '',
                 wobble: bool = False) -> float:
     
-    """Get autocorrelation of a single sequence or 
+    """Calculate autocorrelation of a single sequence or 
     correlation between two sequences.
 
-    If a single sequence is provided, it's autocorrelation 
-    is calculated, which might be an indicator of secondary structure.
+    If a single sequence is provided, its correlation with its
+    reverse complement is calculated, which might be an indicator 
+    of secondary structure.
 
-    Note: the `wobble` parameter is not yet implemented.
+    If two sequences are provided, then the correlation between the 
+    first sequence and the reverse complement of the second sequence
+    is calculated. This might be an indicator of binding affinity. 
+
+    If wobble is True, then the G.U basepairing is also taken into 
+    account.
 
     Parameters
     ----------
@@ -169,7 +187,6 @@ def correlation(x: str,
         Second sequence for correlation with x.
     wobble : bool, optional
         Whether to calulate correlations taking into account G.U wobble.
-        Not yet implemented.
 
     Returns
     -------
@@ -187,16 +204,30 @@ def correlation(x: str,
     >>> correlation('AAACTTT')
     1.9238095238095236
     >>> correlation('AAA', 'TTT')
-    0.0
+    3.0
     >>> correlation('AAA', 'AAA')
+    0.0
+    >>> correlation('GGGTTT')
+    0.0
+    >>> correlation('GGGTTT', wobble=True)
+    2.3
+    >>> correlation('GGGUUU', wobble=True)
+    2.3
+    >>> correlation('GGG', 'UUU')
+    0.0
+    >>> correlation('GGG', 'UUU', wobble=True)
     3.0
 
     """
     
     x = x
-    y = y or reverse_complement(x)
+    y = reverse_complement(y or x)
 
-    max_len = min(len(x), len(y))
-    
-    return sum(((max_len - n) - hamming(x[n:], y, wobble)) / (max_len - n)
-               for n in range(len(x)))
\ No newline at end of file
+    len_x, len_y = len(x), len(y)
+
+    mismatch_fun = _mismatch_fun(wobble=wobble)
+
+    # Only min(len_x - n, len_y) positions are compared at offset n. Normalising
+    # by that count avoids a ZeroDivisionError when y is shorter than x.
+    overlaps = (min(len_x - n, len_y) for n in range(len_x))
+    return sum((m - mismatch_fun(x, y, n)) / m for n, m in enumerate(overlaps))
\ No newline at end of file
diff --git a/streq/seqtools.py b/streq/seqtools.py
index 4a9d6c8..5652992 100644
--- a/streq/seqtools.py
+++ b/streq/seqtools.py
@@ -5,8 +5,9 @@
 
 """
 
+from __future__ import annotations
+
 from collections.abc import Generator, Sequence
-from difflib import SequenceMatcher
 import re
 
 from .utils import (sequences, 
diff --git a/streq/utils.py b/streq/utils.py
index 3fc5671..be7caf4 100644
--- a/streq/utils.py
+++ b/streq/utils.py
@@ -1,5 +1,7 @@
 """Miscellaneous utilities used in streq."""
 
+from __future__ import annotations
+
 from collections import namedtuple
 from collections.abc import Callable, Sequence
 from functools import wraps

From 34c250de660f2568124aa74d90d409803370c319 Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 2 Jun 2023 12:08:30 +0100
Subject: [PATCH 2/3] Update .gitignore

---
 .gitignore | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/.gitignore b/.gitignore
index b59571f..32ccb7a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,25 +2,18 @@
 *.so
 *.egg-info
 *.whl
-/build/lib
-/build/bazel*
-/dist/
-.ipynb_checkpoints
-/bazel-*
-.jax_configure.bazelrc
-/tensorflow
 .DS_Store
 .mypy_cache/
 .pytype/
-/docs/build
-*_pb2.py
-/docs/notebooks/.ipynb_checkpoints/
-/docs/_autosummary
+docs/build
+docs/_autosummary
 .idea
 .vscode
 .envrc
-jax.iml
-.bazelrc.user
+/*/__pycache__/
+/test/*.xml
+.coverage
+.pytest_cache
 
 # virtualenv/venv directories
 /venv/

From 54cffa421a00e4d428d2704926523f8a8044dd91 Mon Sep 17 00:00:00 2001
From: Eachan Johnson <eachan.johnson@crick.ac.uk>
Date: Fri, 2 Jun 2023 12:08:52 +0100
Subject: [PATCH 3/3] Improve testing workflow

---
 .github/workflows/python-package.yml | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 841f2e3..aab947b 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -3,11 +3,7 @@
 
 name: Python package
 
-on:
-  push:
-    branches: [ $default-branch ]
-  pull_request:
-    branches: [ $default-branch ]
+on: [push]
 
 jobs:
   build:
@@ -29,6 +25,7 @@ jobs:
         python -m pip install --upgrade pip
         python -m pip install flake8 pytest pytest-cov
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        pip install -e .
     - name: Lint with flake8
       run: |
         # stop the build if there are Python syntax errors or undefined names
@@ -37,4 +34,11 @@ jobs:
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with pytest
       run: |
-        pytest streq --doctest-modules --junitxml=tests/test-results.xml --cov=com --cov-report=xml --cov-report=html
\ No newline at end of file
+        pytest streq --doctest-modules --junitxml=test/test-results.xml --cov=streq --cov-report=xml --cov-report=html
+    - name: Upload pytest test results
+      uses: actions/upload-artifact@v3
+      with:
+          name: pytest-results-${{ matrix.python-version }}
+          path: test/test-results.xml
+      # Use always() to always run this step to publish test results when there are test failures
+      if: ${{ always() }}
\ No newline at end of file