"""Functions for calculating distances and similarities between sequences."""
from __future__ import annotations
from collections.abc import Callable
from difflib import SequenceMatcher
from functools import partial
from .seqtools import reverse_complement
from .utils import _normalize_case
[docs]@_normalize_case(nargs=2)
def levenshtein(x: str, y: str) -> int:
"""Calculate the Levenshtein distance between two sequences.
The Levenshtein distance is the number of insertions,
deletions, and mutations required to make two
sequences match.
Parameters
----------
x : str
Sequence.
y : str, optional
Second sequence for correlation with x.
Returns
-------
int
Levenshtein distance.
Examples
--------
>>> levenshtein('AAATTT', 'AAATTT')
0
>>> levenshtein('AAATTT', 'ACTTT')
2
>>> levenshtein('AAATTT', 'AACTTT')
1
>>> levenshtein('AAAG', 'TCGA')
4
"""
x_len, y_len = map(len, (x, y))
matrix = {i: {j: 0 for j in range(y_len + 1)}
for i in range(x_len + 1)}
for i in range(x_len + 1):
matrix[i][0] = i
for j in range(y_len + 1):
matrix[0][j] = j
for i in range(1, x_len + 1):
for j in range(1, y_len + 1):
if x[i - 1] == y[j - 1]:
matrix[i][j] = matrix[i - 1][j - 1]
else:
minimum = min(matrix[i - 1][j],
matrix[i][j - 1],
matrix[i - 1][j - 1])
matrix[i][j] = minimum + 1
return matrix[x_len][y_len]
[docs]@_normalize_case(nargs=2)
def hamming(x: str, y: str) -> int:
"""Calculate the Hamming distance between two sequences.
The Hamming distance is the number of mismatches for two sequences
of identical length. This function truncates the longer sequence
to the shortest length.
Parameters
----------
x : str
Sequence.
y : str, optional
Second sequence for correlation with x.
Returns
-------
int
Hamming distance.
Examples
--------
>>> hamming('AAA', 'ATA')
1
>>> hamming('AAA', 'ATT')
2
>>> hamming('AAA', 'TTT')
3
"""
return sum(a != b for a, b in zip(x, y))
[docs]@_normalize_case(nargs=2)
def ratcliff_obershelp(x: str,
y: str) -> int:
"""Calculate the Ratcliff-Obershelp distance between two sequences.
The Ratcliff-Obershelp distance is the number of grouped
insertions, deletions, and mutations required to make two
sequences match.
Parameters
----------
x : str
Sequence.
y : str, optional
Second sequence for correlation with x.
Returns
-------
int
Ratcliff-Obershelp distance.
Examples
--------
>>> ratcliff_obershelp('AAATTT', 'AAATTT')
0
>>> ratcliff_obershelp('AAATTT', 'ACTTT')
1
>>> ratcliff_obershelp('AAATTT', 'AACTTT')
1
>>> ratcliff_obershelp('AAAG', 'TCGA')
2
"""
sm = SequenceMatcher(a=x, b=y,
autojunk=False).get_opcodes()
return sum(code[0] != 'equal' for code in sm)
[docs]def mismatch_fun(x, y, n, wobble):
wobbles = sum(wobble and ((a == 'G' and b == 'A') or (a in 'TU' and b == 'C')) for a, b in zip(x[n:], y))
return hamming(x[n:], y) - wobbles
def _mismatch_fun(wobble: bool = False) -> Callable[[str, str, int], int]:
return partial(mismatch_fun, wobble=wobble)
[docs]@_normalize_case(nargs=2)
def correlation(x: str,
y: str = '',
wobble: bool = False) -> float:
"""Calculate autocorrelation of a single sequence or
correlation between two sequences.
If a single sequence is provided, its correlation with its
reverse complement is calculated, which might be an indicator
of secondary structure.
If two sequences are provided, then the correlation between the
first sequence and the reverse complement of the second sequence
is calculated. This might be an indicator of binding affinity.
If wobble is True, then the G.U basepairing is also taken into
account.
Parameters
----------
x : str
Sequence.
y : str, optional
Second sequence for correlation with x.
wobble : bool, optional
Whether to calulate correlations taking into account G.U wobble.
Returns
-------
float
Correlation.
Examples
--------
>>> correlation('AACC')
0.0
>>> correlation('AAATTT')
2.3
>>> correlation('AAATTCT')
1.3047619047619046
>>> correlation('AAACTTT')
1.9238095238095236
>>> correlation('AAA', 'TTT')
3.0
>>> correlation('AAA', 'AAA')
0.0
>>> correlation('GGGTTT')
0.0
>>> correlation('GGGTTT', wobble=True)
2.3
>>> correlation('GGGUUU', wobble=True)
2.3
>>> correlation('GGG', 'UUU')
0.0
>>> correlation('GGG', 'UUU', wobble=True)
3.0
"""
x = x
y = reverse_complement(y or x)
len_x = len(x)
max_len = min(len_x, len(y))
mismatch_fun = _mismatch_fun(wobble=wobble)
return sum(((max_len - n) - mismatch_fun(x, y, n)) / (max_len - n)
for n in range(len_x))