"""Python utilities for working with nucleotide sequence strings.
A variety of utilities for converting, searching, and performing
calculations on nucleotide sequences.
"""
from __future__ import annotations
from collections.abc import Generator, Sequence
import re
from .utils import (sequences,
_preserve_case,
_preserve_circular,
_normalize_case)
# Module-level shorthand for the `sequences` object imported from `.utils`.
# The functions below read `seqs.complementer`, `seqs.base2regex` and
# `seqs.re_sites` from it.
seqs = sequences
@_preserve_circular
def reverse(x: str) -> str:
    """Return a sequence read backwards.

    Parameters
    ----------
    x : str
        Sequence to reverse.

    Returns
    -------
    str
        The characters of `x` in the opposite order.

    """
    # A full-range slice with step -1 walks the sequence from end to start.
    backwards = slice(None, None, -1)
    return x[backwards]
@_preserve_circular
@_preserve_case
@_normalize_case(nargs=1)
def complement(x: str) -> str:
    """Replace every base with its complement, keeping the original order.

    The sequence is *not* reversed; use `reverse_complement` for that.

    Parameters
    ----------
    x : str
        Sequence to complement.

    Returns
    -------
    str
        `x` translated through the `seqs.complementer` table.

    """
    complement_table = seqs.complementer
    return x.translate(complement_table)
def reverse_complement(x: str) -> str:
    """Return the reverse complement of a sequence.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        The sequence reversed with `reverse` and then complemented with
        `complement`.

    Examples
    --------
    >>> reverse_complement('ATCG')
    'CGAT'

    """
    flipped = reverse(x)
    return complement(flipped)
@_preserve_circular
@_preserve_case
@_normalize_case(nargs=1)
def to_rna(x: str) -> str:
    """Rewrite a nucleotide sequence in the RNA alphabet.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        The sequence with every ``T`` replaced by ``U``.

    Examples
    --------
    >>> to_rna('ATCG')
    'AUCG'

    """
    dna_base, rna_base = 'T', 'U'
    return x.replace(dna_base, rna_base)
@_preserve_circular
@_preserve_case
@_normalize_case(nargs=1)
def to_dna(x: str) -> str:
    """Rewrite a nucleotide sequence in the DNA alphabet.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        The sequence with every ``U`` replaced by ``T``.

    Examples
    --------
    >>> to_dna('AUCG')
    'ATCG'

    """
    rna_base, dna_base = 'U', 'T'
    return x.replace(rna_base, dna_base)
@_normalize_case(nargs=2)
def find_iupac(
    query: str,
    sequence: str,
) -> Generator[tuple[tuple[int, int], str], None, None]:
    """Find occurrences of a query in a larger sequence.

    The query is translated character-by-character through
    ``seqs.base2regex`` and compiled as a regular expression, so IUPAC
    codes in the query are interpreted as ambiguities::

        A: A
        C: C
        G: G
        T: T
        U: U
        N: .
        R: "[AG]"
        Y: "[TUC]"
        W: "[ATU]"
        S: "[CG]"
        V: "[ACG]"
        B: "[TUGC]"

    Matches are located with ``re.Pattern.finditer``, which scans left to
    right and reports non-overlapping matches only.

    Parameters
    ----------
    query : str
        Sequence to search for. Accepts IUPAC codes: N, R, Y, S, W, V, B.
    sequence : str
        Sequence to search within.

    Yields
    ------
    tuple
        A pair ``((start, stop), matched)`` where ``start`` and ``stop``
        are the indices of the match within `sequence` and ``matched`` is
        the matched substring.

    Examples
    --------
    >>> for (start_idx, end_idx), match in find_iupac('ARY', 'AATAGCAGTGTGAAC'):
    ...     print(f"Found ARY at {start_idx}:{end_idx}: {match}")
    ...
    Found ARY at 0:3: AAT
    Found ARY at 3:6: AGC
    Found ARY at 6:9: AGT
    Found ARY at 12:15: AAC

    """
    # Keep the compiled pattern in its own name rather than rebinding the
    # `str` parameter to a `re.Pattern`.
    pattern = re.compile(query.translate(seqs.base2regex))
    for match in pattern.finditer(sequence):
        yield match.span(), match.group()
@_normalize_case(nargs=1)
def which_re_sites(x: str) -> Sequence[str]:
    """Name the Type IIS restriction enzymes whose site occurs in a sequence.

    Currently only searches for the most commonly used
    Type IIS restriction sites for Golden Gate Cloning::

        BbsI: GAAGAC
        BsmBI: CGTCTC
        BtgZI: GCGATG
        PaqCI: CACCTGC
        SapI: GCTCTTC
        BsaI: GGTCTC

    An enzyme is reported when either its recognition site or the reverse
    complement of that site is a substring of `x`. Each enzyme is listed
    at most once, in the iteration order of ``seqs.re_sites``.

    Parameters
    ----------
    x : str
        Sequence to check.

    Returns
    -------
    tuple
        Names of the Type IIS restriction enzymes with a site in `x`.

    Examples
    --------
    >>> which_re_sites('AAAGAAG')
    ()
    >>> which_re_sites('AAAGAAGAC')
    ('BbsI',)
    >>> which_re_sites('AAAGAAGACACCTGC')
    ('BbsI', 'PaqCI')

    """
    enzymes_found = []
    for enzyme, recognition_site in seqs.re_sites.items():
        if recognition_site in x:
            enzymes_found.append(enzyme)
        elif reverse_complement(recognition_site) in x:
            # Site is present on the opposite strand.
            enzymes_found.append(enzyme)
    return tuple(enzymes_found)
@_normalize_case(nargs=1)
def count_re_sites(x: str) -> int:
    """Count Type IIS restriction sites in sequence.

    Currently only searches for the most commonly used
    Type IIS restriction sites for Golden Gate Cloning::

        BbsI: GAAGAC
        BsmBI: CGTCTC
        BtgZI: GCGATG
        PaqCI: CACCTGC
        SapI: GCTCTTC
        BsaI: GGTCTC

    The count is the length of the tuple returned by `which_re_sites`, so
    it is the number of distinct enzymes with a site in `x`; an enzyme
    whose site occurs several times is counted once.

    Parameters
    ----------
    x : str
        Sequence to check.

    Returns
    -------
    int
        Number of Type IIS restriction enzymes with a site in `x`.

    Examples
    --------
    >>> count_re_sites('AAAGAAG')
    0
    >>> count_re_sites('AAAGAAGAC')
    1
    >>> count_re_sites('AAAGAAGACACCTGC')
    2

    """
    return len(which_re_sites(x))
@_normalize_case(nargs=2)
def _x_content(x: str, y: str) -> float:
    """Return the fraction of characters in `x` that also occur in `y`.

    Parameters
    ----------
    x : str
        Sequence to examine.
    y : str
        Characters to count.

    Returns
    -------
    float
        Matching characters divided by ``len(x)``, or ``0.`` when `x` is
        empty.

    """
    hits = sum(1 for base in x if base in y)
    try:
        return hits / len(x)
    except ZeroDivisionError:
        # Empty sequence: report zero content rather than raising.
        return 0.
def gc_content(x: str) -> float:
    """Calculate the proportion of a sequence that is G or C.

    Counts ``G``, ``C`` and the IUPAC code ``S`` (G or C).

    Parameters
    ----------
    x : str
        Sequence.

    Returns
    -------
    float
        GC content as a fraction of the sequence length.

    Examples
    --------
    >>> gc_content('AGGG')
    0.75

    """
    gc_symbols = 'GCS'
    return _x_content(x, gc_symbols)
def purine_content(x: str) -> float:
    """Calculate the proportion of a sequence that is purine.

    Counts ``G``, ``A`` and the IUPAC code ``R`` (A or G).

    Parameters
    ----------
    x : str
        Sequence.

    Returns
    -------
    float
        Purine content as a fraction of the sequence length.

    Examples
    --------
    >>> purine_content('AUGGR')
    0.8

    """
    purine_symbols = 'GAR'
    return _x_content(x, purine_symbols)
def pyrimidine_content(x: str) -> float:
    """Calculate the proportion of a sequence that is pyrimidine.

    Counts ``C``, ``U``, ``T`` and the IUPAC code ``Y`` (T, U or C).

    Parameters
    ----------
    x : str
        Sequence.

    Returns
    -------
    float
        Pyrimidine content as a fraction of the sequence length.

    Examples
    --------
    >>> pyrimidine_content('AUGGG')
    0.2

    """
    pyrimidine_symbols = 'CUTY'
    return _x_content(x, pyrimidine_symbols)