Source code for streq.seqtools

"""Python utilities for working with nucleotide sequence strings.

A variety of utilities for converting, searching, and performing
calculations on nucleotide sequences.

"""

from collections.abc import Generator, Sequence
from difflib import SequenceMatcher
import re

from .utils import (sequences, 
                    _preserve_case, 
                    _preserve_circular,
                    _normalize_case)

# Short module-local alias for the shared `sequences` object; the functions
# below read its `complementer`, `base2regex` and `re_sites` attributes.
seqs = sequences


@_preserve_circular
def reverse(x: str) -> str:

    """Return a sequence read back to front.

    The bases themselves are left untouched; only their order is flipped.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        Converted sequence.

    """

    # A slice with a step of -1 walks the sequence from its last element
    # to its first.
    flipped = x[::-1]
    return flipped
@_preserve_circular
@_preserve_case
@_normalize_case(nargs=1)
def complement(x: str) -> str:

    """Swap every base for its complement, keeping the original order.

    Use `reverse_complement` when the reversed strand is wanted.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        Converted sequence.

    """

    # The base-to-complement mapping lives in the shared translation table.
    table = seqs.complementer
    return x.translate(table)
def reverse_complement(x: str) -> str:

    """Return the reverse complement of a sequence.

    The sequence is first reversed and the result is then complemented.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        Converted sequence.

    Examples
    --------
    >>> reverse_complement('ATCG')
    'CGAT'

    """

    backwards = reverse(x)
    return complement(backwards)
@_preserve_circular
@_preserve_case
@_normalize_case(nargs=1)
def to_rna(x: str) -> str:

    """Rewrite a nucleotide sequence in the RNA alphabet.

    Every 'T' is replaced by 'U'; all other characters pass through
    unchanged.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        Converted sequence.

    Examples
    --------
    >>> to_rna('ATCG')
    'AUCG'

    """

    dna_base, rna_base = 'T', 'U'
    return x.replace(dna_base, rna_base)
@_preserve_circular
@_preserve_case
@_normalize_case(nargs=1)
def to_dna(x: str) -> str:

    """Rewrite a nucleotide sequence in the DNA alphabet.

    Every 'U' is replaced by 'T'; all other characters pass through
    unchanged.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        Converted sequence.

    Examples
    --------
    >>> to_dna('AUCG')
    'ATCG'

    """

    rna_base, dna_base = 'U', 'T'
    return x.replace(rna_base, dna_base)
@_normalize_case(nargs=2)
def find_iupac(query: str,
               sequence: str) -> Generator[tuple[tuple[int, int], str], None, None]:

    """Find occurrences of a query in a larger sequence.

    IUPAC codes in the query will be interpreted as ambiguities:

        A: A
        C: C
        G: G
        T: T
        U: U
        N: .
        R: "[AG]"
        Y: "[TUC]"
        W: "[ATU]"
        S: "[CG]"
        V: "[ACG]"
        B: "[TUGC]"

    The query is converted to a regular expression and the sequence is
    scanned left to right, so the reported matches never overlap.

    Parameters
    ----------
    query : str
        Sequence to search for. Accepts IUPAC codes: N, R, Y, S, W, V, B.
    sequence : str
        Sequence to search within.

    Yields
    ------
    indices : tuple
        Start and stop indices of the match, usable as a slice of
        `sequence`.
    sequence : str
        The matched stretch of `sequence`.

    Examples
    --------
    >>> for (start_idx, end_idx), match in find_iupac('ARY', 'AATAGCAGTGTGAAC'):
    ...     print(f"Found ARY at {start_idx}:{end_idx}: {match}")
    ...
    Found ARY at 0:3: AAT
    Found ARY at 3:6: AGC
    Found ARY at 6:9: AGT
    Found ARY at 12:15: AAC

    """

    # Expand each IUPAC letter into its regex fragment.
    # NOTE(review): characters absent from the translation table reach the
    # regex engine as-is - confirm callers never pass regex metacharacters.
    pattern = re.compile(query.translate(seqs.base2regex))

    for match in pattern.finditer(sequence):
        yield match.span(), match.group()
@_normalize_case(nargs=1)
def which_re_sites(x: str) -> Sequence[str]:

    """Name the Type IIS restriction enzymes that have a site in a sequence.

    An enzyme is reported when its recognition site, or the reverse
    complement of that site, occurs anywhere in the sequence. Each enzyme
    is reported at most once, in the iteration order of the site table.

    Currently only searches for the most commonly used Type IIS
    restriction sites for Golden Gate Cloning:

        BbsI: GAAGAC
        BsmBI: CGTCTC
        BtgZI: GCGATG
        PaqCI: CACCTGC
        SapI: GCTCTTC
        BsaI: GGTCTC

    Parameters
    ----------
    x : str
        Sequence to check.

    Returns
    -------
    tuple
        Names of the Type IIS restriction enzymes with a site in x.

    Examples
    --------
    >>> which_re_sites('AAAGAAG')
    ()
    >>> which_re_sites('AAAGAAGAC')
    ('BbsI',)
    >>> which_re_sites('AAAGAAGACACCTGC')
    ('BbsI', 'PaqCI')

    """

    present = []
    for enzyme, site in seqs.re_sites.items():
        if site in x:
            # Site found as written.
            present.append(enzyme)
        elif reverse_complement(site) in x:
            # Site found on the opposite strand.
            present.append(enzyme)
    return tuple(present)
@_normalize_case(nargs=1)
def count_re_sites(x: str) -> int:

    """Count Type IIS restriction enzymes with a site in a sequence.

    This is the length of the result of `which_re_sites`, so each enzyme
    contributes at most 1 to the count no matter how many times its site
    occurs.

    Currently only searches for the most commonly used Type IIS
    restriction sites for Golden Gate Cloning:

        BbsI: GAAGAC
        BsmBI: CGTCTC
        BtgZI: GCGATG
        PaqCI: CACCTGC
        SapI: GCTCTTC
        BsaI: GGTCTC

    Parameters
    ----------
    x : str
        Sequence to check.

    Returns
    -------
    int
        Number of distinct Type IIS restriction enzymes with a site in x.

    Examples
    --------
    >>> count_re_sites('AAAGAAG')
    0
    >>> count_re_sites('AAAGAAGAC')
    1
    >>> count_re_sites('AAAGAAGACACCTGC')
    2

    """

    return len(which_re_sites(x))
@_normalize_case(nargs=2)
def _x_content(x: str, y: str) -> float:

    """Return the fraction of letters in `x` that also occur in `y`.

    Parameters
    ----------
    x : str
        Sequence whose letters are tallied.
    y : str
        Letters that count as a hit.

    Returns
    -------
    float
        Number of hits divided by the length of `x`, or 0.0 when `x` is
        empty.

    """

    total = len(x)
    if total == 0:
        # Nothing to tally: report zero content rather than divide by zero.
        return 0.
    hits = sum(1 for letter in x if letter in y)
    return hits / total
def gc_content(x: str) -> float:

    """Return the fraction of a sequence made up of G or C.

    The IUPAC code S (G or C) is counted as well; every other letter
    counts against the total.

    Parameters
    ----------
    x : str
        Sequence.

    Returns
    -------
    float
        GC content.

    Examples
    --------
    >>> gc_content('AGGG')
    0.75

    """

    gc_letters = 'GCS'
    return _x_content(x, gc_letters)
def purine_content(x: str) -> float:

    """Return the fraction of a sequence made up of purines.

    G and A are counted, along with the IUPAC code R (A or G); every
    other letter counts against the total.

    Parameters
    ----------
    x : str
        Sequence.

    Returns
    -------
    float
        Purine content.

    Examples
    --------
    >>> purine_content('AUGGR')
    0.8

    """

    purine_letters = 'GAR'
    return _x_content(x, purine_letters)
def pyrimidine_content(x: str) -> float:

    """Return the fraction of a sequence made up of pyrimidines.

    C, U and T are counted, along with the IUPAC code Y (C, T or U);
    every other letter counts against the total.

    Parameters
    ----------
    x : str
        Sequence.

    Returns
    -------
    float
        Pyrimidine content.

    Examples
    --------
    >>> pyrimidine_content('AUGGG')
    0.2

    """

    pyrimidine_letters = 'CUTY'
    return _x_content(x, pyrimidine_letters)