Source code for streq.seqtools

"""Python utilities for working with nucleotide sequence strings.

A variety of utilities for converting, searching, and performing
calculations on nucleotide sequences.

"""

from __future__ import annotations

from collections.abc import Generator, Sequence
import re

from .utils import (sequences, 
                    _preserve_case, 
                    _preserve_circular,
                    _normalize_case)

# Short module-local alias for the lookup tables imported from .utils; the
# functions below read seqs.complementer, seqs.base2regex and seqs.re_sites.
seqs = sequences


@_preserve_circular
def reverse(x: str) -> str:

    """Return a sequence read back to front.

    The bases themselves are left untouched; only their order changes.

    Parameters
    ----------
    x : str
        Sequence to reverse.

    Returns
    -------
    str
        The characters of `x` in reverse order.

    """

    # A full-range slice with a step of -1 walks the sequence backwards.
    backwards = slice(None, None, -1)
    return x[backwards]
@_preserve_circular
@_preserve_case
@_normalize_case(nargs=1)
def complement(x: str) -> str:

    """Swap every base for its complement without changing the order.

    The substitution is driven by the `seqs.complementer` translation
    table, applied character by character.

    Parameters
    ----------
    x : str
        Sequence to complement.

    Returns
    -------
    str
        Complemented sequence, in the same orientation as the input.

    """

    translation_table = seqs.complementer
    complemented = x.translate(translation_table)
    return complemented
def reverse_complement(x: str) -> str:

    """Return the reverse complement of a sequence.

    The sequence is first reversed and the result is then complemented.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        Reverse-complemented sequence.

    Examples
    --------
    >>> reverse_complement('ATCG')
    'CGAT'

    """

    reversed_sequence = reverse(x)
    return complement(reversed_sequence)
@_preserve_circular
@_preserve_case
@_normalize_case(nargs=1)
def to_rna(x: str) -> str:

    """Rewrite a nucleotide sequence in the RNA alphabet.

    Every 'T' is replaced with 'U'; all other characters pass through
    unchanged.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        Sequence with 'T' replaced by 'U'.

    Examples
    --------
    >>> to_rna('ATCG')
    'AUCG'

    """

    dna_base, rna_base = 'T', 'U'
    return x.replace(dna_base, rna_base)
@_preserve_circular
@_preserve_case
@_normalize_case(nargs=1)
def to_dna(x: str) -> str:

    """Rewrite a nucleotide sequence in the DNA alphabet.

    Every 'U' is replaced with 'T'; all other characters pass through
    unchanged.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        Sequence with 'U' replaced by 'T'.

    Examples
    --------
    >>> to_dna('AUCG')
    'ATCG'

    """

    rna_base, dna_base = 'U', 'T'
    return x.replace(rna_base, dna_base)
@_normalize_case(nargs=2)
def find_iupac(
    query: str,
    sequence: str,
) -> Generator[tuple[tuple[int, int], str], None, None]:

    """Find occurrences of a query in a larger sequence.

    The query is translated character by character through
    `seqs.base2regex` into a regular expression, which is then scanned
    across `sequence`. Matches are reported left to right and do not
    overlap (the behaviour of `re.Pattern.finditer`).

    IUPAC codes in the query will be interpreted as ambiguities:

        A: A
        C: C
        G: G
        T: T
        U: U
        N: .
        R: "[AG]"
        Y: "[TUC]"
        W: "[ATU]"
        S: "[CG]"
        V: "[ACG]"
        B: "[TUGC]"

    Parameters
    ----------
    query : str
        Sequence to search for. Accepts IUPAC codes:
        N, R, Y, S, W, V, B.
    sequence : str
        Sequence to search within.

    Yields
    ------
    indices : tuple of int
        Start and stop indices of the match, such that
        ``sequence[start:stop]`` is the matched text.
    sequence : str
        Matched sequence.

    Examples
    --------
    >>> for (start_idx, end_idx), match in find_iupac('ARY', 'AATAGCAGTGTGAAC'):
    ...     print(f"Found ARY at {start_idx}:{end_idx}: {match}")
    ...
    Found ARY at 0:3: AAT
    Found ARY at 3:6: AGC
    Found ARY at 6:9: AGT
    Found ARY at 12:15: AAC

    """

    # Keep the compiled pattern in its own name rather than rebinding the
    # `str` parameter `query` to a different type.
    # NOTE(review): characters not covered by seqs.base2regex pass through
    # untranslated, so regex metacharacters in the query would be treated
    # as regex syntax -- confirm against the table in .utils.
    pattern = re.compile(query.translate(seqs.base2regex))

    for match in pattern.finditer(sequence):
        yield match.span(), match.group()
@_normalize_case(nargs=1)
def which_re_sites(x: str) -> Sequence[str]:

    """Name the Type IIS restriction enzymes whose sites occur in a sequence.

    An enzyme is reported when its recognition site, or the reverse
    complement of that site, appears anywhere in `x`. Each enzyme is
    reported at most once, in the iteration order of `seqs.re_sites`.

    Currently only searches for the most commonly used Type IIS
    restriction sites for Golden Gate Cloning:

        BbsI: GAAGAC
        BsmBI: CGTCTC
        BtgZI: GCGATG
        PaqCI: CACCTGC
        SapI: GCTCTTC
        BsaI: GGTCTC

    Parameters
    ----------
    x : str
        Sequence to check.

    Returns
    -------
    tuple
        Names of the Type IIS restriction enzymes with a site in `x`.

    Examples
    --------
    >>> which_re_sites('AAAGAAG')
    ()
    >>> which_re_sites('AAAGAAGAC')
    ('BbsI',)
    >>> which_re_sites('AAAGAAGACACCTGC')
    ('BbsI', 'PaqCI')

    """

    enzymes_found = []

    for enzyme, site in seqs.re_sites.items():
        if site in x:
            # Site present on the given strand.
            enzymes_found.append(enzyme)
        elif reverse_complement(site) in x:
            # Site present on the opposite strand.
            enzymes_found.append(enzyme)

    return tuple(enzymes_found)
@_normalize_case(nargs=1)
def count_re_sites(x: str) -> int:

    """Count Type IIS restriction sites in sequence.

    The result is the number of distinct enzymes reported by
    `which_re_sites`, so an enzyme whose site occurs several times in
    `x` (or on both strands) still contributes only 1 to the count.

    Currently only searches for the most commonly used Type IIS
    restriction sites for Golden Gate Cloning:

        BbsI: GAAGAC
        BsmBI: CGTCTC
        BtgZI: GCGATG
        PaqCI: CACCTGC
        SapI: GCTCTTC
        BsaI: GGTCTC

    Parameters
    ----------
    x : str
        Sequence to check.

    Returns
    -------
    int
        Number of Type IIS restriction enzymes with a site in `x`.

    Examples
    --------
    >>> count_re_sites('AAAGAAG')
    0
    >>> count_re_sites('AAAGAAGAC')
    1
    >>> count_re_sites('AAAGAAGACACCTGC')
    2

    """

    return len(which_re_sites(x))
@_normalize_case(nargs=2)
def _x_content(x: str, y: str) -> float:

    """Return the fraction of characters in `x` that also appear in `y`.

    Parameters
    ----------
    x : str
        Sequence to measure.
    y : str
        Characters that count towards the total.

    Returns
    -------
    float
        Number of matching characters divided by the length of `x`,
        or 0.0 when `x` is empty.

    """

    total = len(x)
    if total == 0:
        # An empty sequence has no content of any kind.
        return 0.

    matching = sum(1 for letter in x if letter in y)
    return matching / total
def gc_content(x: str) -> float:

    """Calculate proportional GC content.

    Counts the letters G and C plus the IUPAC code S (C or G) and
    divides by the sequence length. Other ambiguity codes count
    towards the length only.

    Parameters
    ----------
    x : str
        Sequence.

    Returns
    -------
    float
        GC content as a fraction between 0 and 1.

    Examples
    --------
    >>> gc_content('AGGG')
    0.75

    """

    gc_letters = 'GCS'
    return _x_content(x, gc_letters)
def purine_content(x: str) -> float:

    """Calculate proportional purine content.

    Counts the letters G and A plus the IUPAC code R (A or G) and
    divides by the sequence length. Other ambiguity codes count
    towards the length only.

    Parameters
    ----------
    x : str
        Sequence.

    Returns
    -------
    float
        Purine content as a fraction between 0 and 1.

    Examples
    --------
    >>> purine_content('AUGGR')
    0.8

    """

    purine_letters = 'GAR'
    return _x_content(x, purine_letters)
def pyrimidine_content(x: str) -> float:

    """Calculate proportional pyrimidine content.

    Counts the letters C, U and T plus the IUPAC code Y (C, T or U)
    and divides by the sequence length. Other ambiguity codes count
    towards the length only.

    Parameters
    ----------
    x : str
        Sequence.

    Returns
    -------
    float
        Pyrimidine content as a fraction between 0 and 1.

    Examples
    --------
    >>> pyrimidine_content('AUGGG')
    0.2

    """

    pyrimidine_letters = 'CUTY'
    return _x_content(x, pyrimidine_letters)