# Source code for streq.seqtools

"""Python utilities for working with nucleotide sequence strings.

A variety of utilities for converting, searching, and performing
calculations on nucleotide sequences.

"""

from collections.abc import Generator, Sequence
from difflib import SequenceMatcher
import re

from .utils import (sequences, 
                    _preserve_case, 
                    _preserve_circular,
                    _normalize_case)

# Short module-level alias for the `sequences` object imported from `.utils`;
# the functions below read its lookup tables through this name.
seqs = sequences


@_preserve_circular
def reverse(x: str) -> str:

    """Reverse a sequence.

    The bases are returned in the opposite order; no complementing
    is performed.

    Parameters
    ----------
    x : str
        Sequence to reverse.

    Returns
    -------
    str
        Reversed sequence.

    Examples
    --------
    >>> reverse('ATCG')
    'GCTA'

    """

    return x[::-1]


@_preserve_circular
@_preserve_case
@_normalize_case(nargs=1)
def complement(x: str) -> str:

    """Complement (but don't reverse) a sequence.

    Each character is mapped through the `seqs.complementer`
    translation table; the order of the bases is unchanged.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        Complemented sequence.

    Examples
    --------
    >>> complement('ATCG')
    'TAGC'

    """

    return x.translate(seqs.complementer)


def reverse_complement(x: str) -> str:

    """Reverse complement a sequence.

    Equivalent to reversing the sequence with `reverse` and then
    complementing the result with `complement`.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        Reverse-complemented sequence.

    Examples
    --------
    >>> reverse_complement('ATCG')
    'CGAT'

    """

    return complement(reverse(x))


@_preserve_circular
@_preserve_case
@_normalize_case(nargs=1)
def to_rna(x: str) -> str:

    """Convert nucleotides to RNA.

    Every 'T' in the sequence is replaced with 'U'; all other
    characters are left untouched.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        Converted sequence.

    Examples
    --------
    >>> to_rna('ATCG')
    'AUCG'

    """

    return x.replace('T', 'U')


@_preserve_circular
@_preserve_case
@_normalize_case(nargs=1)
def to_dna(x: str) -> str:

    """Convert nucleotides to DNA.

    Every 'U' in the sequence is replaced with 'T'; all other
    characters are left untouched.

    Parameters
    ----------
    x : str
        Sequence to convert.

    Returns
    -------
    str
        Converted sequence.

    Examples
    --------
    >>> to_dna('AUCG')
    'ATCG'

    """

    return x.replace('U', 'T')


@_normalize_case(nargs=2)
def find_iupac(
    query: str,
    sequence: str,
) -> Generator[tuple[tuple[int, int], str], None, None]:

    """Find occurrences of a query in a larger sequence.

    IUPAC codes in the query will be interpreted as ambiguities:

    A: A
    C: C
    G: G
    T: T
    U: U
    N: .
    R: "[AG]"
    Y: "[TUC]"
    W: "[ATU]"
    S: "[CG]"
    V: "[ACG]"
    B: "[TUGC]"

    The query is translated character-by-character through
    `seqs.base2regex` and the result is compiled as a regular
    expression, so matches are reported left to right and do not
    overlap.

    Parameters
    ----------
    query : str
        Sequence to search for. Accepts IUPAC codes: N, R, Y, S, W, V, B.
    sequence : str
        Sequence to search within.

    Yields
    ------
    indices : tuple of int
        Start and stop indices of the match within `sequence`.
    match : str
        The matched subsequence.

    Examples
    --------
    >>> for (start_idx, end_idx), match in find_iupac('ARY', 'AATAGCAGTGTGAAC'):
    ...     print(f"Found ARY at {start_idx}:{end_idx}: {match}")
    ...
    Found ARY at 0:3: AAT
    Found ARY at 3:6: AGC
    Found ARY at 6:9: AGT
    Found ARY at 12:15: AAC

    """

    # Turn each IUPAC code into its regex fragment, then compile once
    # before scanning the target sequence.
    pattern = re.compile(query.translate(seqs.base2regex))

    for match in pattern.finditer(sequence):

        yield match.span(), match.group()


@_normalize_case(nargs=1)
def which_re_sites(x: str) -> Sequence[str]:

    """List Type IIS restriction sites in sequence.

    Currently only searches for the most commonly used
    Type IIS restriction sites for Golden Gate Cloning:

    BbsI: GAAGAC
    BsmBI: CGTCTC
    BtgZI: GCGATG
    PaqCI: CACCTGC
    SapI: GCTCTTC
    BsaI: GGTCTC

    An enzyme is reported when its recognition site, or the reverse
    complement of that site, occurs anywhere in `x`. Each enzyme is
    listed at most once, in the iteration order of `seqs.re_sites`.

    Parameters
    ----------
    x : str
        Sequence to check.

    Returns
    -------
    tuple
        Names of the Type IIS restriction enzymes with a site in x.

    Examples
    --------
    >>> which_re_sites('AAAGAAG')
    ()
    >>> which_re_sites('AAAGAAGAC')
    ('BbsI',)
    >>> which_re_sites('AAAGAAGACACCTGC')
    ('BbsI', 'PaqCI')

    """

    # Check both orientations so sites on the opposite strand are found.
    return tuple(
        enzyme
        for enzyme, site in seqs.re_sites.items()
        if site in x or reverse_complement(site) in x
    )


@_normalize_case(nargs=1)
def count_re_sites(x: str) -> int:

    """Count Type IIS restriction sites in sequence.

    Currently only searches for the most commonly used
    Type IIS restriction sites for Golden Gate Cloning:

    BbsI: GAAGAC
    BsmBI: CGTCTC
    BtgZI: GCGATG
    PaqCI: CACCTGC
    SapI: GCTCTTC
    BsaI: GGTCTC

    The result is the length of `which_re_sites(x)`, i.e. the number
    of distinct enzymes with at least one site in `x`. Repeated
    occurrences of the same enzyme's site are counted once.

    Parameters
    ----------
    x : str
        Sequence to check.

    Returns
    -------
    int
        Number of Type IIS restriction enzymes with a site in x.

    Examples
    --------
    >>> count_re_sites('AAAGAAG')
    0
    >>> count_re_sites('AAAGAAGAC')
    1
    >>> count_re_sites('AAAGAAGACACCTGC')
    2

    """

    return len(which_re_sites(x))
    

@_normalize_case(nargs=2)
def _x_content(x: str, y: str) -> float:

    """Return the fraction of characters of `x` that occur in `y`.

    An empty `x` gives 0.0 instead of raising a division error.

    """

    total = len(x)
    if total == 0:
        return 0.

    hits = 0
    for letter in x:
        if letter in y:
            hits += 1

    return hits / total
    

def gc_content(x: str) -> float:

    """Calculate proportional GC content.

    Recognises IUPAC codes: in addition to G and C, the ambiguity
    code S (G or C) is counted. The proportion is taken over every
    character in the sequence; an empty sequence gives 0.0.

    Parameters
    ----------
    x : str
        Sequence.

    Returns
    -------
    float
        GC content.

    Examples
    --------
    >>> gc_content('AGGG')
    0.75

    """

    return _x_content(x, 'GCS')


def purine_content(x: str) -> float:

    """Calculate proportional purine content.

    Recognises IUPAC codes: in addition to G and A, the ambiguity
    code R (A or G) is counted. The proportion is taken over every
    character in the sequence; an empty sequence gives 0.0.

    Parameters
    ----------
    x : str
        Sequence.

    Returns
    -------
    float
        Purine content.

    Examples
    --------
    >>> purine_content('AUGGR')
    0.8

    """

    return _x_content(x, 'GAR')


def pyrimidine_content(x: str) -> float:

    """Calculate proportional pyrimidine content.

    Recognises IUPAC codes: in addition to C, U and T, the ambiguity
    code Y (pyrimidine) is counted. The proportion is taken over
    every character in the sequence; an empty sequence gives 0.0.

    Parameters
    ----------
    x : str
        Sequence.

    Returns
    -------
    float
        Pyrimidine content.

    Examples
    --------
    >>> pyrimidine_content('AUGGG')
    0.2

    """

    return _x_content(x, 'CUTY')