# -*- coding: utf-8 -*-
"""
.. wikisection:: overview
:title: (1) Alphabets and Sequences
An :class:`Alphabet` is defined by a list of letters and a
:class:`Sequence` is a list of letters from an alphabet.
>>> from biseqt.sequence import Alphabet, Sequence
>>> A = Alphabet('ACGT')
>>> S = A.parse('AACTTCG')
>>> print S.contents
(0, 0, 1, 3, 3, 1, 2)
>>> print S[:3]
AAC
>>> print S.content_id  # SHA1 of sequence contents
d9f235b1a44358cc80237d5e5c46c06d81a83e46
The letters in an alphabet need not be single characters, but all must have
the same length.
>>> A = Alphabet(['A1', 'A2', 'A3', 'A4'])
>>> S = A.parse('A1A1A3A2')
>>> print len(S)
4
"""
from itertools import chain
from hashlib import sha1
class Alphabet(object):
    """A sequence alphabet.

    Attributes:
        _letters (tuple):
            The letters in the alphabet. All ``getitem`` operations (i.e.
            indexing and slicing) are delegated to this tuple. This attribute
            should be considered read-only.
        _letlen (int):
            The length of the letters in the alphabet when represented as a
            string. This attribute should be considered read-only.
        _idx_by_letter (dict):
            Maps each letter to its position in :attr:`_letters`.
    """

    def __init__(self, letters):
        """
        Args:
            letters (iterable):
                The elements of this iterable must be hashable, i.e. can be
                keys of a dictionary, and must respond to :func:`len`.
                Typically, they are single character strings.

        Raises:
            AssertionError: If no letters are given or if the letters do not
                all have the same length.
        """
        self._letters = tuple(letters)
        # Fail with a clear message instead of an IndexError on the lookup
        # of the first letter below.
        assert self._letters, 'An alphabet must contain at least one letter'
        self._letlen = len(self._letters[0])
        assert all(len(letter) == self._letlen for letter in self._letters), \
            'All alphabet letters must have the same length'
        self._idx_by_letter = {
            letter: idx for idx, letter in enumerate(self._letters)
        }

    def letter_to_idx(self, letters):
        """Translates provided letters to the integer sequence corresponding
        to the index of each letter in this alphabet.

        Args:
            letters (iterable): The letters to be translated to integer
                indices. Each element retrieved through iteration should be
                an element in :attr:`_letters`.

        Returns:
            tuple

        Raises:
            KeyError: If any element is not a letter of this alphabet.
        """
        return tuple(self._idx_by_letter[letter] for letter in letters)

    def parse(self, string):
        """Given a string representation of a sequence returns a corresponding
        :class:`Sequence` object.

        Args:
            string (str): The raw sequence represented as a string.

        Returns:
            Sequence

        Raises:
            AssertionError: If ``string`` is not a ``str`` or its length is
                not a multiple of the alphabet letter length.
            KeyError: If the string contains a chunk that is not a letter of
                this alphabet.
        """
        assert isinstance(string, str), 'Raw sequence must be in string form'
        assert len(string) % self._letlen == 0, 'String representation ' + \
            'of sequence must be a multiple of the alphabet letter length'
        step = self._letlen
        # Cut the string into consecutive, fixed-width letters.
        chunks = (string[pos:pos + step]
                  for pos in range(0, len(string), step))
        return Sequence(self, self.letter_to_idx(chunks))

    def __len__(self):
        return len(self._letters)

    def __eq__(self, other):
        """Two alphabets are equal if they have the same letters in the same
        order. Comparing against a non-alphabet is an assertion failure."""
        assert isinstance(other, Alphabet), \
            'Only alphabets can be compared with alphabets'
        return self._letters == other._letters

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``; without this,
        # inequality silently falls back to identity comparison.
        return not self == other

    def __hash__(self):
        # Keeps hashing consistent with ``__eq__`` (and keeps instances
        # hashable on Python 3, where defining ``__eq__`` alone disables it).
        return hash(self._letters)

    def __getitem__(self, key):
        return self._letters[key]

    def __repr__(self):
        return 'Alphabet([%s])' % \
            ','.join('"%s"' % letter for letter in self._letters)
class Sequence(object):
    """An immutable sequence of letters from some :class:`Alphabet` which
    behaves mostly like a tuple.

    Attributes:
        alphabet (Alphabet): The :class:`Alphabet` of the sequence.
        contents (tuple): The contents of the sequence represented as tuple of
            integers of the same length where each letter is represented by
            its position in the alphabet.
        content_id (string): Hex representation of the sequence SHA1.
    """

    def __init__(self, alphabet, contents=()):
        """Initializes the sequence object from the integer representation of
        its letters and computes its content identifier.

        Args:
            alphabet (Alphabet):
                The :class:`Alphabet` of the sequence.
            contents (iterable):
                The contents of the sequence as an iterable, each element of
                which is the integer representation of a letter from the
                :class:`Alphabet`, i.e. a non-negative integer smaller than
                the alphabet size; default is an empty sequence. To build a
                sequence from its string form use :func:`Alphabet.parse`.

        Raises:
            AssertionError: If ``alphabet`` is not an :class:`Alphabet` or
                any element of ``contents`` is not a valid letter index.
        """
        assert isinstance(alphabet, Alphabet)
        self.alphabet = alphabet
        # Materialize before validating: checking a one-shot iterator first
        # would exhaust it and leave the stored contents empty.
        contents = tuple(contents)
        # Negative indices are rejected as well; they would otherwise alias
        # letters counted from the end of the alphabet.
        assert all(isinstance(c, int) and 0 <= c < len(alphabet)
                   for c in contents)
        self.contents = contents
        self.content_id = self._digest(str(self))

    @staticmethod
    def _digest(text):
        """Returns the SHA1 hex digest of the given string."""
        # hashlib only accepts bytes; a Python 3 ``str`` must be encoded
        # first whereas a Python 2 ``str`` already is a byte string.
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        return sha1(text).hexdigest()

    def reverse(self):
        """Returns another sequence whose contents are the reverse of this
        sequence in order.

        Returns:
            Sequence
        """
        return Sequence(self.alphabet, self.contents[::-1])

    def __str__(self):
        return ''.join(self.alphabet[idx] for idx in self.contents)

    def __repr__(self):
        return 'Sequence(%r, contents=%r)' % (self.alphabet, self.contents)

    def __len__(self):
        return len(self.contents)

    def __nonzero__(self):
        return bool(self.contents)

    # Python 3 spelling of the truthiness hook.
    __bool__ = __nonzero__

    def __getitem__(self, key):
        """Returns the integer letter index for an integer key and a new
        :class:`Sequence` for any other key (e.g. a slice)."""
        if isinstance(key, int):
            return self.contents[key]
        return Sequence(self.alphabet, self.contents[key])

    def __eq__(self, other):
        """Two sequences are equal if their alphabets are equal and they have
        the same content identifier."""
        if not isinstance(other, Sequence):
            # Let Python fall back to its default handling instead of failing
            # with an AttributeError on the missing attributes.
            return NotImplemented
        return self.alphabet == other.alphabet and \
            self.content_id == other.content_id

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``.
        outcome = self.__eq__(other)
        return outcome if outcome is NotImplemented else not outcome

    def __hash__(self):
        # Equal sequences share a content identifier, so hashing on it is
        # consistent with ``__eq__``.
        return hash(self.content_id)

    def __add__(self, other):
        """Returns the concatenation of this sequence with another sequence
        of the same alphabet or with an iterable of letters."""
        if isinstance(other, Sequence):
            assert self.alphabet == other.alphabet
            contents = other.contents
        else:
            contents = self.alphabet.letter_to_idx(other)
        return Sequence(self.alphabet, self.contents + contents)