import numpy as np
from matplotlib import gridspec
import matplotlib
import logging
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from biseqt.pw import Aligner, STD_MODE, LOCAL
from biseqt.blot import WordBlot
from biseqt.sequence import Alphabet
from biseqt.stochastics import rand_seq, MutationProcess
from util import plot_scored_seeds, plot_seeds
from util import plot_similar_segment, adjust_pw_plot
from util import log, savefig
[docs]def exp_rearrangement():
"""Example demonstrating of Word-Blot for pairwise local similarity search on
two randomly generated sequencees with motif sequences violating
collinearity :math:`S=M_1M_2M_3, T=M'_1M'_1M'_3M'_2` where motif pairs
:math:`(M_i, M'_i)_{i=1,2,3}` have lengths 200, 400, 600 and are related by
match probabilities 0.95, 0.85, and 0.75, respectively.
.. figure::
https://www.dropbox.com/s/nsvsf5gaui6t9ww/rearrangement.png?raw=1
:target:
https://www.dropbox.com/s/nsvsf5gaui6t9ww/rearrangement.png?raw=1
:alt: lightbox
Dynamic programming scores of the forward pass of Smith Waterman are
shown in color code (*left*) with seeds (word length 6) grey intensity
coded according to the local match probability assigned by Word-Blot
(minimum similarity length 200). Similar segments reported by Word-Blot
are shown as grey diagonal strips (*left*) and schematically (*right*)
color coded by their Word-Blot estimated match probabilities (note
agreement with true match probabilities).
"""
# NOTE we are running whole table DP later here; be careful with size
K = 200
wordlen = 6
A = Alphabet('ACGT')
WB_kw = {'g_max': .2, 'sensitivity': .9, 'alphabet': A, 'wordlen': wordlen,
'path': ':memory:', 'log_level': logging.INFO}
# homologies
Hs = [rand_seq(A, i) for i in [i * K for i in range(1, 4)]]
ps = [.95, .85, .75]
Ms = []
for p_match in ps:
subst = gap = 1 - np.sqrt(p_match)
print subst, gap
Ms.append(
MutationProcess(A, subst_probs=subst, ge_prob=gap, go_prob=gap)
)
# connector junk
def J(): return rand_seq(A, 2 * K)
S = J() + Hs[0] + J() + Hs[1] + J() + Hs[2] + J()
Hs = [M.mutate(hom)[0] for hom, M in zip(Hs, Ms)]
T = J() + Hs[0] + J() + Hs[0] + Hs[2] + J() + Hs[1] + J()
fig = plt.figure(figsize=(9, 6))
gs = gridspec.GridSpec(1, 2, width_ratios=[3, 1])
ax_seeds = plt.subplot(gs[0])
ax_mapping = plt.subplot(gs[1])
WB = WordBlot(S, T, **WB_kw)
p_min = .95 * min(ps)
scored_seeds = WB.score_seeds(K)
scored_seeds = [(WB.to_ij_coordinates(*rec['seed']), rec['p'])
for rec in scored_seeds]
plot_seeds(ax_seeds, [x[0] for x in scored_seeds])
cmap = plt.cm.get_cmap('plasma')
sim_segments = list(WB.similar_segments(K_min=K, p_min=p_min))
min_p_obs = min(rec['p'] for rec in sim_segments)
max_p_obs = max(rec['p'] for rec in sim_segments)
for rec in sim_segments:
print rec
seg = rec['segment']
(i_start, i_end), (j_start, j_end) = WB.to_ij_coordinates_seg(seg)
i_ctr, j_ctr = (i_start + i_end) / 2, (j_start + j_end) / 2
color = cmap((rec['p'] - min_p_obs) / (max_p_obs - min_p_obs))[:3]
plot_similar_segment(ax_seeds, seg, lw=5, alpha=.1, c='k')
ax_mapping.plot([1, 1], [i_start, i_end], lw=3, c=color, alpha=.7)
ax_mapping.plot([2, 2], [j_start, j_end], lw=3, c=color, alpha=.7)
ax_mapping.plot([1, 2], [i_ctr, j_ctr], marker='o', markersize=7, lw=2,
c=color, alpha=.4)
ax_mapping.set_xticks([1, 2])
ax_mapping.set_xticklabels(['S', 'T'])
ax_mapping.set_xlim(0, 3)
ax_mapping.set_ylim(0, None)
ax_c = make_axes_locatable(ax_mapping).append_axes('right', size='4%',
pad=0.05)
norm = matplotlib.colors.Normalize(vmin=min_p_obs, vmax=max_p_obs)
matplotlib.colorbar.ColorbarBase(ax_c, cmap=cmap, norm=norm,
orientation='vertical')
aligner_kw = {
'match_score': 1 / p_min - 1,
'mismatch_score': -1,
'ge_score': -1,
'go_score': 0,
'alnmode': STD_MODE,
'alntype': LOCAL,
}
print len(S), len(T)
with Aligner(S, T, **aligner_kw) as aligner:
aligner.solve()
scores = np.array(aligner.table_scores())
min_score = min(scores.flatten())
max_score = max(scores.flatten())
ax_seeds.imshow(scores, cmap='plasma', alpha=.3)
ax_c = make_axes_locatable(ax_seeds).append_axes('right', size='4%',
pad=0.05)
norm = matplotlib.colors.Normalize(vmin=min_score, vmax=max_score)
matplotlib.colorbar.ColorbarBase(ax_c, cmap='plasma', norm=norm,
orientation='vertical')
adjust_pw_plot(ax_seeds, len(S), len(T))
ax_seeds.set_xlabel('T')
ax_seeds.set_ylabel('S')
fig.tight_layout()
savefig(fig, 'rearrangement.png')
def exp_repeat_regions():
gap = .2
subst = .1
K = 500
wordlen = 8
A = Alphabet('ACGT')
# NOTE I can drive sensitivity to 0 and get decent results
WB_kw = {'g_max': .2, 'sensitivity': .9, 'alphabet': A, 'wordlen': wordlen,
'path': ':memory:', 'log_level': logging.INFO}
M = MutationProcess(A, subst_probs=subst, ge_prob=gap, go_prob=gap)
homs = [rand_seq(A, i) for i in [K/2, K, 2 * K, 4 * K]]
def junk(): return rand_seq(A, np.random.randint(2 * K, 4 * K))
junks = [junk() for _ in range(3 * len(homs))]
S = sum([junks[3 * i] + R + junks[3 * i + 1] + R + junks[3 * i + 2] + R
for i, R in enumerate(homs)], A.parse('')) + junk()
homs = [M.mutate(homs[i])[0] for i in range(len(homs))]
T = S
fig = plt.figure(figsize=(6, 5))
ax = fig.add_subplot(1, 1, 1)
log('finding repeat regions')
WB = WordBlot(S, T, **WB_kw)
match = (1 - gap) * (1 - subst)
scored_seeds = WB.score_seeds(K)
# convert to ij coordinates and exclude half of the table (mirror image)
scored_seeds = [(WB.to_ij_coordinates(*rec['seed']), rec['p'])
for rec in scored_seeds if rec['seed'][0] <= 0]
plot_scored_seeds(ax, scored_seeds)
for rec in WB.similar_segments(K_min=K, p_min=match):
segment, scores, p_hat = rec['segment'], rec['scores'], rec['p']
(d_min, d_max), (a_min, a_max) = rec['segment']
if d_min > 0:
# self comparison, exclude half of the table
continue
log('repeat region %s, scores = (%.2f, %.2f), p = %.2f' %
(str(segment), scores[0], scores[1], p_hat))
plot_similar_segment(ax, segment, c='b', lw=5, alpha=.2)
adjust_pw_plot(ax, len(S), len(T))
ax.set_title('Repeat regions', y=1.05, fontsize=10)
fig.suptitle('word len. = %d, min. hom. len = %d, min.match = %.2f' %
(wordlen, K, match), fontsize=8)
fig.tight_layout()
savefig(fig, 'repeat_regions.png')
if __name__ == '__main__':
exp_rearrangement()
exp_repeat_regions()