#!/usr/bin/env python
from matplotlib import pyplot as plt
from matplotlib import gridspec
import sys
from time import time
import numpy as np
from biseqt.sequence import Alphabet
from biseqt.stochastics import rand_seq, MutationProcess
from biseqt.seeds import SeedIndex
from biseqt.blot import band_radius, band_radii, H0_moments, H1_moments
from util import seq_pair, color_code
from util import plot_with_sd, with_dumpfile, log, savefig
from logging import WARN
@with_dumpfile
def sim_count_seeds(**kw):
ns, n_samples = kw['ns'], kw['n_samples']
gap, subst, wordlen = kw['gap'], kw['subst'], kw['wordlen']
log('simulating # of seeds (%d samples), lengths = %s' %
(n_samples, str(ns)))
def _zero(): return np.zeros((len(ns), n_samples))
A = Alphabet('ACGT')
seed_index_kw = {
'alphabet': A,
'wordlen': wordlen,
'path': kw.get('path', ':memory:'),
'log_level': WARN,
}
sim_data = {
'time': {'pos': _zero(), 'neg': _zero()},
'n_seeds': {'all': {'pos': _zero(), 'neg': _zero()},
'band': {'pos': _zero(), 'neg': _zero()}},
'gap': gap,
'match': (1 - gap) * (1 - subst),
'ns': ns,
'seed_index_kw': seed_index_kw,
}
M = MutationProcess(A, go_prob=gap, ge_prob=gap, subst_probs=subst)
for n_idx, n in enumerate(ns):
radius = band_radius(n, .4, 1 - 1e-4)
log('n = %d ' % n, newline=False)
for idx in range(n_samples):
sys.stderr.write('.')
S_rel, T_rel = seq_pair(n, A, mutation_process=M)
S_urel, T_urel = rand_seq(A, n), rand_seq(A, n)
for key, (S, T) in zip(['pos', 'neg'],
[(S_rel, T_rel), (S_urel, T_urel)]):
t = time()
seed_index = SeedIndex(S, T, **seed_index_kw)
n_seeds = seed_index.seed_count()
sim_data['time'][key][n_idx][idx] = time() - t
sim_data['n_seeds']['all'][key][n_idx][idx] = n_seeds
n_seeds = seed_index.seed_count(d_band=(-radius, radius))
sim_data['n_seeds']['band'][key][n_idx][idx] = n_seeds
sys.stderr.write('\n')
return sim_data
def plot_count_seeds_moments(sim_data, K=None, suffix=''):
ns, wordlen = sim_data['ns'], sim_data['seed_index_kw']['wordlen']
match = sim_data['match']
def _zero(): return {'all': [], 'band': []}
mus_H0, mus_H1, sds_H0, sds_H1 = _zero(), _zero(), _zero(), _zero()
for n in ns:
for mode in ['all', 'band']:
if mode == 'all':
area = n ** 2
else:
radius = band_radius(n, .4, 1 - 1e-4)
area = n * 2 * radius
mu_H0, sd_H0 = H0_moments(4, wordlen, area)
mus_H0[mode].append(mu_H0)
sds_H0[mode].append(sd_H0)
mu_H1, sd_H1 = H1_moments(4, wordlen, area, n, match)
mus_H1[mode].append(mu_H1)
sds_H1[mode].append(sd_H1)
fig = plt.figure(figsize=(11, 5))
ax_t = fig.add_subplot(1, 3, 1)
ax_mu = fig.add_subplot(1, 3, 2)
ax_sd = fig.add_subplot(1, 3, 3)
# time to find all seeds
kw = {'marker': 'o', 'markersize': 4, 'lw': 1, 'alpha': .8}
plot_with_sd(ax_t, ns, 1000 * sim_data['time']['neg'], axis=1, color='r',
label='unrelated', **kw)
plot_with_sd(ax_t, ns, 1000 * sim_data['time']['pos'], axis=1, color='g',
label='related', **kw)
kw = {'markersize': 3, 'lw': 1.2}
for mode, marker, alpha in zip(['all', 'band'], 'ox', [.7, .5]):
kw['alpha'] = alpha
kw['marker'] = marker
pos = sim_data['n_seeds'][mode]['pos']
neg = sim_data['n_seeds'][mode]['neg']
# average no. of seeds
kw['color'] = 'r'
ax_mu.plot(ns, neg.mean(axis=1), label='unrelated (%s)' % mode, **kw)
ax_mu.plot(ns, mus_H0[mode], ls='--', **kw)
kw['color'] = 'g'
ax_mu.plot(ns, pos.mean(axis=1), label='related (%s)' % mode, **kw)
ax_mu.plot(ns, mus_H1[mode], ls='--', **kw)
# std dev. of no of seeds
kw['color'] = 'r'
sds_emp = np.sqrt(neg.var(axis=1))
ax_sd.plot(ns, sds_emp, label='unrelated (%s)' % mode, **kw)
ax_sd.plot(ns, sds_H0[mode], ls='--', **kw)
kw['color'] = 'g'
sds_emp = np.sqrt(pos.var(axis=1))
ax_sd.plot(ns, sds_emp, label='related (%s)' % mode, **kw)
ax_sd.plot(ns, sds_H1[mode], ls='--', **kw)
_ns = np.arange(min(ns), max(ns))
ax_mu.plot(_ns, _ns, color='k', alpha=.9, lw=.5, ls='--')
ax_t.plot(_ns, _ns, color='k', alpha=.9, lw=.5, ls='--')
for ax in [ax_sd, ax_mu, ax_t]:
# ax.set_xlim(None, 1.1 * max(ns))
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('sequence length')
ax.set_xticks(ns)
ax.set_xticklabels(ns, rotation=90)
ax.legend(loc='lower right', fontsize=8)
ax_sd.set_ylabel('standard deviation of no. of matching %d-mers' % wordlen)
ax_mu.set_ylabel('average no. of matching %d-mers' % wordlen)
ax_t.set_ylabel('time to find matching %d-mers (ms)' % wordlen)
# n_samples = pos.shape[1]
savefig(fig, 'num_seeds_moments%s.png' % suffix)
[docs]def exp_count_seeds():
"""Shows theoretical and simulation results for the mean and variance of
the number of exactly matching kmers between related and unrelated
sequences as a function of sequence length. Theoretical predictions are
based on *m-dependent Central Limit Theorem* which suggests a limiting
Normal distribution with mean and variance given by
:func:`biseqt.blot.H0_moments` and :func:`biseqt.blot.H1_moments`.
**Supported Claims**
* The theoretical calculations of mean and variance for the number of
seeds, given by :func:`biseqt.blot.H0_moments` and
:func:`biseqt.blot.H1_moments` agree with simulations at least upto 25kbp
sequence lengths.
* Although the expected number of seeds is technically quadratic, the
highest order coefficient is so small that it can be considered
effectively linear in sequence length. Furthermore, we note that word
length provides expoenentially strong control on the number of seeds as
sequence lengths increase; hence maintaining a small quadratic
coefficient across biologically relevant sequence lengths (up to 1 Gbp)
is feasible with reasonable word lenghts (up to 30)
.. figure::
https://www.dropbox.com/s/fkd6u6gec6rzrjm/
num_seeds_moments%5Bw%3D8%5D.png?raw=1
:target:
https://www.dropbox.com/s/fkd6u6gec6rzrjm/
num_seeds_moments%5Bw%3D8%5D.png?raw=1
:alt: lightbox
Time to find all exactly matching 8-mers (*left*) for related (*green*)
and unrelated (*red*) sequences of varying lengths (n=50 samples for
each length; shaded regions show one standard deviation). Related
sequences are mutations of each other with substitution and gap
probabilities both equal to 0.1. For the same simulations, mean
(*middle*) and standard deviation (*right*) of the number of seeds as a
function of sequence length are shown (solid lines) with theoretical
predictions for each case (dashed lines). All axes are in log scale, and
the dotted black lines in the left and middle plots are y=x lines for
comparison.
"""
ns = [200 * 2 ** i for i in range(8)]
gap = .1
subst = .1
n_samples = 50
wordlen = 8
suffix = '[w=%d]' % wordlen
dumpfile = 'num_seeds%s.txt' % suffix
sim_data = sim_count_seeds(
ns=ns, n_samples=n_samples, gap=gap, subst=subst,
wordlen=wordlen, dumpfile=dumpfile, ignore_existing=False)
plot_count_seeds_moments(sim_data, suffix=suffix)
@with_dumpfile
def sim_count_seeds_segment(**kw):
Ks, g_radii, n_samples = kw['Ks'], kw['g_radii'], kw['n_samples']
gap, subst, wordlen = kw['gap'], kw['subst'], kw['wordlen']
def _zero(): return np.zeros((len(Ks), len(g_radii), n_samples))
A = Alphabet('ACGT')
seed_index_kw = {
'alphabet': A,
'wordlen': wordlen,
'path': kw.get('path', ':memory:'),
'log_level': WARN,
}
sim_data = {
'n_seeds': {'pos': _zero(), 'neg': _zero()},
'p_hat': {'pos': _zero(), 'neg': _zero()},
'gap': gap,
'match': (1 - gap) * (1 - subst),
'Ks': Ks,
'g_radii': g_radii,
'seed_index_kw': seed_index_kw,
}
M = MutationProcess(A, go_prob=gap, ge_prob=gap, subst_probs=subst)
for K_idx, K in enumerate(Ks):
log('K = %d' % K, newline=False)
for idx in range(n_samples):
sys.stderr.write('.')
S_rel, T_rel = seq_pair(K, A, mutation_process=M)
S_urel, T_urel = rand_seq(A, K), rand_seq(A, K)
for key, (S, T) in zip(['pos', 'neg'],
[(S_rel, T_rel), (S_urel, T_urel)]):
seed_index = SeedIndex(S, T, **seed_index_kw)
for g_idx, g_max in enumerate(g_radii):
radius = band_radius(K, g_max, 1 - 1e-4)
d_band = (-radius, radius)
n_seeds = seed_index.seed_count(d_band=d_band)
sim_data['n_seeds'][key][K_idx, g_idx, idx] = n_seeds
area = 2 * radius * K
word_p_null = (1./len(A)) ** wordlen
word_p = (n_seeds - area * word_p_null) / K
try:
p_hat = np.exp(np.log(word_p) / wordlen)
except Warning:
# presumably this happened because word_p was too small
p_hat = 0
p_hat = min(p_hat, 1)
sim_data['p_hat'][key][K_idx, g_idx, idx] = p_hat
sys.stderr.write('\n')
return sim_data
def plot_count_seeds_segment(sim_data, suffix=''):
Ks, g_radii = sim_data['Ks'], sim_data['g_radii']
match = sim_data['match']
fig = plt.figure(figsize=(10, 4))
grids = gridspec.GridSpec(1, 2, width_ratios=[5, 3])
ax_p = fig.add_subplot(grids[0])
ax_rad = fig.add_subplot(grids[1])
pad = min(Ks) / 3
colors = color_code(g_radii)
arrow_kw = {'marker': '>', 'c': 'k', 'markevery': 2, 'markersize': 2,
'lw': 1, 'alpha': .8}
ax_p.plot([Ks[0] - .2 * pad, Ks[0] - .9 * pad], [match, match], **arrow_kw)
ax_p.plot([Ks[0] - .2 * pad, Ks[0] - .9 * pad], [.25, .25], ls='--',
**arrow_kw)
kw_rad = {'marker': 'o', 'markersize': 3, 'lw': 1, 'alpha': .8}
kw_p = {'marker': 'o', 'markersize': 5, 'lw': 3, 'alpha': .5}
for g_idx, (g_max, color) in enumerate(zip(g_radii, colors)):
label = '$g_{\max} = %.2f$' % g_max
pos = sim_data['p_hat']['pos'][:, g_idx, :]
neg = sim_data['p_hat']['neg'][:, g_idx, :]
plot_with_sd(ax_p, Ks, neg, axis=1, color=color, ls='--', **kw_p)
plot_with_sd(ax_p, Ks, pos, axis=1, color=color, label=label, **kw_p)
ax_rad.plot(Ks, band_radii(Ks, g_max, 1 - 1e-4), color=color,
label=label, **kw_rad)
ax_p.set_ylim(-.2, 1.1)
for ax in [ax_p, ax_rad]:
ax.set_xlim(Ks[0] - pad, Ks[-1] + pad)
ax.set_xscale('log')
ax.set_xlabel('similarity length')
ax.set_xticks(Ks)
ax.set_xticklabels(Ks)
ax.legend(loc='best')
ax_p.set_ylabel('estimated match probability')
ax_rad.set_ylabel('diagonal band radius')
# n_samples = pos.shape[1]
savefig(fig, 'num_seeds_segment%s.png' % suffix)
[docs]def exp_count_seeds_segment():
"""Shows simulation results for alignment-free estimated match probability
for globally homologous sequence pairs of length up to 25kbp.
**Supported Claims**
* The match probability estimator provided by
:func:`biseqt.blot.WordBlot.estimate_match_probability` using the band
radius provided by :func:`biseqt.blot.band_radius` are accurate for
similarities of lengths up to 25kbp regardless of gap probability upper
bound. Therefore, we can justify using generous overestimates of gap
probability (e.g. :math:`g_{\max}=0.4`) in typical contexts.
* For unrelated sequences our estimator reports a number close to .25 (one
standard deviation range :math:`[0, .4]`).
.. figure::
https://www.dropbox.com/s/gnvb8eiiezyysuq/
num_seeds_segment%5Bw%3D6%5D.png?raw=1
:target:
https://www.dropbox.com/s/gnvb8eiiezyysuq/
num_seeds_segment%5Bw%3D6%5D.png?raw=1
:alt: lightbox
For multiple values of maximum allowed gap probability :math:`g_{\max}`
(which dictates diagonal band radius), estimated match probability
(using word length 6) is shown as a function of sequence length (*left*)
for globally homologous sequences (solid lines) and unrelated sequences
(dashed lines), n=50 samples, shaded regions show one standard
deviation. Homologous sequences were simulated by mutations with gap
probability 0.1 and substitution probability 0.15 (hence a match
probability of 0.77 indicated by a solid arrow (note agreement with
Word-Blot estimation), the dashed arrow shows the 0.25 point). For each
value of :math:`g_{\max}`, the corresponding band radius is shown as a
function of similarity length (right). Horizontal axes in both plots is
in log scale.
"""
Ks = [200 * 2 ** i for i in range(8)]
g_radii = [.05, .1, .2, .4]
gap = .1
subst = .15
n_samples = 50
wordlen = 6
suffix = '[w=%d]' % wordlen
dumpfile = 'num_seeds_segment%s.txt' % suffix
sim_data = sim_count_seeds_segment(
Ks=Ks, g_radii=g_radii, n_samples=n_samples,
gap=gap, subst=subst, wordlen=wordlen,
dumpfile=dumpfile)
plot_count_seeds_segment(sim_data, suffix=suffix)
if __name__ == '__main__':
exp_count_seeds()
exp_count_seeds_segment()