Source code for experiments.num_seeds

#!/usr/bin/env python
from matplotlib import pyplot as plt
from matplotlib import gridspec
import sys
from time import time
import numpy as np
from biseqt.sequence import Alphabet
from biseqt.stochastics import rand_seq, MutationProcess
from biseqt.seeds import SeedIndex
from biseqt.blot import band_radius, band_radii, H0_moments, H1_moments
from util import seq_pair, color_code
from util import plot_with_sd, with_dumpfile, log, savefig
from logging import WARN


@with_dumpfile
def sim_count_seeds(**kw):
    ns, n_samples = kw['ns'], kw['n_samples']
    gap, subst, wordlen = kw['gap'], kw['subst'], kw['wordlen']
    log('simulating # of seeds (%d samples), lengths = %s' %
        (n_samples, str(ns)))

    def _zero(): return np.zeros((len(ns), n_samples))

    A = Alphabet('ACGT')
    seed_index_kw = {
        'alphabet': A,
        'wordlen': wordlen,
        'path': kw.get('path', ':memory:'),
        'log_level': WARN,
    }
    sim_data = {
        'time': {'pos': _zero(), 'neg': _zero()},
        'n_seeds': {'all': {'pos': _zero(), 'neg': _zero()},
                    'band': {'pos': _zero(), 'neg': _zero()}},
        'gap': gap,
        'match': (1 - gap) * (1 - subst),
        'ns': ns,
        'seed_index_kw': seed_index_kw,
    }
    M = MutationProcess(A, go_prob=gap, ge_prob=gap, subst_probs=subst)
    for n_idx, n in enumerate(ns):
        radius = band_radius(n, .4, 1 - 1e-4)

        log('n = %d ' % n, newline=False)
        for idx in range(n_samples):
            sys.stderr.write('.')
            S_rel, T_rel = seq_pair(n, A, mutation_process=M)
            S_urel, T_urel = rand_seq(A, n), rand_seq(A, n)
            for key, (S, T) in zip(['pos', 'neg'],
                                   [(S_rel, T_rel), (S_urel, T_urel)]):
                t = time()
                seed_index = SeedIndex(S, T, **seed_index_kw)
                n_seeds = seed_index.seed_count()
                sim_data['time'][key][n_idx][idx] = time() - t
                sim_data['n_seeds']['all'][key][n_idx][idx] = n_seeds

                n_seeds = seed_index.seed_count(d_band=(-radius, radius))
                sim_data['n_seeds']['band'][key][n_idx][idx] = n_seeds
        sys.stderr.write('\n')
    return sim_data


def plot_count_seeds_moments(sim_data, K=None, suffix=''):
    ns, wordlen = sim_data['ns'], sim_data['seed_index_kw']['wordlen']
    match = sim_data['match']

    def _zero(): return {'all': [], 'band': []}

    mus_H0, mus_H1, sds_H0, sds_H1 = _zero(), _zero(), _zero(), _zero()
    for n in ns:
        for mode in ['all', 'band']:
            if mode == 'all':
                area = n ** 2
            else:
                radius = band_radius(n, .4, 1 - 1e-4)
                area = n * 2 * radius

            mu_H0, sd_H0 = H0_moments(4, wordlen, area)
            mus_H0[mode].append(mu_H0)
            sds_H0[mode].append(sd_H0)

            mu_H1, sd_H1 = H1_moments(4, wordlen, area, n, match)
            mus_H1[mode].append(mu_H1)
            sds_H1[mode].append(sd_H1)

    fig = plt.figure(figsize=(11, 5))
    ax_t = fig.add_subplot(1, 3, 1)
    ax_mu = fig.add_subplot(1, 3, 2)
    ax_sd = fig.add_subplot(1, 3, 3)

    # time to find all seeds
    kw = {'marker': 'o', 'markersize': 4, 'lw': 1, 'alpha': .8}
    plot_with_sd(ax_t, ns, 1000 * sim_data['time']['neg'], axis=1, color='r',
                 label='unrelated', **kw)
    plot_with_sd(ax_t, ns, 1000 * sim_data['time']['pos'], axis=1, color='g',
                 label='related', **kw)

    kw = {'markersize': 3, 'lw': 1.2}
    for mode, marker, alpha in zip(['all', 'band'], 'ox', [.7, .5]):
        kw['alpha'] = alpha
        kw['marker'] = marker
        pos = sim_data['n_seeds'][mode]['pos']
        neg = sim_data['n_seeds'][mode]['neg']

        # average no. of seeds
        kw['color'] = 'r'
        ax_mu.plot(ns, neg.mean(axis=1), label='unrelated (%s)' % mode, **kw)
        ax_mu.plot(ns, mus_H0[mode], ls='--', **kw)

        kw['color'] = 'g'
        ax_mu.plot(ns, pos.mean(axis=1), label='related (%s)' % mode, **kw)
        ax_mu.plot(ns, mus_H1[mode], ls='--', **kw)

        # std dev. of no of seeds
        kw['color'] = 'r'
        sds_emp = np.sqrt(neg.var(axis=1))
        ax_sd.plot(ns, sds_emp, label='unrelated (%s)' % mode, **kw)
        ax_sd.plot(ns, sds_H0[mode], ls='--', **kw)

        kw['color'] = 'g'
        sds_emp = np.sqrt(pos.var(axis=1))
        ax_sd.plot(ns, sds_emp, label='related (%s)' % mode, **kw)
        ax_sd.plot(ns, sds_H1[mode], ls='--', **kw)

    _ns = np.arange(min(ns), max(ns))
    ax_mu.plot(_ns, _ns, color='k', alpha=.9, lw=.5, ls='--')
    ax_t.plot(_ns, _ns, color='k', alpha=.9, lw=.5, ls='--')

    for ax in [ax_sd, ax_mu, ax_t]:
        # ax.set_xlim(None, 1.1 * max(ns))
        ax.set_xscale('log')
        ax.set_yscale('log')
        ax.set_xlabel('sequence length')
        ax.set_xticks(ns)
        ax.set_xticklabels(ns, rotation=90)
        ax.legend(loc='lower right', fontsize=8)

    ax_sd.set_ylabel('standard deviation of no. of matching %d-mers' % wordlen)
    ax_mu.set_ylabel('average no. of matching %d-mers' % wordlen)
    ax_t.set_ylabel('time to find matching %d-mers (ms)' % wordlen)

    # n_samples = pos.shape[1]
    savefig(fig, 'num_seeds_moments%s.png' % suffix)


[docs]def exp_count_seeds(): """Shows theoretical and simulation results for the mean and variance of the number of exactly matching kmers between related and unrelated sequences as a function of sequence length. Theoretical predictions are based on *m-dependent Central Limit Theorem* which suggests a limiting Normal distribution with mean and variance given by :func:`biseqt.blot.H0_moments` and :func:`biseqt.blot.H1_moments`. **Supported Claims** * The theoretical calculations of mean and variance for the number of seeds, given by :func:`biseqt.blot.H0_moments` and :func:`biseqt.blot.H1_moments` agree with simulations at least upto 25kbp sequence lengths. * Although the expected number of seeds is technically quadratic, the highest order coefficient is so small that it can be considered effectively linear in sequence length. Furthermore, we note that word length provides expoenentially strong control on the number of seeds as sequence lengths increase; hence maintaining a small quadratic coefficient across biologically relevant sequence lengths (up to 1 Gbp) is feasible with reasonable word lenghts (up to 30) .. figure:: https://www.dropbox.com/s/fkd6u6gec6rzrjm/ num_seeds_moments%5Bw%3D8%5D.png?raw=1 :target: https://www.dropbox.com/s/fkd6u6gec6rzrjm/ num_seeds_moments%5Bw%3D8%5D.png?raw=1 :alt: lightbox Time to find all exactly matching 8-mers (*left*) for related (*green*) and unrelated (*red*) sequences of varying lengths (n=50 samples for each length; shaded regions show one standard deviation). Related sequences are mutations of each other with substitution and gap probabilities both equal to 0.1. For the same simulations, mean (*middle*) and standard deviation (*right*) of the number of seeds as a function of sequence length are shown (solid lines) with theoretical predictions for each case (dashed lines). All axes are in log scale, and the dotted black lines in the left and middle plots are y=x lines for comparison. """ ns = [200 * 2 ** i for i in range(8)] gap = .1 subst = .1 n_samples = 50 wordlen = 8 suffix = '[w=%d]' % wordlen dumpfile = 'num_seeds%s.txt' % suffix sim_data = sim_count_seeds( ns=ns, n_samples=n_samples, gap=gap, subst=subst, wordlen=wordlen, dumpfile=dumpfile, ignore_existing=False) plot_count_seeds_moments(sim_data, suffix=suffix)
@with_dumpfile def sim_count_seeds_segment(**kw): Ks, g_radii, n_samples = kw['Ks'], kw['g_radii'], kw['n_samples'] gap, subst, wordlen = kw['gap'], kw['subst'], kw['wordlen'] def _zero(): return np.zeros((len(Ks), len(g_radii), n_samples)) A = Alphabet('ACGT') seed_index_kw = { 'alphabet': A, 'wordlen': wordlen, 'path': kw.get('path', ':memory:'), 'log_level': WARN, } sim_data = { 'n_seeds': {'pos': _zero(), 'neg': _zero()}, 'p_hat': {'pos': _zero(), 'neg': _zero()}, 'gap': gap, 'match': (1 - gap) * (1 - subst), 'Ks': Ks, 'g_radii': g_radii, 'seed_index_kw': seed_index_kw, } M = MutationProcess(A, go_prob=gap, ge_prob=gap, subst_probs=subst) for K_idx, K in enumerate(Ks): log('K = %d' % K, newline=False) for idx in range(n_samples): sys.stderr.write('.') S_rel, T_rel = seq_pair(K, A, mutation_process=M) S_urel, T_urel = rand_seq(A, K), rand_seq(A, K) for key, (S, T) in zip(['pos', 'neg'], [(S_rel, T_rel), (S_urel, T_urel)]): seed_index = SeedIndex(S, T, **seed_index_kw) for g_idx, g_max in enumerate(g_radii): radius = band_radius(K, g_max, 1 - 1e-4) d_band = (-radius, radius) n_seeds = seed_index.seed_count(d_band=d_band) sim_data['n_seeds'][key][K_idx, g_idx, idx] = n_seeds area = 2 * radius * K word_p_null = (1./len(A)) ** wordlen word_p = (n_seeds - area * word_p_null) / K try: p_hat = np.exp(np.log(word_p) / wordlen) except Warning: # presumably this happened because word_p was too small p_hat = 0 p_hat = min(p_hat, 1) sim_data['p_hat'][key][K_idx, g_idx, idx] = p_hat sys.stderr.write('\n') return sim_data def plot_count_seeds_segment(sim_data, suffix=''): Ks, g_radii = sim_data['Ks'], sim_data['g_radii'] match = sim_data['match'] fig = plt.figure(figsize=(10, 4)) grids = gridspec.GridSpec(1, 2, width_ratios=[5, 3]) ax_p = fig.add_subplot(grids[0]) ax_rad = fig.add_subplot(grids[1]) pad = min(Ks) / 3 colors = color_code(g_radii) arrow_kw = {'marker': '>', 'c': 'k', 'markevery': 2, 'markersize': 2, 'lw': 1, 'alpha': .8} ax_p.plot([Ks[0] - .2 * pad, Ks[0] - .9 * pad], [match, match], **arrow_kw) ax_p.plot([Ks[0] - .2 * pad, Ks[0] - .9 * pad], [.25, .25], ls='--', **arrow_kw) kw_rad = {'marker': 'o', 'markersize': 3, 'lw': 1, 'alpha': .8} kw_p = {'marker': 'o', 'markersize': 5, 'lw': 3, 'alpha': .5} for g_idx, (g_max, color) in enumerate(zip(g_radii, colors)): label = '$g_{\max} = %.2f$' % g_max pos = sim_data['p_hat']['pos'][:, g_idx, :] neg = sim_data['p_hat']['neg'][:, g_idx, :] plot_with_sd(ax_p, Ks, neg, axis=1, color=color, ls='--', **kw_p) plot_with_sd(ax_p, Ks, pos, axis=1, color=color, label=label, **kw_p) ax_rad.plot(Ks, band_radii(Ks, g_max, 1 - 1e-4), color=color, label=label, **kw_rad) ax_p.set_ylim(-.2, 1.1) for ax in [ax_p, ax_rad]: ax.set_xlim(Ks[0] - pad, Ks[-1] + pad) ax.set_xscale('log') ax.set_xlabel('similarity length') ax.set_xticks(Ks) ax.set_xticklabels(Ks) ax.legend(loc='best') ax_p.set_ylabel('estimated match probability') ax_rad.set_ylabel('diagonal band radius') # n_samples = pos.shape[1] savefig(fig, 'num_seeds_segment%s.png' % suffix)
[docs]def exp_count_seeds_segment(): """Shows simulation results for alignment-free estimated match probability for globally homologous sequence pairs of length up to 25kbp. **Supported Claims** * The match probability estimator provided by :func:`biseqt.blot.WordBlot.estimate_match_probability` using the band radius provided by :func:`biseqt.blot.band_radius` are accurate for similarities of lengths up to 25kbp regardless of gap probability upper bound. Therefore, we can justify using generous overestimates of gap probability (e.g. :math:`g_{\max}=0.4`) in typical contexts. * For unrelated sequences our estimator reports a number close to .25 (one standard deviation range :math:`[0, .4]`). .. figure:: https://www.dropbox.com/s/gnvb8eiiezyysuq/ num_seeds_segment%5Bw%3D6%5D.png?raw=1 :target: https://www.dropbox.com/s/gnvb8eiiezyysuq/ num_seeds_segment%5Bw%3D6%5D.png?raw=1 :alt: lightbox For multiple values of maximum allowed gap probability :math:`g_{\max}` (which dictates diagonal band radius), estimated match probability (using word length 6) is shown as a function of sequence length (*left*) for globally homologous sequences (solid lines) and unrelated sequences (dashed lines), n=50 samples, shaded regions show one standard deviation. Homologous sequences were simulated by mutations with gap probability 0.1 and substitution probability 0.15 (hence a match probability of 0.77 indicated by a solid arrow (note agreement with Word-Blot estimation), the dashed arrow shows the 0.25 point). For each value of :math:`g_{\max}`, the corresponding band radius is shown as a function of similarity length (right). Horizontal axes in both plots is in log scale. """ Ks = [200 * 2 ** i for i in range(8)] g_radii = [.05, .1, .2, .4] gap = .1 subst = .15 n_samples = 50 wordlen = 6 suffix = '[w=%d]' % wordlen dumpfile = 'num_seeds_segment%s.txt' % suffix sim_data = sim_count_seeds_segment( Ks=Ks, g_radii=g_radii, n_samples=n_samples, gap=gap, subst=subst, wordlen=wordlen, dumpfile=dumpfile) plot_count_seeds_segment(sim_data, suffix=suffix)
if __name__ == '__main__': exp_count_seeds() exp_count_seeds_segment()