Source code for experiments.band_radius

#!/usr/bin/env python
import numpy as np
from matplotlib import gridspec
from matplotlib import pyplot as plt
from bisect import bisect_left
from scipy.special import erf, erfcinv
from scipy.ndimage.filters import gaussian_filter1d
import re

from util import log, color_code, plot_with_sd, with_dumpfile, savefig
from util import sample_opseq, estimate_gap_probs_in_opseq
from util import load_fasta, opseq_path

from biseqt.seeds import SeedIndex
from biseqt.blot import band_radius


def time_in_band(K, gap, radius):
    alpha = radius / (np.sqrt(2 * gap * K))

    return erf(alpha) + alpha * (2 / np.sqrt(np.pi) * np.exp(-alpha ** 2) -
                                 2 * alpha * (1 - erf(alpha)))


def sample_edit_sequences(K, g, n_samples, bio=False):
    if bio:
        pws_path = 'data/actn2/actn2-7vet-pws.fa'
        opseq = ''
        with open(pws_path) as f:
            for seq, name, _ in load_fasta(f):
                opseq += seq
        opseq = re.sub('D{10,}', 'D', opseq)
        opseq = re.sub('I{10,}', 'I', opseq)
        return sample_opseq(opseq, K, n_samples, g)
    else:
        return (''.join(np.random.choice(list('MID'),
                        p=[1-g, g / 2, g / 2], size=K))
                for _ in range(n_samples))


@with_dumpfile
def sim_time_in_band(K, gs, rs, n_samples, **kw):
    sim_data = {
        'in_band': {'simulated': np.zeros((len(gs), len(rs), n_samples)),
                    'biological': np.zeros((len(gs), len(rs), n_samples))},
        'gs': gs,
        'rs': rs,
        'K': K,
    }
    for g_idx, g in enumerate(gs):
        d0 = K
        for key, bio in zip(['simulated', 'biological'], [False, True]):
            log('sampling homologies for K = %d, g = %.2f (%s data)' %
                (K, g, key))
            samples = sample_edit_sequences(K, g, n_samples, bio=bio)
            for sample_idx, opseq in enumerate(samples):
                time_at_d_ = np.zeros(2*K)
                i, j = 0, 0
                for op in opseq:
                    d = i - j
                    time_at_d_[d + d0] += 1
                    if op in 'DM':
                        i += 1
                    if op in 'IM':
                        j += 1
                cum_time_at_d_ = np.cumsum(time_at_d_)
                for r_idx, r in enumerate(rs):
                    in_band = cum_time_at_d_[d0 + r] - cum_time_at_d_[d0 - r]
                    prop = in_band / K
                    assert prop <= 1
                    sim_data['in_band'][key][g_idx][r_idx][sample_idx] = prop
    return sim_data


def plot_time_in_band(sim_data, cutoff_epsilon):
    gs = sim_data['gs']
    rs = sim_data['rs']
    K = sim_data['K']
    n_samples = sim_data['in_band']['simulated'].shape[2]

    fig = plt.figure(figsize=(10, 8))

    grids = gridspec.GridSpec(3, 2, height_ratios=[1, 1, 1])

    ax_mod = fig.add_subplot(grids[:2, 0])  # model
    ax_sim = fig.add_subplot(grids[2, 0])  # simulated data
    ax_bio = fig.add_subplot(grids[2, 1])  # biological data
    ax_rw1 = fig.add_subplot(grids[0, 1])
    ax_rw2 = fig.add_subplot(grids[1, 1])

    colors = color_code(gs, cmap='PuOr')

    for g_idx, (color, g, ax_rw) in enumerate(zip(colors, gs,
                                                  [ax_rw1, ax_rw2])):
        r_lim = rs[bisect_left(
            sim_data['in_band']['biological'][g_idx].mean(axis=1),
            1 - .5 * cutoff_epsilon
        )]
        r_cutoff = band_radius(K, g, 1 - cutoff_epsilon)

        vs = [erf(r / (2 * np.sqrt(g * K))) for r in rs]
        us = [time_in_band(K, g, r) for r in rs]

        kw = {'color': color, 'lw': 1.5, 'alpha': .8}
        ax_mod.plot(rs, vs, label='$g = %.2f$' % g, **kw)  # simplified model
        ax_mod.plot(rs, us, ls='--', **kw)                 # full correct model
        ax_mod.axvline(r_cutoff, color=color, lw=5, alpha=.3)

        ax_mod.set_title('model predictions', fontsize=10)
        ax_mod.grid(True)
        ax_mod.set_xlabel('diagonal band radius')
        ax_mod.set_ylabel('proportion of time in band')
        ax_mod.legend(loc='lower right', fontsize=12)
        ax_mod.set_xlim(0, r_lim)
        ax_mod.set_ylim(0, 1.2)

        kw = {'lw': 1, 'alpha': .2, 'color': color}
        for opseq in sample_edit_sequences(K, g, n_samples, bio=False):
            xs, ys = opseq_path(opseq, x0=0, y0=0)
            ds = [SeedIndex.to_diagonal_coordinates(x, y)[0]
                  for x, y in zip(xs, ys)]
            ax_rw.plot(range(K + 1), ds, **kw)

        for eps, lw, alpha, label in zip([1e-1, 1e-6], [2, 3], [.7, .3],
                                         ['$\epsilon=10^{-1}$',
                                          '$\epsilon=10^{-6}$']):

            def radius_(k): return erfcinv(eps) * np.sqrt(2 * g) * np.sqrt(k)

            plot_kw = {'c': 'k', 'alpha': alpha, 'lw': lw}
            radii = np.array([radius_(k) for k in range(K)])
            ax_rw.plot(range(K), radii, label=label, **plot_kw)
            ax_rw.plot(range(K), -radii, **plot_kw)

        ax_rw.legend(loc='upper left', fontsize=8)
        ax_rw.set_ylim(-50, 50)
        ax_rw.set_xlabel('number of alignment steps')
        ax_rw.set_ylabel('diagonal position $d$')

        for key, ax in zip(['simulated', 'biological'], [ax_sim, ax_bio]):
            res = sim_data['in_band'][key][g_idx, :, :]

            r_eff = rs[bisect_left(vs, 1 - cutoff_epsilon)]
            in_band_eff = sim_data['in_band'][key][g_idx, r_eff, :].mean()
            label = 'out of band: \%%%.2f' % (100 * (1 - in_band_eff))

            plot_with_sd(ax, rs, res, axis=1, y_max=1, color=color, lw=1.5,
                         label=label)
            ax.axvline(r_cutoff, color=color, lw=5, alpha=.3)
            ax.set_xlim(0, r_lim)
            ax.set_ylim(0, 1.2)
            ax.grid(True)
            ax.set_title('%s data' % key, fontsize=10)
            ax.set_xlabel('diagonal band radius', fontsize=8)
            ax.set_ylabel('proportion of time in band', fontsize=8)
            ax.legend(loc='lower right', fontsize=8)

    savefig(fig, 'band_radius.png', comment='time in band')


[docs]def exp_time_in_band(): """Shows theoretical and empirical results for proportion of alignment time (i.e. number of edit operations) spent within diagonal bands of varying radius. **Supported Claims** * The random walk model for capturing the probability distribution of diagonal positions is accurate. * Calculating the band radius such that the *endpoint* of an alignment is within diagonal band with probability at least :math:`1 - \epsilon` is a good conservative estimate for the more reasonable requirement that the *average proportion* of time spent in band is at least :math:`1 - \epsilon`. * Calculated band radii are accurate and reliable when applied to simulated as well as biological edit sequences. .. figure:: https://www.dropbox.com/s/93cg9t4oh0h2guh/band_radius.png?raw=1 :target: https://www.dropbox.com/s/93cg9t4oh0h2guh/band_radius.png?raw=1 :alt: lightbox Predicted proportion of time spent in diagonal band (*top left*) in alignments of total length K=500 according to full (dashed) and approximate (solid) probabilistic model and for two values of gap probabilities. Vertical lines indicate the calculated band radius for %.01 accuracy. Simulated alignments (n=500 samples) are drawn in diagonal coordinates as a function of alignment time for gap probability 0.05 (*right top*) and 0.15 (*right middle*) with bounds obtained from :func:`biseqt.blot.band_radius` for two sensitivity values. Proportion of time spent in band as a function of band radius is shown for simulated sequences (*bottom left*) and real biological data (*bottom right*), and in each case the empirical percentage of time spent *outside* of band is reported; shaded regions indicate one standard deviation. Biological alignments are chosen from projected pairwise alignments of *alpha-Actinin 2 (ACTN2)* for 7 vertebrates obtained from UCSC genome browser. Each sample biological edit sequence is chosen such that it has the desired length and gap probability; gaps of length more than 10 nucleotides are removed to avoid nonstationary gap probabilities. """ K = 500 gs = [.05, .15] n_samples = 500 rs = range(0, 400) cutoff_epsilon = 1e-4 # for vertical lines showing calculated cutoff dumpfile = 'band_radius.txt' sim_data = sim_time_in_band(K, gs, rs, n_samples, dumpfile=dumpfile) plot_time_in_band(sim_data, cutoff_epsilon)
[docs]def exp_gap_probs(): """Shows gap probability variability in simulated and biological edit sequences. **Supported Claims** * Gap probabilities are nonstationary in biological data. * Removing long indel stretches mitigates this nonstationarity for the most part and allows us to treat gap probability in biological alignments as stationary. .. figure:: https://www.dropbox.com/s/vplg0x22ykc47am/gap_probabilities.png?raw=1 :target: https://www.dropbox.com/s/vplg0x22ykc47am/gap_probabilities.png?raw=1 :alt: lightbox Gap probability variability along alignments (length K=1200) in simulated (black) and biological edit sequences (green), before (left) and after (right) removing long indel stretches (deletion or insertion stretches of length at least 10) from biological samples; n=50 samples in both cases. Local gap probabilites are calculated in a window of radius 100 nt. Simulated edit sequences (black) are generated using stationary gap probability (i.e constant along alignment) of 0.15. Biological edit sequences are sampled from projected pairwise alignments of *Actinin 2* for 7 vertebrates obtained from UCSC genome browser such that the overall gap probability (i.e over K=1200 operations) is 0.15. """ pws_path = 'data/actn2/actn2-7vet-pws.fa' full_opseq = '' with open(pws_path) as f: for seq, name, _ in load_fasta(f): full_opseq += seq fig = plt.figure(figsize=(10, 4)) ax = fig.add_subplot(1, 2, 1) ax_stationary = fig.add_subplot(1, 2, 2) gap = .15 n_samples = 50 K = 1200 radius = 100 for mode, ax in zip(['stationary', 'original'], [ax_stationary, ax]): opseq = full_opseq if mode == 'stationary': opseq = re.sub('D{10,}', 'D', opseq) opseq = re.sub('I{10,}', 'I', opseq) real_samples = sample_opseq(opseq, K, n_samples, gap) real_labeled = False for real_sample in real_samples: label = 'biological' if not real_labeled else None real_gaps = gaussian_filter1d( estimate_gap_probs_in_opseq(real_sample, radius), radius/5) ax.plot(range(K), real_gaps, c='g', lw=1, alpha=.2, label=label) real_labeled = True sim_labeled = False for _ in range(n_samples): sim_sample = ''.join( np.random.choice( list('MID'), p=[1 - gap, gap / 2, gap / 2], size=K ) ) label = 'simulated' if not sim_labeled else None sim_gaps = gaussian_filter1d( estimate_gap_probs_in_opseq(sim_sample, radius), radius/5) ax.plot(range(K), sim_gaps, c='k', lw=1, alpha=.4, label=label) sim_labeled = True ax.axhline(y=gap, c='k', alpha=.2, lw=3) ax.grid(True) ax.legend(loc='upper right', fontsize=8) ax.set_xlim(radius, K - radius) ax.set_xlabel('position in edit sequence') ax.set_ylabel('gap probability') ax.set_ylim(-.05, .6) savefig(fig, 'gap_probabilities.png')
if __name__ == '__main__': exp_time_in_band() exp_gap_probs()