Source code for experiments.band_radius

#!/usr/bin/env python
import numpy as np
from matplotlib import gridspec
from matplotlib import pyplot as plt
from bisect import bisect_left
from scipy.special import erf, erfcinv
from scipy.ndimage.filters import gaussian_filter1d
import re

from util import log, color_code, plot_with_sd, with_dumpfile, savefig
from util import sample_opseq, estimate_gap_probs_in_opseq
from util import load_fasta, opseq_path

from biseqt.seeds import SeedIndex
from biseqt.blot import band_radius


def time_in_band(K, gap, radius):
    alpha = radius / (np.sqrt(2 * gap * K))

    return erf(alpha) + alpha * (2 / np.sqrt(np.pi) * np.exp(-alpha ** 2) -
                                 2 * alpha * (1 - erf(alpha)))


def sample_edit_sequences(K, g, n_samples, bio=False):
    if bio:
        pws_path = 'data/actn2/actn2-7vet-pws.fa'
        opseq = ''
        with open(pws_path) as f:
            for seq, name, _ in load_fasta(f):
                opseq += seq
        opseq = re.sub('D{10,}', 'D', opseq)
        opseq = re.sub('I{10,}', 'I', opseq)
        return sample_opseq(opseq, K, n_samples, g)
    else:
        return (''.join(np.random.choice(list('MID'),
                        p=[1-g, g / 2, g / 2], size=K))
                for _ in range(n_samples))


@with_dumpfile
def sim_time_in_band(K, gs, rs, n_samples, **kw):
    sim_data = {
        'in_band': {'simulated': np.zeros((len(gs), len(rs), n_samples)),
                    'biological': np.zeros((len(gs), len(rs), n_samples))},
        'gs': gs,
        'rs': rs,
        'K': K,
    }
    for g_idx, g in enumerate(gs):
        d0 = K
        for key, bio in zip(['simulated', 'biological'], [False, True]):
            log('sampling homologies for K = %d, g = %.2f (%s data)' %
                (K, g, key))
            samples = sample_edit_sequences(K, g, n_samples, bio=bio)
            for sample_idx, opseq in enumerate(samples):
                time_at_d_ = np.zeros(2*K)
                i, j = 0, 0
                for op in opseq:
                    d = i - j
                    time_at_d_[d + d0] += 1
                    if op in 'DM':
                        i += 1
                    if op in 'IM':
                        j += 1
                cum_time_at_d_ = np.cumsum(time_at_d_)
                for r_idx, r in enumerate(rs):
                    in_band = cum_time_at_d_[d0 + r] - cum_time_at_d_[d0 - r]
                    prop = in_band / K
                    assert prop <= 1
                    sim_data['in_band'][key][g_idx][r_idx][sample_idx] = prop
    return sim_data


def plot_time_in_band(sim_data, cutoff_epsilon):
    gs = sim_data['gs']
    rs = sim_data['rs']
    K = sim_data['K']
    n_samples = sim_data['in_band']['simulated'].shape[2]

    fig = plt.figure(figsize=(10, 8))

    grids = gridspec.GridSpec(3, 2, height_ratios=[1, 1, 1])

    ax_mod = fig.add_subplot(grids[:2, 0])  # model
    ax_sim = fig.add_subplot(grids[2, 0])  # simulated data
    ax_bio = fig.add_subplot(grids[2, 1])  # biological data
    ax_rw1 = fig.add_subplot(grids[0, 1])
    ax_rw2 = fig.add_subplot(grids[1, 1])

    colors = color_code(gs, cmap='PuOr')

    for g_idx, (color, g, ax_rw) in enumerate(zip(colors, gs,
                                                  [ax_rw1, ax_rw2])):
        r_lim = rs[bisect_left(
            sim_data['in_band']['biological'][g_idx].mean(axis=1),
            1 - .5 * cutoff_epsilon
        )]
        r_cutoff = band_radius(K, g, 1 - cutoff_epsilon)

        vs = [erf(r / (2 * np.sqrt(g * K))) for r in rs]
        us = [time_in_band(K, g, r) for r in rs]

        kw = {'color': color, 'lw': 1.5, 'alpha': .8}
        ax_mod.plot(rs, vs, label='$g = %.2f$' % g, **kw)  # simplified model
        ax_mod.plot(rs, us, ls='--', **kw)                 # full correct model
        ax_mod.axvline(r_cutoff, color=color, lw=5, alpha=.3)

        ax_mod.set_title('model predictions', fontsize=10)
        ax_mod.grid(True)
        ax_mod.set_xlabel('diagonal band radius')
        ax_mod.set_ylabel('proportion of time in band')
        ax_mod.legend(loc='lower right', fontsize=12)
        ax_mod.set_xlim(0, r_lim)
        ax_mod.set_ylim(0, 1.2)

        kw = {'lw': 1, 'alpha': .2, 'color': color}
        for opseq in sample_edit_sequences(K, g, n_samples, bio=False):
            xs, ys = opseq_path(opseq, x0=0, y0=0)
            ds = [SeedIndex.to_diagonal_coordinates(x, y)[0]
                  for x, y in zip(xs, ys)]
            ax_rw.plot(range(K + 1), ds, **kw)

        for eps, lw, alpha, label in zip([1e-1, 1e-6], [2, 3], [.7, .3],
                                         ['$\epsilon=10^{-1}$',
                                          '$\epsilon=10^{-6}$']):

            def radius_(k): return erfcinv(eps) * np.sqrt(2 * g) * np.sqrt(k)

            plot_kw = {'c': 'k', 'alpha': alpha, 'lw': lw}
            radii = np.array([radius_(k) for k in range(K)])
            ax_rw.plot(range(K), radii, label=label, **plot_kw)
            ax_rw.plot(range(K), -radii, **plot_kw)

        ax_rw.legend(loc='upper left', fontsize=8)
        ax_rw.set_ylim(-50, 50)
        ax_rw.set_xlabel('number of alignment steps')
        ax_rw.set_ylabel('diagonal position $d$')

        for key, ax in zip(['simulated', 'biological'], [ax_sim, ax_bio]):
            res = sim_data['in_band'][key][g_idx, :, :]

            r_eff = rs[bisect_left(vs, 1 - cutoff_epsilon)]
            in_band_eff = sim_data['in_band'][key][g_idx, r_eff, :].mean()
            label = 'out of band: \%%%.2f' % (100 * (1 - in_band_eff))

            plot_with_sd(ax, rs, res, axis=1, y_max=1, color=color, lw=1.5,
                         label=label)
            ax.axvline(r_cutoff, color=color, lw=5, alpha=.3)
            ax.set_xlim(0, r_lim)
            ax.set_ylim(0, 1.2)
            ax.grid(True)
            ax.set_title('%s data' % key, fontsize=10)
            ax.set_xlabel('diagonal band radius', fontsize=8)
            ax.set_ylabel('proportion of time in band', fontsize=8)
            ax.legend(loc='lower right', fontsize=8)

    savefig(fig, 'band_radius.png', comment='time in band')


[docs]def exp_time_in_band():
    """Shows theoretical and empirical results for proportion of alignment time
    (i.e. number of edit operations) spent within diagonal bands of varying
    radius.

    **Supported Claims**

    * The random walk model for capturing the probability distribution of
      diagonal positions is accurate.
    * Calculating the band radius such that the *endpoint* of an alignment is
      within diagonal band with probability at least :math:`1 - \epsilon` is a
      good conservative estimate for the more reasonable requirement that the
      *average proportion* of time spent in band is at least :math:`1 -
      \epsilon`.
    * Calculated band radii are accurate and reliable when applied to
      simulated as well as biological edit sequences.

    .. figure::
        https://www.dropbox.com/s/93cg9t4oh0h2guh/band_radius.png?raw=1
       :target:
        https://www.dropbox.com/s/93cg9t4oh0h2guh/band_radius.png?raw=1
       :alt: lightbox

       Predicted proportion of time spent in diagonal band (*top left*) in
       alignments of total length K=500 according to full (dashed) and
       approximate (solid) probabilistic model and for two values of gap
       probabilities. Vertical lines indicate the calculated band radius for
       %.01 accuracy.
       Simulated alignments (n=500 samples) are drawn in diagonal coordinates
       as a function of alignment time for gap probability 0.05 (*right top*)
       and 0.15 (*right middle*) with bounds obtained from
       :func:`biseqt.blot.band_radius` for two sensitivity values.
       Proportion of time spent in band as a function of band radius is shown
       for simulated sequences (*bottom left*) and real biological data
       (*bottom right*), and in each case the empirical percentage of time
       spent *outside* of band is reported; shaded regions indicate one
       standard deviation.  Biological alignments are chosen from projected
       pairwise alignments of *alpha-Actinin 2 (ACTN2)* for 7 vertebrates
       obtained from UCSC genome browser. Each sample biological edit sequence
       is chosen such that it has the desired length and gap probability; gaps
       of length more than 10 nucleotides are removed to avoid nonstationary
       gap probabilities.
    """
    K = 500
    gs = [.05, .15]
    n_samples = 500
    rs = range(0, 400)
    cutoff_epsilon = 1e-4  # for vertical lines showing calculated cutoff

    dumpfile = 'band_radius.txt'
    sim_data = sim_time_in_band(K, gs, rs, n_samples, dumpfile=dumpfile)
    plot_time_in_band(sim_data, cutoff_epsilon)


[docs]def exp_gap_probs():
    """Shows gap probability variability in simulated and biological edit
    sequences.

    **Supported Claims**

    * Gap probabilities are nonstationary in biological data.
    * Removing long indel stretches mitigates this nonstationarity for the most
      part and allows us to treat gap probability in biological alignments
      as stationary.

    .. figure::
        https://www.dropbox.com/s/vplg0x22ykc47am/gap_probabilities.png?raw=1
       :target:
        https://www.dropbox.com/s/vplg0x22ykc47am/gap_probabilities.png?raw=1
       :alt: lightbox

       Gap probability variability along alignments (length K=1200) in
       simulated (black) and biological edit sequences (green), before (left)
       and after (right) removing long indel stretches (deletion or insertion
       stretches of length at least 10) from biological samples; n=50 samples
       in both cases. Local gap probabilites are calculated in a window of
       radius 100 nt.  Simulated edit sequences (black) are generated using
       stationary gap probability (i.e constant along alignment) of 0.15.
       Biological edit sequences are sampled from projected pairwise alignments
       of *Actinin 2* for 7 vertebrates obtained from UCSC genome browser such
       that the overall gap probability (i.e over K=1200 operations) is 0.15.
    """
    pws_path = 'data/actn2/actn2-7vet-pws.fa'
    full_opseq = ''
    with open(pws_path) as f:
        for seq, name, _ in load_fasta(f):
            full_opseq += seq

    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot(1, 2, 1)
    ax_stationary = fig.add_subplot(1, 2, 2)

    gap = .15
    n_samples = 50
    K = 1200
    radius = 100

    for mode, ax in zip(['stationary', 'original'], [ax_stationary, ax]):
        opseq = full_opseq
        if mode == 'stationary':
            opseq = re.sub('D{10,}', 'D', opseq)
            opseq = re.sub('I{10,}', 'I', opseq)

        real_samples = sample_opseq(opseq, K, n_samples, gap)
        real_labeled = False
        for real_sample in real_samples:
            label = 'biological' if not real_labeled else None
            real_gaps = gaussian_filter1d(
                estimate_gap_probs_in_opseq(real_sample, radius), radius/5)
            ax.plot(range(K), real_gaps, c='g', lw=1, alpha=.2, label=label)
            real_labeled = True

        sim_labeled = False
        for _ in range(n_samples):
            sim_sample = ''.join(
                np.random.choice(
                    list('MID'), p=[1 - gap, gap / 2, gap / 2], size=K
                )
            )
            label = 'simulated' if not sim_labeled else None
            sim_gaps = gaussian_filter1d(
                estimate_gap_probs_in_opseq(sim_sample, radius), radius/5)
            ax.plot(range(K), sim_gaps, c='k', lw=1, alpha=.4, label=label)
            sim_labeled = True

        ax.axhline(y=gap, c='k', alpha=.2, lw=3)
        ax.grid(True)
        ax.legend(loc='upper right', fontsize=8)
        ax.set_xlim(radius, K - radius)
        ax.set_xlabel('position in edit sequence')
        ax.set_ylabel('gap probability')
        ax.set_ylim(-.05, .6)

    savefig(fig, 'gap_probabilities.png')


if __name__ == '__main__':
    exp_time_in_band()
    exp_gap_probs()