Source code for experiments.blot_stats

import numpy as np
from matplotlib import pyplot as plt
from matplotlib import gridspec
from itertools import product
from scipy.ndimage.filters import gaussian_filter1d

import sys
import logging

from biseqt.blot import WordBlot, find_peaks, band_radius
from biseqt.sequence import Alphabet
from biseqt.stochastics import rand_seq, MutationProcess

from util import plot_with_sd, color_code, plot_classifier
from util import with_dumpfile, log, savefig, load_fasta
from util import seq_pair

from util import plot_global_alignment, adjust_pw_plot
from util import estimate_match_probs_in_opseq, fill_in_unknown
from util import plot_scored_seeds, seeds_from_opseq
from util import opseq_path, plot_local_alignment
from util import plot_cdf
from util import estimate_gap_probs_in_opseq
from biseqt.pw import Aligner, BANDED_MODE, B_GLOBAL


@with_dumpfile
def sim_simulated_K_p(Ks, ps, n_samples, **kw):
    shape = (len(Ks), len(ps), n_samples)

    A = Alphabet('ACGT')
    wordlen = kw['wordlen']
    WB_kw = {
        'g_max': kw.get('g_max', .6),
        'sensitivity': kw.get('sensitivity', .99),
        'wordlen': wordlen,
        'alphabet': A,
        'path': kw.get('db_path', ':memory:'),
        'log_level': kw.get('log_level', logging.WARNING),
    }
    sim_data = {
        'scores': {'H0': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
                   'H1': {'pos': np.zeros(shape), 'neg': np.zeros(shape)}},
        'K_hat': {'pos': {'all': np.zeros(shape), 'longest': np.zeros(shape)},
                  'neg': {'all': np.zeros(shape), 'longest': np.zeros(shape)}},
        'p_hat': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
        'a_hat': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
        'd_min': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
        'd_max': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
        'WB_kw': WB_kw,
        'Ks': Ks,
        'ps': ps,
    }
    K_min = 100
    assert K_min <= min(Ks)

    for (K_idx, K), (p_idx, p_match) in product(enumerate(Ks), enumerate(ps)):
        p_min = p_match

        log('simulating (%d samples) K = %d, p = %.2f' %
            (n_samples, K, p_match), newline=False)
        for idx in range(n_samples):
            sys.stderr.write('.')
            # distribute p_match evenly over gap and subst
            subst = gap = 1 - np.sqrt(p_match)
            assert abs((1 - gap) * (1 - subst) - p_match) < 1e-3
            M = MutationProcess(A, subst_probs=subst, ge_prob=gap,
                                go_prob=gap)
            S_rel, T_rel = seq_pair(K, A, mutation_process=M)
            S_rel = rand_seq(A, K / 2) + S_rel + rand_seq(A, K / 2)
            T_rel = rand_seq(A, K / 2) + T_rel + rand_seq(A, K / 2)
            S_urel, T_urel = rand_seq(A, 2 * K), rand_seq(A, 2 * K)

            for key, (S, T) in zip(['pos', 'neg'],
                                   [(S_rel, T_rel), (S_urel, T_urel)]):
                WB = WordBlot(S, T, **WB_kw)

                # calculate H0/H1 scores with perfect information:
                band_r = WB.band_radius(K)
                num_seeds = WB.seed_count(d_band=(-band_r, band_r),
                                          a_band=(K, 3 * K))
                s0, s1 = WB.score_num_seeds(num_seeds=num_seeds,
                                            area=2 * band_r * K,
                                            seglen=K, p_match=p_match)
                sim_data['scores']['H0'][key][K_idx, p_idx, idx] = s0
                sim_data['scores']['H1'][key][K_idx, p_idx, idx] = s1

                results = list(WB.similar_segments(K_min, p_min,
                                                   at_least_one=True))

                # sum of K_hat and longest K_hat
                sim_data['K_hat'][key]['all'][K_idx, p_idx, idx] = sum(
                    r['segment'][1][1] - r['segment'][1][0]
                    for r in results) / 2
                sim_data['K_hat'][key]['longest'][K_idx, p_idx, idx] = max(
                    r['segment'][1][1] - r['segment'][1][0]
                    for r in results) / 2

                # pick the longest detected homology for p_hat, d_hat, a_hat
                hom = max(
                    results,
                    key=lambda r: r['segment'][1][1] - r['segment'][1][0]
                )
                sim_data['p_hat'][key][K_idx, p_idx, idx] = hom['p']

                sim_data['a_hat'][key][K_idx, p_idx, idx] = \
                    (hom['segment'][1][1] + hom['segment'][1][0]) / 2
                sim_data['d_min'][key][K_idx, p_idx, idx] = \
                    hom['segment'][0][0]
                sim_data['d_max'][key][K_idx, p_idx, idx] = \
                    hom['segment'][0][1]

        # H0_neg = sim_data['scores']['H0']['neg'][K_idx, p_idx, :].mean()
        # H1_pos = sim_data['scores']['H1']['pos'][K_idx, p_idx, :].mean()
        # sys.stderr.write('H0: %.2f, H1: %.2f' % (H0_neg, H1_pos))
        sys.stderr.write('\n')
    return sim_data


def plot_simulated_K_p(sim_data, select_Ks, select_ps, suffix=''):
    Ks, ps = sim_data['Ks'], sim_data['ps']
    scores = sim_data['scores']

    kw = {'marker': 'o', 'markersize': 3, 'alpha': .7, 'lw': 1}
    truth_kw = {'ls': '--', 'alpha': .3, 'color': 'k', 'lw': 3}

    # ======================================
    # score by varying K for select ps
    # score varying p for select Ks
    fig_scores = plt.figure(figsize=(8, 7))
    ax_H0_K = fig_scores.add_subplot(2, 2, 1)
    ax_H0_p = fig_scores.add_subplot(2, 2, 2)
    ax_H1_K = fig_scores.add_subplot(2, 2, 3)
    ax_H1_p = fig_scores.add_subplot(2, 2, 4)

    for mode, ax in zip(['H0', 'H1'], [ax_H0_K, ax_H1_K]):
        colors = color_code(select_ps)
        for p, color in zip(select_ps, colors):
            p_idx = ps.index(p)
            for case, ls in zip(['pos', 'neg'], ['-', '--']):
                label = '' if case == 'neg' else 'p = %.2f' % p
                plot_with_sd(ax, Ks, scores[mode][case][:, p_idx, :], axis=1,
                             color=color, ls=ls, label=label, **kw)
        ax.set_ylabel('%s score' % mode)
        ax.set_xlabel('similarity length')
        ax.set_xscale('log')
        ax.set_xticks(Ks)
        ax.set_xticklabels(Ks)
        ax.legend(loc='best')

    for mode, ax in zip(['H0', 'H1'], [ax_H0_p, ax_H1_p]):
        colors = color_code(select_Ks)
        for K, color in zip(select_Ks, colors):
            K_idx = Ks.index(K)
            for case, ls in zip(['pos', 'neg'], ['-', '--']):
                label = '' if case == 'neg' else 'K = %d' % K
                plot_with_sd(ax, ps, scores[mode][case][K_idx, :, :], axis=1,
                             color=color, ls=ls, label=label, **kw)
        ax.set_ylabel('%s score' % mode)
        ax.set_xlabel('similarity match probability')
        ax.set_xticks(ps)
        ax.set_xticklabels(ps)
        ax.legend(loc='best')
    savefig(fig_scores, 'simulations[scores]%s.png' % suffix)

    # =====================================
    fig_hats = plt.figure(figsize=(8, 8))
    ax_K = fig_hats.add_subplot(2, 2, 1)
    ax_p = fig_hats.add_subplot(2, 2, 3)
    ax_d = fig_hats.add_subplot(2, 2, 2)
    ax_a = fig_hats.add_subplot(2, 2, 4)

    # wordblot p_hats are with respect to projected alignment lengths whereas
    # p_true which is a property of MutationProcess is with respect to
    # alignment length; correction via p_hats *= (K/L) = (1 - g/2)

    p_hats = sim_data['p_hat']
    p_hats['pos'] *= (1 + p_hats['pos']) / 2
    p_hats['neg'] *= (1 + p_hats['neg']) / 2

    # estimated Ks for select ps
    K_hats = [sim_data['K_hat']['pos']['all'],
              sim_data['K_hat']['pos']['longest'],
              sim_data['K_hat']['neg']['longest']]
    lss = ['-', '--', ':']
    markers = ['o', 'o', 'x']
    show_sds = [True, False, False]

    colors = color_code(select_ps)
    for p, color in zip(select_ps, colors):
        p_idx = ps.index(p)

        kw_ = {'markersize': 3, 'alpha': .7, 'lw': 1, 'color': color,
               'label': 'p = %.2f' % p}
        labeled = False
        for K_hat, ls, marker, sd in zip(K_hats, lss, markers, show_sds):
            kw_['ls'], kw_['marker'] = ls, marker
            if sd:
                plot_with_sd(ax_K, Ks, K_hat[:, p_idx, :], axis=1, **kw_)
            else:
                ax_K.plot(Ks, K_hat[:, p_idx, :].mean(axis=1), **kw_)
            if not labeled:
                kw_.pop('label')
                labeled = True

        ax_K.set_xscale('log')
        ax_K.set_yscale('symlog')
        ax_K.set_xticks(Ks)
        ax_K.set_xticklabels(Ks, rotation=90)
    ax_K.set_ylabel('estimated similarity length')
    ax_K.set_xlabel('true similarity length')
    ax_K.plot(Ks, Ks, **truth_kw)
    ax_K.legend(loc='upper left')

    # estimated ps for select Ks
    colors = color_code(select_Ks)
    kw_ = kw.copy()
    kw_['ls'] = '--'
    for K, color in zip(select_Ks, colors):
        K_idx = Ks.index(K)
        p_hats = sim_data['p_hat']['pos']
        plot_with_sd(ax_p, ps, p_hats[K_idx, :, :], axis=1,
                     color=color, label='K = %d' % K, **kw)

        p_hats = sim_data['p_hat']['neg']
        ax_p.plot(ps, p_hats[K_idx, :, :].mean(axis=1), color=color, **kw_)

        ax_p.set_xticks(ps)
        ax_p.set_xticklabels(ps, rotation=90)
    ax_p.set_ylabel('estimated match probability')
    ax_p.set_xlabel('true match probability')
    ax_p.plot(ps, ps, **truth_kw)
    ax_p.legend(loc='upper left')

    # ======================================
    # estimated diagonal and antidiagonal position and band radius for select
    # match probabilities (select_ps), as a function of K
    d_mins = sim_data['d_min']['pos']
    d_maxs = sim_data['d_max']['pos']
    a_hats = sim_data['a_hat']['pos']

    colors = color_code(select_ps)
    for p, color in zip(select_ps, colors):
        label = 'p = %.2f' % p
        kw = {'color': color, 'alpha': .6, 'lw': 1,
              'marker': 'o', 'markersize': 3}
        p_idx = ps.index(p)
        d_ctrs = (d_mins[:, p_idx, :] + d_maxs[:, p_idx, :]) / 2
        plot_with_sd(ax_d, Ks, d_ctrs, axis=1, label=label, **kw)
        plot_with_sd(ax_a, Ks, a_hats[:, p_idx, :], axis=1, label=label, **kw)
        for ax in [ax_d, ax_a]:
            ax.set_xscale('log')
            ax.set_xticks(Ks)
            ax.set_xticklabels(Ks, rotation=90, fontsize=6)
            ax.set_xlabel('similarity length')
        ax_a.set_yscale('log')

    max_d = np.sqrt(max(Ks))
    ax_d.set_ylim(-max_d, max_d)
    ax_d.set_ylabel('estimated diagonal position')
    ax_d.legend(loc='upper left', fontsize=8)

    ax_a.plot(Ks, [2 * K for K in Ks], **truth_kw)
    ax_a.set_ylabel('estimated antidiagonal position')
    ax_a.legend(loc='upper left', fontsize=8)

    savefig(fig_hats, 'simulations[estimates]%s.png' % suffix)


[docs]def exp_simulated_K_p():
    """Performance of Word-Blot and associated statistical scores for pairwise
    local similarity search in *simulations*:

    * z-scores assigned to diagonal strips with and without
      similarities under the corresponding limiting Normal distributions
      (:math:`H0, H1`), for varying similarity lengths and match probabilities.
    * Estimated coordinates, lengths and match probabilities of local
      similarities for varying similarity lengths and match probabilities. In
      each trial with similarity length :math:`K` and match probability
      :math:`p`, two pairs of input sequences are provided to Word-Blot:
      a pair of sequences of length :math:`2K` whose substrings at positions
      :math:`[\\frac{K}{2}, \\frac{3K}{2}]` are homologies of length :math:`K`
      and match probability :math:`p`, and two unrelated sequences of length
      :math:`2K` are considered.

    **Supported Claims**

    * z-scores calculated by :func:`biseqt.blot.WordBlot.score_num_seeds`
      against the limiting normal distributions :math:`H_0, H_1` (unrelated and
      related) are stable and comparable across different similarity lengths
      and match probabilities; thus both scores are reliable statistics.
    * Local similarities found by Word-Blot as per
      :func:`biseqt.blot.WordBlot.similar_segments` are accurate in length,
      estimated match probability, and coordinates.

    .. figure::
        https://www.dropbox.com/s/s38hi2oo9pp78ul/
        simulations%5Bscores%5D.png?raw=1
       :target:
        https://www.dropbox.com/s/s38hi2oo9pp78ul/
        simulations%5Bscores%5D.png?raw=1
       :alt: lightbox

       Z-scores of the number of seeds in diagonal strip for related (solid
       lines) and unrelated (dashed lines) segments of varying lengths,
       calculated against the limiting distribution for unrelated pairs of
       sequences (*left*) and related pairs (*right*) as a function of
       similarity length (*top*) and match probability (*bottom*), n=50
       samples, shaded regions indicate one standard deviation. Note that, as
       desired, H0 score is stable for unrelated sequences of any length and H1
       score is stable for related sequences of any length or match
       probability.

    .. figure::
        https://www.dropbox.com/s/lqm4s4evmbkk4as/
        simulations%5Bestimates%5D.png?raw=1
       :target:
        https://www.dropbox.com/s/lqm4s4evmbkk4as/
        simulations%5Bestimates%5D.png?raw=1
       :alt: lightbox

       Estimated length (*top left*), match probability (*bottom left*),
       diagonal position (*top right*) and antidiagonal position (*bottom
       right*) of local similarities, word length is 7, n=20 samples, shaded
       regions indicate one standard deviation. Dashed black lines are ground
       truth for comparison.  For length estimates both the sum of the lengths
       of all reported similar segments (solid) and the length of the longest
       similar segment (dashed) are shown. Estimated match probablity and
       Length estimates for unrelated pairs are also shown (dotted). Estimated
       match probability, diagonal and antidiagonal coordinates correspond to
       the longest local similarity.
    """
    Ks = [100 * 2 ** i for i in range(1, 7)]
    select_Ks = Ks[1], Ks[3], Ks[5]

    ps = [round(1 - .06 * i, 2) for i in range(1, 8)]
    select_ps = ps[0], ps[2], ps[5]

    n_samples = 20
    wordlen = 7

    suffix = ''
    dumpfile = 'simulations%s.txt' % suffix
    sim_data = sim_simulated_K_p(
        Ks, ps, n_samples, wordlen=wordlen,
        dumpfile=dumpfile, ignore_existing=False)
    plot_simulated_K_p(sim_data, select_Ks, select_ps, suffix=suffix)


@with_dumpfile
def sim_comp_aligned_genes(seqs, pws, **kw):
    A = Alphabet('ACGT')
    wordlen, K_min, p_min = kw['wordlen'], kw['K_min'], kw['p_min']
    WB_kw = {
        'g_max': kw.get('g_max', .3),
        'sensitivity': kw.get('sensitivity', .99),
        'wordlen': wordlen,
        'alphabet': A,
        'path': kw.get('db_path', ':memory:'),
        'log_level': kw.get('log_level', logging.WARNING),
    }
    qM = 1 / p_min - 1
    qS = qG = -1

    aligner_kw = {
        'match_score': qM,
        'mismatch_score': qS,
        'ge_score': qG,
        'go_score': 0,
        'alnmode': BANDED_MODE,
        'alntype': B_GLOBAL,
    }

    similar_segments_kw = {'K_min': K_min, 'p_min': p_min}
    sim_data = {
        'pws': pws,
        'seqlens': {name: len(seqs[name]) for name in seqs},
        'seed_pos': [],
        'seed_neg': [],
        'seeds': {key: {} for key in pws},
        'opseq_seeds': {key: [] for key in pws},
        'segments': {key: {} for key in pws},
        'WB_kw': WB_kw,
        'similar_segments_kw': similar_segments_kw,
        'aligner_kw': aligner_kw,
    }
    for idx, ((id1, id2), opseq) in enumerate(pws.items()):
        log('finding local homologies between %s (%d) and %s (%d)' %
            (id1, sim_data['seqlens'][id1], id2, sim_data['seqlens'][id2]))
        S = seqs[id1]
        T = seqs[id2]

        WB = WordBlot(S, T, **WB_kw)
        sim_data['seeds'][(id1, id2)] = {
            WB.to_ij_coordinates(*rec['seed']): rec['p']
            for rec in WB.score_seeds(K_min)
        }
        log('-> found %d exactly matching %d-mers' %
            (len(sim_data['seeds'][(id1, id2)]), wordlen))

        # separate +/- seeds with their estimated probabilities
        band_r = band_radius(K_min, sim_data['WB_kw']['g_max'],
                             sim_data['WB_kw']['sensitivity'])
        in_band = np.zeros((len(S), len(T)))
        opseq_seeds = list(seeds_from_opseq(opseq, wordlen))
        sim_data['opseq_seeds'][(id1, id2)] = opseq_seeds
        for _, seed in opseq_seeds:
            d, a = WB.to_diagonal_coordinates(*seed)
            for d_ in range(d - band_r, d + band_r):
                for a_ in range(a - band_r, a + band_r):
                    i_, j_ = WB.to_ij_coordinates(d_, a_)
                    if 0 <= i_ < len(S) and 0 <= j_ < len(T):
                        in_band[i_, j_] = 1

        for seed, p in sim_data['seeds'][(id1, id2)].items():
            if in_band[seed[0], seed[1]]:
                sim_data['seed_pos'].append(p)
            else:
                sim_data['seed_neg'].append(p)
        log('separated +/- seeds')

        sim_data['segments'][(id1, id2)] = list(
            WB.similar_segments(**similar_segments_kw)
        )
        log('found %d similar segments' %
            len(sim_data['segments'][(id1, id2)]))

        # separate +/- coords
        probs = estimate_match_probs_in_opseq(opseq, wordlen)
        xs, ys = opseq_path(opseq, x0=0, y0=0)
        segments_on_opseq = find_peaks(probs, 5, p_min)
        hom_coords = np.zeros((len(S) + 1, len(T) + 1))
        for start, end in segments_on_opseq:
            for aln_pos in range(start, end + 1):
                i, j = xs[aln_pos], ys[aln_pos]
                hom_coords[i, j] = 1

        # run DP alignment on found segments
        for idx, seg_info in enumerate(sim_data['segments'][(id1, id2)]):
            seg = seg_info['segment']
            (i_start, i_end), (j_start, j_end) = WB.to_ij_coordinates_seg(seg)
            # to_ij_coordinates_seg might overflow; fix it
            i_end = min(sim_data['seqlens'][id1] - 1, i_end)
            j_end = min(sim_data['seqlens'][id2] - 1, j_end)
            # allow longer alignments to be found
            start_shift = min(i_start, j_start, 2 * wordlen)
            end_shift = min(len(seqs[id1]) - i_end,
                            len(seqs[id2]) - j_end, 2 * wordlen)
            i_start, j_start = i_start - start_shift, j_start - start_shift
            i_end, j_end = i_end + end_shift, j_end + end_shift
            S_ = S[i_start: i_end]
            T_ = T[j_start: j_end]
            rad = (seg[0][1] - seg[0][0]) / 2
            rad = min(len(S_), len(T_), max(rad, abs(len(S_) - len(T_)) + 2))
            aligner_kw['diag_range'] = (-rad, rad)
            aligner = Aligner(S_, T_, **aligner_kw)
            with aligner:
                aligner.solve()
                alignment = aligner.traceback()
                len_orig = alignment.projected_len(alignment.transcript)
                alignment = alignment.truncate_to_match()
                sim_data['segments'][(id1, id2)][idx]['alignment'] = alignment
                sim_data['segments'][(id1, id2)][idx]['frame'] = \
                    (i_start, i_end), (j_start, j_end)
                if alignment is not None:
                    tx = alignment.transcript
                    proj_len = alignment.projected_len(tx)
                    print round(seg_info['p'], 3), \
                        round(1. * tx.count('M') / proj_len, 3), \
                        proj_len, len_orig
    return sim_data


def plot_comp_aligned_genes(sim_data, suffix='', example_pair=None,
                            naming_style=None):
    seqlens = sim_data['seqlens']
    pws = sim_data['pws']
    K_min = sim_data['similar_segments_kw']['K_min']
    p_min = sim_data['similar_segments_kw']['p_min']

    # dims of plot containing subplots for all pairs of sequences; these plots
    # are there just for posterity
    fig_num = int(np.ceil(np.sqrt(len(pws))))
    fig_seeds = plt.figure(figsize=(6 * fig_num, 5 * fig_num))
    fig_profiles = plt.figure(figsize=(8 * fig_num, 4 * fig_num))

    fig_main = plt.figure(figsize=(13, 7))
    grids = gridspec.GridSpec(2, 6, height_ratios=[3, 1.3])

    ax_seeds_ex = fig_main.add_subplot(grids[0, :2])
    ax_p = fig_main.add_subplot(grids[0, 2:4])
    ax_p_aln = fig_main.add_subplot(grids[0, 4:])
    ax_profile_ex = fig_main.add_subplot(grids[1, :3])
    ax_p_cdf = fig_main.add_subplot(grids[1, 3:])

    labels = []
    p_alns = []
    for idx, ((id1, id2), opseq) in enumerate(pws.items()):
        key = (id1, id2)

        if naming_style == 'ensembl':
            id1 = id1.split('/')[0].replace('_', ' ')
            id2 = id2.split('/')[0].replace('_', ' ')
        elif naming_style == 'ucsc':
            id1 = id1.split('.')[0]
            id2 = id2.split('.')[0]

        labels.append((id1, id2))

        # ============================
        # match probability estimation
        # ============================
        radius = K_min / 2

        ps_true = estimate_match_probs_in_opseq(opseq, radius)
        ps_true_smooth = gaussian_filter1d(ps_true, 10)
        ps_hat_pos, ps_hat = [], []
        for pos, seed in sim_data['opseq_seeds'][key]:
            if ps_true[pos] == 0:
                # flanking zero regions in estimated gap probabilities
                continue
            ps_hat.append(sim_data['seeds'][key][seed])
            ps_hat_pos.append(pos)

        ax_profile = fig_profiles.add_subplot(fig_num, fig_num, idx + 1)
        axes_profile = [ax_profile]
        if key == example_pair:
            axes_profile.append(ax_profile_ex)

        for ax in axes_profile:
            ax.plot(range(len(ps_true)), ps_true_smooth, c='g', alpha=.8, lw=1)
            ax.set_title('similarity profile (%s vs. %s)' % (id1, id2))
            ax.set_xlabel('position along global alignment')
            ax.set_ylabel('match probability')
            ax.scatter(ps_hat_pos, ps_hat, lw=0, c='k', s=7, alpha=.4)

        # ps_true is calculated from the alignment and is against alignment
        # length; whereas our p_hat is estimated againsted the projected
        # length. Correct ps_true to be against projected length too.
        gs_true = estimate_gap_probs_in_opseq(opseq, radius)
        ps_true_proj = [
            (1 / (1 - gs_true[ps_hat_pos][i])) * ps_true[ps_hat_pos[i]]
            for i in range(len(ps_hat))]
        ax_p.scatter(ps_hat, ps_true_proj, lw=0, color='g', s=3, alpha=.5)

        # =============
        # Dot Plots
        # =============
        ax_seeds = fig_seeds.add_subplot(fig_num, fig_num, idx + 1)
        axes_seeds = [ax_seeds]
        if key == example_pair:
            axes_seeds.append(ax_seeds_ex)
        for ax in axes_seeds:
            plot_scored_seeds(ax, sim_data['seeds'][key].items(),
                              threshold=p_min, alpha=.7, zorder=9)
            plot_global_alignment(ax, opseq, lw=5, alpha=.2, color='g')
            adjust_pw_plot(ax, seqlens[key[0]], seqlens[key[1]])
            ax.set_ylabel(id1)
            ax.set_xlabel(id2)

        # ================================
        # match probability estimation vs local alignments
        # ================================
        for seg_info in sim_data['segments'][key]:
            p_hat = seg_info['p']
            alignment = seg_info['alignment']
            if alignment is None:
                p_aln = 0
            else:
                transcript = alignment.transcript
                K_aln = alignment.projected_len(transcript)
                p_aln = 1. * transcript.count('M') / K_aln
                for ax in axes_seeds:
                    plot_local_alignment(ax, transcript,
                                         seg_info['frame'][0][0],
                                         seg_info['frame'][1][0],
                                         lw=1, alpha=.9, color='b', zorder=10)
            p_alns.append(p_aln)
            ax_p_aln.scatter([p_hat], [p_aln], lw=0, color='b', s=10, alpha=.3)

    for ax in [ax_p, ax_p_aln]:
        ax.plot([0, 1], [0, 1], lw=2, ls='--', alpha=.6, c='k')
        ax.set_xlabel('estimated match probability')
        ax.set_ylabel('true match probability')
        ax.set_xlim(-.1, 1.1)
        ax.set_ylim(-.1, 1.1)
        ax.set_aspect('equal')
    ax_p.set_title('homologous seeds')
    ax_p_aln.set_title('similar segments')
    ax_p_aln.plot([0, 1], [p_min, p_min], lw=2, ls='--', alpha=.6, c='k')

    plot_cdf(ax_p_cdf, p_alns, c='b', lw=1, alpha=.9, smooth_radius=.5)
    ax_p_cdf.axvline(x=p_min, ls='--', lw=2, c='k', alpha=.6)
    ax_p_cdf.set_xlabel('local alignment match probability')
    ax_p_cdf.set_ylabel('cumulative distribution')
    ax_p_cdf.set_xlim(0.4, 1)

    for ax in [ax_p_cdf, ax_profile_ex]:
        ticks = [i * .1 for i in range(11)]
        ax.set_yticks(ticks)
        ax.set_yticklabels(ticks, fontsize=6)
    ax_profile_ex.axhline(y=p_min, ls='--', lw=2, c='k', alpha=.6)

    for fig in [fig_main, fig_seeds, fig_profiles]:
        fig.tight_layout()
    savefig(fig_main, 'comp_aligned_genes[estimates]%s.png' % suffix)
    savefig(fig_seeds, 'comp_aligned_genes[seeds]%s.png' % suffix)
    savefig(fig_profiles, 'comp_aligned_genes[profiles]%s.png' % suffix)

    fig, _ = plot_classifier(sim_data['seed_pos'], sim_data['seed_neg'],
                             labels=['homologous', 'non-homologous'],
                             mark_threshold=.8)
    savefig(fig, 'comp_aligned_genes[classifier]%s.png' % suffix)


[docs]def exp_comp_aligned_genes():
    """Performance of Word-Blot and associated statistical scores for pairwise
    local similarity search on *biological data*. Given aligned copies of a
    gene in multiple species we consider the following for each pair:

    * estimated match probability :math:`\hat{p}` at *homologous seeds* (i.e.
      those exactly matching kmers that lie on the known global pairwise
      alignment) compared to local match probability of said global alignment.
    * discrimination power of :math:`\hat{p}` to separate homologous seeds from
      non-homologous seeds.
    * all identified similar segments between the two sequences verified by
      comparison to banded dynamic programming alignment in the identified
      diagonal strip.

    **Supported Claims**

    * Word-Blot accurately estimates match probabilities, length, and
      coordinates of local similarities in biological data.
    * estimated match probabilities :math:`\hat{p}` are effective statistics to
      separate homologous and non-homologous seeds.
    * Dynamic programming alignment confirms that Word-Blot correctly
      identifies local similarities, even those that are inevitably missed by
      global alignments.

    .. figure::
        https://www.dropbox.com/s/8ajed2uo0qobyjw/
        comp_aligned_genes%5Bestimates%5D%5Birx1%5D%5Bp%3D0.80%5D.png?raw=1
       :target:
        https://www.dropbox.com/s/8ajed2uo0qobyjw/
        comp_aligned_genes%5Bestimates%5D%5Birx1%5D%5Bp%3D0.80%5D.png?raw=1
       :alt: lightbox

       Word-Blot performance in identifying local similarities among *Iroquois
       homeobox protein 1 (IRX1)* genes for 10 amniota obtain from Ensembl with
       word length 8, minimum match probability 0.8, and minimum similarity
       length 200nt. An example pairwise comparison (*top left*) shows seeds
       (grey) color coded by intensity according to their estimated match
       probability with those exceeding the threshold shown larger for clarity.
       Known global alignment is shown in green and aligned local similarities
       identified by Word-Blot are shown in blue. Estimated similarity profile
       for the same example pair (*bottom left*) is shown at positions
       exceeding the match probability minimum (black dots) and compared to the
       profile obtained from the known global alignment (green). For all
       pairwise comparisons estimated and true match probability at homologous
       seeds are shown (*top middle*, green), the diagonal shows ground truth
       for comparison (dashed black).  Similarly for all identified similar
       segments, estimated match probability and true probability obtained from
       banded global alignment are shown (*top right*, blue), the diagonal
       showing ground truth (dashed black) and the horizontal line shows the
       cutoff used by Word-Blot (dashed black).

    .. figure::
        https://www.dropbox.com/s/2mc1rlnazodeu6l/
        comp_aligned_genes%5Bclassifier%5D%5Birx1%5D%5Bp%3D0.80%5D.png?raw=1
       :target:
        https://www.dropbox.com/s/2mc1rlnazodeu6l/
        comp_aligned_genes%5Bclassifier%5D%5Birx1%5D%5Bp%3D0.80%5D.png?raw=1
       :alt: lightbox

       Performance of Word-Blot estimated match probability at discriminating
       between homologous and non-homologous seeds in the same context as the
       figure above. The cumulative distribution of estimated match probability
       in each case (*bottom right*), the resulting ROC curve (*left*) and the
       predictive ROC curve (*top right*). The threshold 0.8 is indicated in
       blue in all three plots.


    .. figure::
        https://www.dropbox.com/s/38xgx8gbvziq126/
        comp_aligned_genes%5Bestimates%5D%5Bactb%5D%5Bp%3D0.50%5D.png?raw=1
       :target:
        https://www.dropbox.com/s/38xgx8gbvziq126/
        comp_aligned_genes%5Bestimates%5D%5Bactb%5D%5Bp%3D0.50%5D.png?raw=1
       :alt: lightbox

       Word-Blot performance in identifying local similarities among *Beta
       Actin (ACTB)* genes for 7 vertebrates obtain from UCSC genome browser
       with word length 6, minimum match probability 0.5, and minimum
       similarity length 200nt. All subplots are similar to those of *IRX1*
       experiment shown above.

    .. figure::
        https://www.dropbox.com/s/t5pff0nk54m8swr/
        comp_aligned_genes%5Bclassifier%5D%5Bactb%5D%5Bp%3D0.50%5D.png?raw=1
       :target:
        https://www.dropbox.com/s/t5pff0nk54m8swr/
        comp_aligned_genes%5Bclassifier%5D%5Bactb%5D%5Bp%3D0.50%5D.png?raw=1
       :alt: lightbox

       Performance of Word-Blot estimated match probability at discriminating
       between homologous and non-homologous seeds in the same context as the
       figure above.
    """
    A = Alphabet('ACGT')

    # p_min = .5
    # K_min = 200
    # wordlen = 6
    # style = 'ucsc'
    # suffix = '[actb][p=%.2f]' % p_min
    # dumpfile = 'comp_aligned_genes%s.txt' % suffix
    # seqs_path = 'data/actb/actb-7vet.fa'
    # pws_path = 'data/actb/actb-7vet-pws.fa'
    # example_pair = ('hg38.chr7', 'mm10.chr5')

    p_min = .8
    wordlen = 8
    K_min = 200
    style = 'ensembl'
    suffix = '[irx1][p=%.2f]' % p_min
    dumpfile = 'comp_aligned_genes%s.txt' % suffix
    seqs_path = 'data/irx1/irx1-vert-amniota-indiv.fa'
    pws_path = 'data/irx1/irx1-vert-amniota-pws.fa'
    example_pair = ('homo_sapiens/1-10720', 'bos_taurus/1-10720')

    with open(seqs_path) as f:
        seqs = {name: A.parse(fill_in_unknown(seq.upper(), A))
                for seq, name, _ in load_fasta(f)}
    with open(pws_path) as f:
        pws = {tuple(name.split(':')): seq for seq, name, _ in load_fasta(f)}
        pws = {key: value for key, value in pws.items()}
    sim_data = sim_comp_aligned_genes(
        seqs, pws, wordlen=wordlen, p_min=p_min, K_min=K_min,
        dumpfile=dumpfile, ignore_existing=False)
    plot_comp_aligned_genes(sim_data, suffix=suffix, example_pair=example_pair,
                            naming_style=style)


if __name__ == '__main__':
    exp_simulated_K_p()
    exp_comp_aligned_genes()