Source code for experiments.blot_stats

import numpy as np
from matplotlib import pyplot as plt
from matplotlib import gridspec
from itertools import product
from scipy.ndimage.filters import gaussian_filter1d

import sys
import logging

from biseqt.blot import WordBlot, find_peaks, band_radius
from biseqt.sequence import Alphabet
from biseqt.stochastics import rand_seq, MutationProcess

from util import plot_with_sd, color_code, plot_classifier
from util import with_dumpfile, log, savefig, load_fasta
from util import seq_pair

from util import plot_global_alignment, adjust_pw_plot
from util import estimate_match_probs_in_opseq, fill_in_unknown
from util import plot_scored_seeds, seeds_from_opseq
from util import opseq_path, plot_local_alignment
from util import plot_cdf
from util import estimate_gap_probs_in_opseq
from biseqt.pw import Aligner, BANDED_MODE, B_GLOBAL


@with_dumpfile
def sim_simulated_K_p(Ks, ps, n_samples, **kw):
    shape = (len(Ks), len(ps), n_samples)

    A = Alphabet('ACGT')
    wordlen = kw['wordlen']
    WB_kw = {
        'g_max': kw.get('g_max', .6),
        'sensitivity': kw.get('sensitivity', .99),
        'wordlen': wordlen,
        'alphabet': A,
        'path': kw.get('db_path', ':memory:'),
        'log_level': kw.get('log_level', logging.WARNING),
    }
    sim_data = {
        'scores': {'H0': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
                   'H1': {'pos': np.zeros(shape), 'neg': np.zeros(shape)}},
        'K_hat': {'pos': {'all': np.zeros(shape), 'longest': np.zeros(shape)},
                  'neg': {'all': np.zeros(shape), 'longest': np.zeros(shape)}},
        'p_hat': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
        'a_hat': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
        'd_min': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
        'd_max': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
        'WB_kw': WB_kw,
        'Ks': Ks,
        'ps': ps,
    }
    K_min = 100
    assert K_min <= min(Ks)

    for (K_idx, K), (p_idx, p_match) in product(enumerate(Ks), enumerate(ps)):
        p_min = p_match

        log('simulating (%d samples) K = %d, p = %.2f' %
            (n_samples, K, p_match), newline=False)
        for idx in range(n_samples):
            sys.stderr.write('.')
            # distribute p_match evenly over gap and subst
            subst = gap = 1 - np.sqrt(p_match)
            assert abs((1 - gap) * (1 - subst) - p_match) < 1e-3
            M = MutationProcess(A, subst_probs=subst, ge_prob=gap,
                                go_prob=gap)
            S_rel, T_rel = seq_pair(K, A, mutation_process=M)
            S_rel = rand_seq(A, K / 2) + S_rel + rand_seq(A, K / 2)
            T_rel = rand_seq(A, K / 2) + T_rel + rand_seq(A, K / 2)
            S_urel, T_urel = rand_seq(A, 2 * K), rand_seq(A, 2 * K)

            for key, (S, T) in zip(['pos', 'neg'],
                                   [(S_rel, T_rel), (S_urel, T_urel)]):
                WB = WordBlot(S, T, **WB_kw)

                # calculate H0/H1 scores with perfect information:
                band_r = WB.band_radius(K)
                num_seeds = WB.seed_count(d_band=(-band_r, band_r),
                                          a_band=(K, 3 * K))
                s0, s1 = WB.score_num_seeds(num_seeds=num_seeds,
                                            area=2 * band_r * K,
                                            seglen=K, p_match=p_match)
                sim_data['scores']['H0'][key][K_idx, p_idx, idx] = s0
                sim_data['scores']['H1'][key][K_idx, p_idx, idx] = s1

                results = list(WB.similar_segments(K_min, p_min,
                                                   at_least_one=True))

                # sum of K_hat and longest K_hat
                sim_data['K_hat'][key]['all'][K_idx, p_idx, idx] = sum(
                    r['segment'][1][1] - r['segment'][1][0]
                    for r in results) / 2
                sim_data['K_hat'][key]['longest'][K_idx, p_idx, idx] = max(
                    r['segment'][1][1] - r['segment'][1][0]
                    for r in results) / 2

                # pick the longest detected homology for p_hat, d_hat, a_hat
                hom = max(
                    results,
                    key=lambda r: r['segment'][1][1] - r['segment'][1][0]
                )
                sim_data['p_hat'][key][K_idx, p_idx, idx] = hom['p']

                sim_data['a_hat'][key][K_idx, p_idx, idx] = \
                    (hom['segment'][1][1] + hom['segment'][1][0]) / 2
                sim_data['d_min'][key][K_idx, p_idx, idx] = \
                    hom['segment'][0][0]
                sim_data['d_max'][key][K_idx, p_idx, idx] = \
                    hom['segment'][0][1]

        # H0_neg = sim_data['scores']['H0']['neg'][K_idx, p_idx, :].mean()
        # H1_pos = sim_data['scores']['H1']['pos'][K_idx, p_idx, :].mean()
        # sys.stderr.write('H0: %.2f, H1: %.2f' % (H0_neg, H1_pos))
        sys.stderr.write('\n')
    return sim_data


def plot_simulated_K_p(sim_data, select_Ks, select_ps, suffix=''):
    Ks, ps = sim_data['Ks'], sim_data['ps']
    scores = sim_data['scores']

    kw = {'marker': 'o', 'markersize': 3, 'alpha': .7, 'lw': 1}
    truth_kw = {'ls': '--', 'alpha': .3, 'color': 'k', 'lw': 3}

    # ======================================
    # score by varying K for select ps
    # score varying p for select Ks
    fig_scores = plt.figure(figsize=(8, 7))
    ax_H0_K = fig_scores.add_subplot(2, 2, 1)
    ax_H0_p = fig_scores.add_subplot(2, 2, 2)
    ax_H1_K = fig_scores.add_subplot(2, 2, 3)
    ax_H1_p = fig_scores.add_subplot(2, 2, 4)

    for mode, ax in zip(['H0', 'H1'], [ax_H0_K, ax_H1_K]):
        colors = color_code(select_ps)
        for p, color in zip(select_ps, colors):
            p_idx = ps.index(p)
            for case, ls in zip(['pos', 'neg'], ['-', '--']):
                label = '' if case == 'neg' else 'p = %.2f' % p
                plot_with_sd(ax, Ks, scores[mode][case][:, p_idx, :], axis=1,
                             color=color, ls=ls, label=label, **kw)
        ax.set_ylabel('%s score' % mode)
        ax.set_xlabel('similarity length')
        ax.set_xscale('log')
        ax.set_xticks(Ks)
        ax.set_xticklabels(Ks)
        ax.legend(loc='best')

    for mode, ax in zip(['H0', 'H1'], [ax_H0_p, ax_H1_p]):
        colors = color_code(select_Ks)
        for K, color in zip(select_Ks, colors):
            K_idx = Ks.index(K)
            for case, ls in zip(['pos', 'neg'], ['-', '--']):
                label = '' if case == 'neg' else 'K = %d' % K
                plot_with_sd(ax, ps, scores[mode][case][K_idx, :, :], axis=1,
                             color=color, ls=ls, label=label, **kw)
        ax.set_ylabel('%s score' % mode)
        ax.set_xlabel('similarity match probability')
        ax.set_xticks(ps)
        ax.set_xticklabels(ps)
        ax.legend(loc='best')
    savefig(fig_scores, 'simulations[scores]%s.png' % suffix)

    # =====================================
    fig_hats = plt.figure(figsize=(8, 8))
    ax_K = fig_hats.add_subplot(2, 2, 1)
    ax_p = fig_hats.add_subplot(2, 2, 3)
    ax_d = fig_hats.add_subplot(2, 2, 2)
    ax_a = fig_hats.add_subplot(2, 2, 4)

    # wordblot p_hats are with respect to projected alignment lengths whereas
    # p_true which is a property of MutationProcess is with respect to
    # alignment length; correction via p_hats *= (K/L) = (1 - g/2)

    p_hats = sim_data['p_hat']
    p_hats['pos'] *= (1 + p_hats['pos']) / 2
    p_hats['neg'] *= (1 + p_hats['neg']) / 2

    # estimated Ks for select ps
    K_hats = [sim_data['K_hat']['pos']['all'],
              sim_data['K_hat']['pos']['longest'],
              sim_data['K_hat']['neg']['longest']]
    lss = ['-', '--', ':']
    markers = ['o', 'o', 'x']
    show_sds = [True, False, False]

    colors = color_code(select_ps)
    for p, color in zip(select_ps, colors):
        p_idx = ps.index(p)

        kw_ = {'markersize': 3, 'alpha': .7, 'lw': 1, 'color': color,
               'label': 'p = %.2f' % p}
        labeled = False
        for K_hat, ls, marker, sd in zip(K_hats, lss, markers, show_sds):
            kw_['ls'], kw_['marker'] = ls, marker
            if sd:
                plot_with_sd(ax_K, Ks, K_hat[:, p_idx, :], axis=1, **kw_)
            else:
                ax_K.plot(Ks, K_hat[:, p_idx, :].mean(axis=1), **kw_)
            if not labeled:
                kw_.pop('label')
                labeled = True

        ax_K.set_xscale('log')
        ax_K.set_yscale('symlog')
        ax_K.set_xticks(Ks)
        ax_K.set_xticklabels(Ks, rotation=90)
    ax_K.set_ylabel('estimated similarity length')
    ax_K.set_xlabel('true similarity length')
    ax_K.plot(Ks, Ks, **truth_kw)
    ax_K.legend(loc='upper left')

    # estimated ps for select Ks
    colors = color_code(select_Ks)
    kw_ = kw.copy()
    kw_['ls'] = '--'
    for K, color in zip(select_Ks, colors):
        K_idx = Ks.index(K)
        p_hats = sim_data['p_hat']['pos']
        plot_with_sd(ax_p, ps, p_hats[K_idx, :, :], axis=1,
                     color=color, label='K = %d' % K, **kw)

        p_hats = sim_data['p_hat']['neg']
        ax_p.plot(ps, p_hats[K_idx, :, :].mean(axis=1), color=color, **kw_)

        ax_p.set_xticks(ps)
        ax_p.set_xticklabels(ps, rotation=90)
    ax_p.set_ylabel('estimated match probability')
    ax_p.set_xlabel('true match probability')
    ax_p.plot(ps, ps, **truth_kw)
    ax_p.legend(loc='upper left')

    # ======================================
    # estimated diagonal and antidiagonal position and band radius for select
    # match probabilities (select_ps), as a function of K
    d_mins = sim_data['d_min']['pos']
    d_maxs = sim_data['d_max']['pos']
    a_hats = sim_data['a_hat']['pos']

    colors = color_code(select_ps)
    for p, color in zip(select_ps, colors):
        label = 'p = %.2f' % p
        kw = {'color': color, 'alpha': .6, 'lw': 1,
              'marker': 'o', 'markersize': 3}
        p_idx = ps.index(p)
        d_ctrs = (d_mins[:, p_idx, :] + d_maxs[:, p_idx, :]) / 2
        plot_with_sd(ax_d, Ks, d_ctrs, axis=1, label=label, **kw)
        plot_with_sd(ax_a, Ks, a_hats[:, p_idx, :], axis=1, label=label, **kw)
        for ax in [ax_d, ax_a]:
            ax.set_xscale('log')
            ax.set_xticks(Ks)
            ax.set_xticklabels(Ks, rotation=90, fontsize=6)
            ax.set_xlabel('similarity length')
        ax_a.set_yscale('log')

    max_d = np.sqrt(max(Ks))
    ax_d.set_ylim(-max_d, max_d)
    ax_d.set_ylabel('estimated diagonal position')
    ax_d.legend(loc='upper left', fontsize=8)

    ax_a.plot(Ks, [2 * K for K in Ks], **truth_kw)
    ax_a.set_ylabel('estimated antidiagonal position')
    ax_a.legend(loc='upper left', fontsize=8)

    savefig(fig_hats, 'simulations[estimates]%s.png' % suffix)


[docs]def exp_simulated_K_p(): """Performance of Word-Blot and associated statistical scores for pairwise local similarity search in *simulations*: * z-scores assigned to diagonal strips with and without similarities under the corresponding limiting Normal distributions (:math:`H0, H1`), for varying similarity lengths and match probabilities. * Estimated coordinates, lengths and match probabilities of local similarities for varying similarity lengths and match probabilities. In each trial with similarity length :math:`K` and match probability :math:`p`, two pairs of input sequences are provided to Word-Blot: a pair of sequences of length :math:`2K` whose substrings at positions :math:`[\\frac{K}{2}, \\frac{3K}{2}]` are homologies of length :math:`K` and match probability :math:`p`, and two unrelated sequences of length :math:`2K` are considered. **Supported Claims** * z-scores calculated by :func:`biseqt.blot.WordBlot.score_num_seeds` against the limiting normal distributions :math:`H_0, H_1` (unrelated and related) are stable and comparable across different similarity lengths and match probabilities; thus both scores are reliable statistics. * Local similarities found by Word-Blot as per :func:`biseqt.blot.WordBlot.similar_segments` are accurate in length, estimated match probability, and coordinates. .. figure:: https://www.dropbox.com/s/s38hi2oo9pp78ul/ simulations%5Bscores%5D.png?raw=1 :target: https://www.dropbox.com/s/s38hi2oo9pp78ul/ simulations%5Bscores%5D.png?raw=1 :alt: lightbox Z-scores of the number of seeds in diagonal strip for related (solid lines) and unrelated (dashed lines) segments of varying lengths, calculated against the limiting distribution for unrelated pairs of sequences (*left*) and related pairs (*right*) as a function of similarity length (*top*) and match probability (*bottom*), n=50 samples, shaded regions indicate one standard deviation. Note that, as desired, H0 score is stable for unrelated sequences of any length and H1 score is stable for related sequences of any length or match probability. .. figure:: https://www.dropbox.com/s/lqm4s4evmbkk4as/ simulations%5Bestimates%5D.png?raw=1 :target: https://www.dropbox.com/s/lqm4s4evmbkk4as/ simulations%5Bestimates%5D.png?raw=1 :alt: lightbox Estimated length (*top left*), match probability (*bottom left*), diagonal position (*top right*) and antidiagonal position (*bottom right*) of local similarities, word length is 7, n=20 samples, shaded regions indicate one standard deviation. Dashed black lines are ground truth for comparison. For length estimates both the sum of the lengths of all reported similar segments (solid) and the length of the longest similar segment (dashed) are shown. Estimated match probablity and Length estimates for unrelated pairs are also shown (dotted). Estimated match probability, diagonal and antidiagonal coordinates correspond to the longest local similarity. """ Ks = [100 * 2 ** i for i in range(1, 7)] select_Ks = Ks[1], Ks[3], Ks[5] ps = [round(1 - .06 * i, 2) for i in range(1, 8)] select_ps = ps[0], ps[2], ps[5] n_samples = 20 wordlen = 7 suffix = '' dumpfile = 'simulations%s.txt' % suffix sim_data = sim_simulated_K_p( Ks, ps, n_samples, wordlen=wordlen, dumpfile=dumpfile, ignore_existing=False) plot_simulated_K_p(sim_data, select_Ks, select_ps, suffix=suffix)
@with_dumpfile def sim_comp_aligned_genes(seqs, pws, **kw): A = Alphabet('ACGT') wordlen, K_min, p_min = kw['wordlen'], kw['K_min'], kw['p_min'] WB_kw = { 'g_max': kw.get('g_max', .3), 'sensitivity': kw.get('sensitivity', .99), 'wordlen': wordlen, 'alphabet': A, 'path': kw.get('db_path', ':memory:'), 'log_level': kw.get('log_level', logging.WARNING), } qM = 1 / p_min - 1 qS = qG = -1 aligner_kw = { 'match_score': qM, 'mismatch_score': qS, 'ge_score': qG, 'go_score': 0, 'alnmode': BANDED_MODE, 'alntype': B_GLOBAL, } similar_segments_kw = {'K_min': K_min, 'p_min': p_min} sim_data = { 'pws': pws, 'seqlens': {name: len(seqs[name]) for name in seqs}, 'seed_pos': [], 'seed_neg': [], 'seeds': {key: {} for key in pws}, 'opseq_seeds': {key: [] for key in pws}, 'segments': {key: {} for key in pws}, 'WB_kw': WB_kw, 'similar_segments_kw': similar_segments_kw, 'aligner_kw': aligner_kw, } for idx, ((id1, id2), opseq) in enumerate(pws.items()): log('finding local homologies between %s (%d) and %s (%d)' % (id1, sim_data['seqlens'][id1], id2, sim_data['seqlens'][id2])) S = seqs[id1] T = seqs[id2] WB = WordBlot(S, T, **WB_kw) sim_data['seeds'][(id1, id2)] = { WB.to_ij_coordinates(*rec['seed']): rec['p'] for rec in WB.score_seeds(K_min) } log('-> found %d exactly matching %d-mers' % (len(sim_data['seeds'][(id1, id2)]), wordlen)) # separate +/- seeds with their estimated probabilities band_r = band_radius(K_min, sim_data['WB_kw']['g_max'], sim_data['WB_kw']['sensitivity']) in_band = np.zeros((len(S), len(T))) opseq_seeds = list(seeds_from_opseq(opseq, wordlen)) sim_data['opseq_seeds'][(id1, id2)] = opseq_seeds for _, seed in opseq_seeds: d, a = WB.to_diagonal_coordinates(*seed) for d_ in range(d - band_r, d + band_r): for a_ in range(a - band_r, a + band_r): i_, j_ = WB.to_ij_coordinates(d_, a_) if 0 <= i_ < len(S) and 0 <= j_ < len(T): in_band[i_, j_] = 1 for seed, p in sim_data['seeds'][(id1, id2)].items(): if in_band[seed[0], seed[1]]: sim_data['seed_pos'].append(p) else: sim_data['seed_neg'].append(p) log('separated +/- seeds') sim_data['segments'][(id1, id2)] = list( WB.similar_segments(**similar_segments_kw) ) log('found %d similar segments' % len(sim_data['segments'][(id1, id2)])) # separate +/- coords probs = estimate_match_probs_in_opseq(opseq, wordlen) xs, ys = opseq_path(opseq, x0=0, y0=0) segments_on_opseq = find_peaks(probs, 5, p_min) hom_coords = np.zeros((len(S) + 1, len(T) + 1)) for start, end in segments_on_opseq: for aln_pos in range(start, end + 1): i, j = xs[aln_pos], ys[aln_pos] hom_coords[i, j] = 1 # run DP alignment on found segments for idx, seg_info in enumerate(sim_data['segments'][(id1, id2)]): seg = seg_info['segment'] (i_start, i_end), (j_start, j_end) = WB.to_ij_coordinates_seg(seg) # to_ij_coordinates_seg might overflow; fix it i_end = min(sim_data['seqlens'][id1] - 1, i_end) j_end = min(sim_data['seqlens'][id2] - 1, j_end) # allow longer alignments to be found start_shift = min(i_start, j_start, 2 * wordlen) end_shift = min(len(seqs[id1]) - i_end, len(seqs[id2]) - j_end, 2 * wordlen) i_start, j_start = i_start - start_shift, j_start - start_shift i_end, j_end = i_end + end_shift, j_end + end_shift S_ = S[i_start: i_end] T_ = T[j_start: j_end] rad = (seg[0][1] - seg[0][0]) / 2 rad = min(len(S_), len(T_), max(rad, abs(len(S_) - len(T_)) + 2)) aligner_kw['diag_range'] = (-rad, rad) aligner = Aligner(S_, T_, **aligner_kw) with aligner: aligner.solve() alignment = aligner.traceback() len_orig = alignment.projected_len(alignment.transcript) alignment = alignment.truncate_to_match() sim_data['segments'][(id1, id2)][idx]['alignment'] = alignment sim_data['segments'][(id1, id2)][idx]['frame'] = \ (i_start, i_end), (j_start, j_end) if alignment is not None: tx = alignment.transcript proj_len = alignment.projected_len(tx) print round(seg_info['p'], 3), \ round(1. * tx.count('M') / proj_len, 3), \ proj_len, len_orig return sim_data def plot_comp_aligned_genes(sim_data, suffix='', example_pair=None, naming_style=None): seqlens = sim_data['seqlens'] pws = sim_data['pws'] K_min = sim_data['similar_segments_kw']['K_min'] p_min = sim_data['similar_segments_kw']['p_min'] # dims of plot containing subplots for all pairs of sequences; these plots # are there just for posterity fig_num = int(np.ceil(np.sqrt(len(pws)))) fig_seeds = plt.figure(figsize=(6 * fig_num, 5 * fig_num)) fig_profiles = plt.figure(figsize=(8 * fig_num, 4 * fig_num)) fig_main = plt.figure(figsize=(13, 7)) grids = gridspec.GridSpec(2, 6, height_ratios=[3, 1.3]) ax_seeds_ex = fig_main.add_subplot(grids[0, :2]) ax_p = fig_main.add_subplot(grids[0, 2:4]) ax_p_aln = fig_main.add_subplot(grids[0, 4:]) ax_profile_ex = fig_main.add_subplot(grids[1, :3]) ax_p_cdf = fig_main.add_subplot(grids[1, 3:]) labels = [] p_alns = [] for idx, ((id1, id2), opseq) in enumerate(pws.items()): key = (id1, id2) if naming_style == 'ensembl': id1 = id1.split('/')[0].replace('_', ' ') id2 = id2.split('/')[0].replace('_', ' ') elif naming_style == 'ucsc': id1 = id1.split('.')[0] id2 = id2.split('.')[0] labels.append((id1, id2)) # ============================ # match probability estimation # ============================ radius = K_min / 2 ps_true = estimate_match_probs_in_opseq(opseq, radius) ps_true_smooth = gaussian_filter1d(ps_true, 10) ps_hat_pos, ps_hat = [], [] for pos, seed in sim_data['opseq_seeds'][key]: if ps_true[pos] == 0: # flanking zero regions in estimated gap probabilities continue ps_hat.append(sim_data['seeds'][key][seed]) ps_hat_pos.append(pos) ax_profile = fig_profiles.add_subplot(fig_num, fig_num, idx + 1) axes_profile = [ax_profile] if key == example_pair: axes_profile.append(ax_profile_ex) for ax in axes_profile: ax.plot(range(len(ps_true)), ps_true_smooth, c='g', alpha=.8, lw=1) ax.set_title('similarity profile (%s vs. %s)' % (id1, id2)) ax.set_xlabel('position along global alignment') ax.set_ylabel('match probability') ax.scatter(ps_hat_pos, ps_hat, lw=0, c='k', s=7, alpha=.4) # ps_true is calculated from the alignment and is against alignment # length; whereas our p_hat is estimated againsted the projected # length. Correct ps_true to be against projected length too. gs_true = estimate_gap_probs_in_opseq(opseq, radius) ps_true_proj = [ (1 / (1 - gs_true[ps_hat_pos][i])) * ps_true[ps_hat_pos[i]] for i in range(len(ps_hat))] ax_p.scatter(ps_hat, ps_true_proj, lw=0, color='g', s=3, alpha=.5) # ============= # Dot Plots # ============= ax_seeds = fig_seeds.add_subplot(fig_num, fig_num, idx + 1) axes_seeds = [ax_seeds] if key == example_pair: axes_seeds.append(ax_seeds_ex) for ax in axes_seeds: plot_scored_seeds(ax, sim_data['seeds'][key].items(), threshold=p_min, alpha=.7, zorder=9) plot_global_alignment(ax, opseq, lw=5, alpha=.2, color='g') adjust_pw_plot(ax, seqlens[key[0]], seqlens[key[1]]) ax.set_ylabel(id1) ax.set_xlabel(id2) # ================================ # match probability estimation vs local alignments # ================================ for seg_info in sim_data['segments'][key]: p_hat = seg_info['p'] alignment = seg_info['alignment'] if alignment is None: p_aln = 0 else: transcript = alignment.transcript K_aln = alignment.projected_len(transcript) p_aln = 1. * transcript.count('M') / K_aln for ax in axes_seeds: plot_local_alignment(ax, transcript, seg_info['frame'][0][0], seg_info['frame'][1][0], lw=1, alpha=.9, color='b', zorder=10) p_alns.append(p_aln) ax_p_aln.scatter([p_hat], [p_aln], lw=0, color='b', s=10, alpha=.3) for ax in [ax_p, ax_p_aln]: ax.plot([0, 1], [0, 1], lw=2, ls='--', alpha=.6, c='k') ax.set_xlabel('estimated match probability') ax.set_ylabel('true match probability') ax.set_xlim(-.1, 1.1) ax.set_ylim(-.1, 1.1) ax.set_aspect('equal') ax_p.set_title('homologous seeds') ax_p_aln.set_title('similar segments') ax_p_aln.plot([0, 1], [p_min, p_min], lw=2, ls='--', alpha=.6, c='k') plot_cdf(ax_p_cdf, p_alns, c='b', lw=1, alpha=.9, smooth_radius=.5) ax_p_cdf.axvline(x=p_min, ls='--', lw=2, c='k', alpha=.6) ax_p_cdf.set_xlabel('local alignment match probability') ax_p_cdf.set_ylabel('cumulative distribution') ax_p_cdf.set_xlim(0.4, 1) for ax in [ax_p_cdf, ax_profile_ex]: ticks = [i * .1 for i in range(11)] ax.set_yticks(ticks) ax.set_yticklabels(ticks, fontsize=6) ax_profile_ex.axhline(y=p_min, ls='--', lw=2, c='k', alpha=.6) for fig in [fig_main, fig_seeds, fig_profiles]: fig.tight_layout() savefig(fig_main, 'comp_aligned_genes[estimates]%s.png' % suffix) savefig(fig_seeds, 'comp_aligned_genes[seeds]%s.png' % suffix) savefig(fig_profiles, 'comp_aligned_genes[profiles]%s.png' % suffix) fig, _ = plot_classifier(sim_data['seed_pos'], sim_data['seed_neg'], labels=['homologous', 'non-homologous'], mark_threshold=.8) savefig(fig, 'comp_aligned_genes[classifier]%s.png' % suffix)
[docs]def exp_comp_aligned_genes(): """Performance of Word-Blot and associated statistical scores for pairwise local similarity search on *biological data*. Given aligned copies of a gene in multiple species we consider the following for each pair: * estimated match probability :math:`\hat{p}` at *homologous seeds* (i.e. those exactly matching kmers that lie on the known global pairwise alignment) compared to local match probability of said global alignment. * discrimination power of :math:`\hat{p}` to separate homologous seeds from non-homologous seeds. * all identified similar segments between the two sequences verified by comparison to banded dynamic programming alignment in the identified diagonal strip. **Supported Claims** * Word-Blot accurately estimates match probabilities, length, and coordinates of local similarities in biological data. * estimated match probabilities :math:`\hat{p}` are effective statistics to separate homologous and non-homologous seeds. * Dynamic programming alignment confirms that Word-Blot correctly identifies local similarities, even those that are inevitably missed by global alignments. .. figure:: https://www.dropbox.com/s/8ajed2uo0qobyjw/ comp_aligned_genes%5Bestimates%5D%5Birx1%5D%5Bp%3D0.80%5D.png?raw=1 :target: https://www.dropbox.com/s/8ajed2uo0qobyjw/ comp_aligned_genes%5Bestimates%5D%5Birx1%5D%5Bp%3D0.80%5D.png?raw=1 :alt: lightbox Word-Blot performance in identifying local similarities among *Iroquois homeobox protein 1 (IRX1)* genes for 10 amniota obtain from Ensembl with word length 8, minimum match probability 0.8, and minimum similarity length 200nt. An example pairwise comparison (*top left*) shows seeds (grey) color coded by intensity according to their estimated match probability with those exceeding the threshold shown larger for clarity. Known global alignment is shown in green and aligned local similarities identified by Word-Blot are shown in blue. Estimated similarity profile for the same example pair (*bottom left*) is shown at positions exceeding the match probability minimum (black dots) and compared to the profile obtained from the known global alignment (green). For all pairwise comparisons estimated and true match probability at homologous seeds are shown (*top middle*, green), the diagonal shows ground truth for comparison (dashed black). Similarly for all identified similar segments, estimated match probability and true probability obtained from banded global alignment are shown (*top right*, blue), the diagonal showing ground truth (dashed black) and the horizontal line shows the cutoff used by Word-Blot (dashed black). .. figure:: https://www.dropbox.com/s/2mc1rlnazodeu6l/ comp_aligned_genes%5Bclassifier%5D%5Birx1%5D%5Bp%3D0.80%5D.png?raw=1 :target: https://www.dropbox.com/s/2mc1rlnazodeu6l/ comp_aligned_genes%5Bclassifier%5D%5Birx1%5D%5Bp%3D0.80%5D.png?raw=1 :alt: lightbox Performance of Word-Blot estimated match probability at discriminating between homologous and non-homologous seeds in the same context as the figure above. The cumulative distribution of estimated match probability in each case (*bottom right*), the resulting ROC curve (*left*) and the predictive ROC curve (*top right*). The threshold 0.8 is indicated in blue in all three plots. .. figure:: https://www.dropbox.com/s/38xgx8gbvziq126/ comp_aligned_genes%5Bestimates%5D%5Bactb%5D%5Bp%3D0.50%5D.png?raw=1 :target: https://www.dropbox.com/s/38xgx8gbvziq126/ comp_aligned_genes%5Bestimates%5D%5Bactb%5D%5Bp%3D0.50%5D.png?raw=1 :alt: lightbox Word-Blot performance in identifying local similarities among *Beta Actin (ACTB)* genes for 7 vertebrates obtain from UCSC genome browser with word length 6, minimum match probability 0.5, and minimum similarity length 200nt. All subplots are similar to those of *IRX1* experiment shown above. .. figure:: https://www.dropbox.com/s/t5pff0nk54m8swr/ comp_aligned_genes%5Bclassifier%5D%5Bactb%5D%5Bp%3D0.50%5D.png?raw=1 :target: https://www.dropbox.com/s/t5pff0nk54m8swr/ comp_aligned_genes%5Bclassifier%5D%5Bactb%5D%5Bp%3D0.50%5D.png?raw=1 :alt: lightbox Performance of Word-Blot estimated match probability at discriminating between homologous and non-homologous seeds in the same context as the figure above. """ A = Alphabet('ACGT') # p_min = .5 # K_min = 200 # wordlen = 6 # style = 'ucsc' # suffix = '[actb][p=%.2f]' % p_min # dumpfile = 'comp_aligned_genes%s.txt' % suffix # seqs_path = 'data/actb/actb-7vet.fa' # pws_path = 'data/actb/actb-7vet-pws.fa' # example_pair = ('hg38.chr7', 'mm10.chr5') p_min = .8 wordlen = 8 K_min = 200 style = 'ensembl' suffix = '[irx1][p=%.2f]' % p_min dumpfile = 'comp_aligned_genes%s.txt' % suffix seqs_path = 'data/irx1/irx1-vert-amniota-indiv.fa' pws_path = 'data/irx1/irx1-vert-amniota-pws.fa' example_pair = ('homo_sapiens/1-10720', 'bos_taurus/1-10720') with open(seqs_path) as f: seqs = {name: A.parse(fill_in_unknown(seq.upper(), A)) for seq, name, _ in load_fasta(f)} with open(pws_path) as f: pws = {tuple(name.split(':')): seq for seq, name, _ in load_fasta(f)} pws = {key: value for key, value in pws.items()} sim_data = sim_comp_aligned_genes( seqs, pws, wordlen=wordlen, p_min=p_min, K_min=K_min, dumpfile=dumpfile, ignore_existing=False) plot_comp_aligned_genes(sim_data, suffix=suffix, example_pair=example_pair, naming_style=style)
if __name__ == '__main__': exp_simulated_K_p() exp_comp_aligned_genes()