Source code for experiments.multiple_sequence

#!/usr/bin/env python
import numpy as np
import sys
import logging
from itertools import product
import matplotlib
from matplotlib import pyplot as plt
from matplotlib import gridspec
from mpl_toolkits.mplot3d import Axes3D
from Bio import AlignIO
from time import time

from biseqt.sequence import Alphabet
from biseqt.stochastics import rand_seq, MutationProcess
from biseqt.blot import WordBlotMultiple
from biseqt.blot import WordBlotMultipleFast
from util import log, get_seqs_from_mse, with_dumpfile, fill_in_unknown
from util import color_code, savefig, plot_roc, plot_with_sd


# seeds are assumed to be in standard coordinates
def plot_scored_seeds_3d(fig, ax, scored_seeds, threshold=.5):
    idx_S, idx_T1, idx_T2, cs, ss = [], [], [], [], []
    cmap = plt.cm.get_cmap('Greys')
    scores = [score for _, score in scored_seeds]
    max_score = max(scores)
    for (i, j, k), score in scored_seeds:
        idx_S.append(i)
        idx_T1.append(j)
        idx_T2.append(k)
        cs.append(cmap(score/max_score)[:3])
        ss.append(10 if score > threshold else 1)

    ax.scatter(idx_S, idx_T1, idx_T2, facecolor=cs, lw=0, s=ss, alpha=.3)
    ax.set_aspect('equal')
    ax.elev = 10
    ax.azim = 240
    norm = matplotlib.colors.Normalize(vmin=0, vmax=max_score)
    m = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    m.set_array(scores)
    fig.colorbar(m, shrink=.7)


# segment is a list of 3 tuples (min, max of each coordinate)
# NOTE util.plot_similar_segment for pw takes segments in diagonal coordinates!
def plot_similar_segment_3d(ax, segment, **kw):
    assert len(segment) == 3
    assert all(len(range) == 2 for range in segment)
    ax.plot(segment[0], segment[1], segment[2], **kw)


# ========================================================
# Biological data: compare aligned genomes
# ========================================================
def seeds_from_maf(mse_path, wordlen, fmt='maf', ids=None):
    alignments = list(AlignIO.parse(mse_path, fmt))
    # NOTE trust the first alignment to have all the ids
    all_ids = set()
    for alignment in alignments:
        all_ids = all_ids.union(set(rec.id for rec in alignment))
    if ids is None:
        ids = all_ids
    else:
        assert set(ids).issubset(all_ids)
    assert len(ids)
    seqs = {id_: '' for id_ in ids}
    for alignment in alignments:
        updated = {id_: False for id_ in ids}
        line_len = len(alignment[0])
        for rec in alignment:
            if rec.id not in ids:
                continue
            seqs[rec.id] += ''.join(rec.upper())
            updated[rec.id] = True
        for id_ in seqs:
            if not updated[id_]:
                seqs[id_] += '-' * line_len

    seq_lens = set(len(seq) for seq in seqs.values())
    assert len(seq_lens) == 1, \
        'all aligned sequences must have the same length'
    seqs = np.array([list(seqs[id_]) for id_ in ids])
    num_seqs, seq_len = seqs.shape
    pos = np.zeros(num_seqs)
    for idx in range(seq_len - wordlen):
        for i in range(num_seqs):
            if seqs[i, idx] != '-':
                pos[i] += 1
        if all(len(set(seqs[i, idx + j] for i in range(num_seqs))) == 1
               for j in range(wordlen)):
            yield tuple(pos)


[docs]def exp_biological_multiple_sequences():
    """Multiple sequence similarities with Word-Blot on *biological data*.

    **Supported Claimes**

    * Word-Blot estimated match probabilities are good classifiers for
      homologous and non-homologous seeds in multiple sequences.

    .. figure::
        https://www.dropbox.com/s/qhgl9vtomhfzomx/
        multiple-sequence%5Bactb%5D.png?raw=1
       :target:
        https://www.dropbox.com/s/qhgl9vtomhfzomx/
        multiple-sequence%5Bactb%5D.png?raw=1
       :alt: lightbox

       Multiple sequence comparison on *Beta Actin (ACTB)* gene on the 7
       vertebrate USCS dataset (excluding *monodelphis domestica* in which the
       gene is split between chromosomes 1 and 6) with word length 6, minimum
       similarity length 100nt, and minimum match probability 0.75. For an
       arbitrary group of 3 sequences a dot plot similar to those of pairwise
       comparisons is shown (*left*). Seeds are color coded by intensity to
       represent their estimated match probability, blue ligns show local
       similarities identified by Word-Blot. The performance of estimated match
       probabilities in the full data set to discriminate between homologous
       and non-homologous seeds is shown by an ROC curve (*right*).

    .. figure::
        https://www.dropbox.com/s/96xf8p42gbazjyw/
        multiple-sequence%5Birx1%5D.png?raw=1
       :target:
        https://www.dropbox.com/s/96xf8p42gbazjyw/
        multiple-sequence%5Birx1%5D.png?raw=1
       :alt: lightbox

       Same as figure above but with the *IRX1* data set from Ensembl
       (excluding *rattus norvegicus* whose gene copy is much shorter than the
       rest) with word length 10.
    """
    # mse_path = 'data/actb/actb-7vet.maf'
    # gene = 'ACTB'
    # mse_fmt = 'maf'
    # mse_names = 'ucsc'
    # suffix = '[%s]' % gene.lower()
    # wordlen = 6
    # ids_to_exclude = ['monDom5.chr1', 'monDom5.chr6']  # split gene
    # scatter_ids = ['hg38.chr7', 'panTro4.chr7', 'canFam3.chr6']

    mse_path = 'data/irx1/irx1-vert-amniota.aln'
    gene = 'IRX1'
    mse_fmt = 'clustal'
    mse_names = 'ensembl'
    wordlen = 10
    ids_to_exclude = ['rattus_norvegicus/1-10720']  # much shorter than rest
    scatter_ids = ['homo_sapiens/1-10720', 'canis_familiaris/1-10720',
                   'bos_taurus/1-10720']

    p_min = .75
    K_min = 100

    A = Alphabet('ACGT')
    WB_kw = {'g_max': .4, 'sensitivity': .9, 'alphabet': A,
             'wordlen': wordlen, 'path': ':memory:'}

    # 3 sequences for scatter plot
    assert len(scatter_ids) == 3
    seqs = {id_: A.parse(fill_in_unknown(seq.upper(), A))
            for id_, seq in get_seqs_from_mse(mse_path, fmt=mse_fmt)
            if id_ not in ids_to_exclude}
    ids = seqs.keys()
    WB = WordBlotMultipleFast(*[seq for id_, seq in seqs.items()
                                if id_ in scatter_ids], **WB_kw)
    scored_seeds = [(WB.to_ij_coordinates(*rec['seed']), rec['p'])
                    for rec in WB.score_seeds(K_min)]
    log('found %d seeds for 3 sequences' % len(scored_seeds))

    fig = plt.figure(figsize=(8, 5))
    gs = gridspec.GridSpec(1, 2, width_ratios=[2, 1.3])
    ax_scatter = plt.subplot(gs[0], projection=Axes3D.name)
    plot_scored_seeds_3d(fig, ax_scatter, scored_seeds, threshold=p_min)
    for axis in 'xyz':
        ax_scatter.tick_params(axis=axis, labelsize=4)

    if mse_names == 'ucsc':
        ax_scatter.set_xlabel(scatter_ids[0].split('.')[0])
        ax_scatter.set_ylabel(scatter_ids[1].split('.')[0])
        ax_scatter.set_zlabel(scatter_ids[2].split('.')[0])
    if mse_names == 'ensembl':
        ax_scatter.set_xlabel(scatter_ids[0].split('/')[0].replace('_', ' '))
        ax_scatter.set_ylabel(scatter_ids[1].split('/')[0].replace('_', ' '))
        ax_scatter.set_zlabel(scatter_ids[2].split('/')[0].replace('_', ' '))
    ax_scatter.set_title('Estimated similarity at %d ' % len(scored_seeds) +
                         'seeds in 3 copiees of %s' % gene, fontsize=8)

    for res in WB.similar_segments(K_min=K_min, p_min=p_min, score=False):
        std_ranges = WB.to_ij_coordinates_seg(res['segment'])
        plot_similar_segment_3d(ax_scatter, std_ranges, c='b', lw=1, zorder=10)

    # ============================
    # hom/non-hom seed classifier
    # ============================
    real_seeds = list(seeds_from_maf(mse_path, wordlen, fmt=mse_fmt, ids=ids))
    real_seeds = list(tuple(int(x) for x in seed) for seed in real_seeds)
    log('found %d homologous seeds for %d sequences' %
        (len(real_seeds), len(ids)))

    WB = WordBlotMultipleFast(*[seq for id_, seq in seqs.items()
                                if id_ in ids], **WB_kw)
    scored_seeds = [(WB.to_ij_coordinates(*rec['seed']), rec['p'])
                    for rec in WB.score_seeds(K_min)]
    log('found %d seeds for %d sequences' % (len(scored_seeds), len(ids)))
    pos, neg = [], []
    for coords, p_hat in scored_seeds:
        if coords in real_seeds:
            pos.append(p_hat)
        else:
            neg.append(p_hat)

    ax_roc = plt.subplot(gs[1])
    plot_roc(ax_roc, pos, neg, color='k', lw=1)
    avg_len = sum(len(seq) for seq in seqs.values()) / len(seqs)
    title = 'Classifing %d(+) and %d(-) seeds\n' % (len(pos), len(neg))
    title += 'in %d copies of %s (avg. len. %dnt)' % (len(ids), gene, avg_len)
    ax_roc.set_title(title, fontsize=8)

    fig.tight_layout()
    savefig(fig, 'multiple-sequence[%s].png' % gene.lower())


# ========================================================
# Simulations (K, p, r, ds, a)
# ========================================================
@with_dumpfile
def sim_simulated_K_p(Ks, ps, **kw):
    # ps are the probability that ALL seqs match
    n_samples, n_seqs = kw['n_samples'], kw['n_seqs']
    shape = (len(Ks), len(ps), n_samples)

    A = Alphabet('ACGT')
    WB_kw = {
        'g_max': kw.get('g_max', .6),
        'sensitivity': kw.get('sensitivity', .99),
        'alphabet': A,
        'path': kw.get('db_path', ':memory:'),
        'log_level': kw.get('log_level', logging.WARN),
    }
    sim_data = {
        'n_seqs': n_seqs,
        'scores': {'H0': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
                   'H1': {'pos': np.zeros(shape), 'neg': np.zeros(shape)}},
        'K_hat': {'pos': {'sum': np.zeros(shape), 'max': np.zeros(shape)},
                  'neg': {'sum': np.zeros(shape), 'max': np.zeros(shape)}},
        'p_hat': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
        'a_hat': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
        'd_min': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
        'd_max': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
        'time': {'pos': np.zeros(shape), 'neg': np.zeros(shape)},
        'WB_kw': WB_kw,
        'Ks': Ks,
        'ws': [int(np.ceil(np.log(K))) + 1 for K in Ks],
        'ps': ps,
    }
    K_min = 100
    assert K_min <= min(Ks)

    for (K_idx, K), (p_idx, p_match) in product(enumerate(Ks), enumerate(ps)):
        p_min = p_match
        # adjusted wordlens
        wordlen = sim_data['ws'][K_idx]
        WB_kw['wordlen'] = wordlen

        log('simulating (%d samples) K = %d (w=%d), p = %.2f' %
            (n_samples, K, WB_kw['wordlen'], p_match), newline=False)
        for idx in range(n_samples):
            sys.stderr.write('.')
            # distribute p_match evenly over all sequences
            p_match_indiv = np.exp(np.log(p_match) / n_seqs)
            # distribute p_match_indiv evenly over gap and subst
            subst = gap = 1 - np.sqrt(p_match_indiv)
            assert abs((1 - gap) * (1 - subst) - p_match_indiv) < 1e-3
            M = MutationProcess(A, subst_probs=subst, ge_prob=gap,
                                go_prob=gap)
            seqs_rel = [rand_seq(A, K)]
            for _ in range(1, n_seqs):
                seqs_rel.append(M.mutate(seqs_rel[0])[0])
            for seq_idx in range(n_seqs):
                seqs_rel[seq_idx] = rand_seq(A, K / 2) + \
                                    seqs_rel[seq_idx] + \
                                    rand_seq(A, K / 2)
            seqs_unrel = [rand_seq(A, 2 * K) for _ in range(n_seqs)]

            for key, seqs in zip(['pos', 'neg'], [seqs_rel, seqs_unrel]):
                t_start = time()
                WB = WordBlotMultiple(*seqs, **WB_kw)

                # calculate H1/H1 scores with perfect information:
                band_r = WB.band_radius(K)
                ds_band = [(-band_r, band_r)] * (n_seqs - 1)
                volume = K * (2 * band_r) ** (n_seqs - 1)
                num_seeds = WB.seed_count(
                    ds_band=ds_band,
                    a_band=(n_seqs * K / 2, 3 * n_seqs * K / 2)
                )
                s0, s1 = WB.score_num_seeds(num_seeds=num_seeds,
                                            volume=volume,
                                            seglen=K, p_match=p_match)
                sim_data['scores']['H0'][key][K_idx, p_idx, idx] = s0
                sim_data['scores']['H1'][key][K_idx, p_idx, idx] = s1
                sim_data['time'][key][K_idx, p_idx, idx] = time() - t_start

                def _len(seg): return (seg[1][1] - seg[1][0]) / n_seqs

                # for multiple sequences it's too much to ask for
                # at_least_one=True
                results = list(WB.similar_segments(K_min, p_min))
                if results:
                    # pick longest detected homology for p_hat, d_hat, a_hat
                    hom = max(results, key=lambda rec: _len(rec['segment']))
                    sim_data['p_hat'][K_idx, p_idx, idx] = hom['p']
                else:
                    # NOTE a fake segment with all zero properties
                    sim_data['p_hat'][K_idx, p_idx, idx] = 0
                    results = [{'segment': ([(1, 0)] * (n_seqs - 1), [0, 0]),
                                'p': 0}]

                # sum of K_hat, average of a_hat, d_min, d_max
                # sum of K_hat and longest K_hat
                sim_data['K_hat'][key]['sum'][K_idx, p_idx, idx] = sum(
                    _len(r['segment']) for r in results) / 2
                sim_data['K_hat'][key]['max'][K_idx, p_idx, idx] = max(
                    _len(r['segment']) for r in results) / 2

                sim_data['a_hat'][key][K_idx, p_idx, idx] = \
                    (hom['segment'][1][1] + hom['segment'][1][0]) / 2
                sim_data['d_min'][key][K_idx, p_idx, idx] = \
                    hom['segment'][0][0][0]
                sim_data['d_max'][key][K_idx, p_idx, idx] = \
                    hom['segment'][0][0][1]
                sim_data['p_hat'][key][K_idx, p_idx, idx] = hom['p']
        sys.stderr.write('\n')

    return sim_data


def plot_simulated_K_p(sim_data, select_Ks, select_ps, suffix=''):
    Ks, ps = sim_data['Ks'], sim_data['ps']
    n_seqs = sim_data['n_seqs']
    scores = sim_data['scores']

    kw = {'marker': 'o', 'markersize': 3, 'alpha': .6, 'lw': 1.5}
    truth_kw = {'ls': '--', 'alpha': .6, 'color': 'k', 'lw': 1}

    # ======================================
    # score by varying K for select ps
    # score varying p for select Ks
    fig_scores = plt.figure(figsize=(8, 7))
    ax_H0_K = fig_scores.add_subplot(2, 2, 1)
    ax_H0_p = fig_scores.add_subplot(2, 2, 2)
    ax_H1_K = fig_scores.add_subplot(2, 2, 3)
    ax_H1_p = fig_scores.add_subplot(2, 2, 4)

    for mode, ax in zip(['H0', 'H1'], [ax_H0_K, ax_H1_K]):
        colors = color_code(select_ps)
        for p, color in zip(select_ps, colors):
            p_idx = ps.index(p)
            for case, ls in zip(['pos', 'neg'], ['-', '--']):
                label = '' if case == 'neg' else 'p = %.2f' % p
                ax.plot(Ks, scores[mode][case][:, p_idx, :].mean(axis=1),
                        color=color, ls=ls, label=label, **kw)
        ax.set_xscale('log')
        ax.set_ylabel('%s score' % mode, fontsize=10)
        ax.set_xlabel('similarity length', fontsize=10)
        ax.set_xticks(Ks)
        ax.set_xticklabels(Ks)
    ax_H0_K.legend(loc='lower right', fontsize=10)
    ax_H1_K.legend(loc='upper right', fontsize=10)

    for mode, ax in zip(['H0', 'H1'], [ax_H0_p, ax_H1_p]):
        colors = color_code(select_Ks)
        for K, color in zip(select_Ks, colors):
            K_idx = Ks.index(K)
            for case, ls in zip(['pos', 'neg'], ['-', '--']):
                label = '' if case == 'neg' else 'K = %d' % K
                ax.plot(ps, scores[mode][case][K_idx, :, :].mean(axis=1),
                        color=color, ls=ls, label=label, **kw)
        ax.set_ylabel('%s score' % mode)
        ax.set_xlabel('similarity match probability')
        ax.set_xticks(ps)
        ax.set_xticklabels(ps)
        ax.legend(loc='lower left')

    for ax in [ax_H0_K, ax_H0_p]:
        ax.set_yscale('symlog')

    savefig(fig_scores, 'simulations_multiple[scores]%s.png' % suffix)

    # ======================================
    fig_hats = plt.figure(figsize=(8, 7))
    ax_K = fig_hats.add_subplot(2, 2, 1)
    ax_p = fig_hats.add_subplot(2, 2, 3)
    ax_d = fig_hats.add_subplot(2, 2, 2)
    ax_a = fig_hats.add_subplot(2, 2, 4)

    p_hats = sim_data['p_hat']
    p_hats['pos'] *= (1 + p_hats['pos']) / 2
    p_hats['neg'] *= (1 + p_hats['neg']) / 2

    # estimated Ks for select ps
    K_hats = [sim_data['K_hat']['pos']['sum'],
              sim_data['K_hat']['pos']['max'],
              sim_data['K_hat']['neg']['max']]
    lss = ['-', '--', ':']
    markers = ['o', 'o', 'x']
    show_sds = [True, False, False]

    # estimated Ks for select ps
    colors = color_code(select_ps)
    for p, color in zip(select_ps, colors):
        p_idx = ps.index(p)

        kw_ = {'markersize': 3, 'alpha': .7, 'lw': 1, 'color': color,
               'label': 'p = %.2f' % p}
        labeled = False
        for K_hat, ls, marker, sd in zip(K_hats, lss, markers, show_sds):
            kw_['ls'], kw_['marker'] = ls, marker
            if sd:
                plot_with_sd(ax_K, Ks, K_hat[:, p_idx, :], axis=1, **kw_)
            else:
                ax_K.plot(Ks, K_hat[:, p_idx, :].mean(axis=1), **kw_)
            if not labeled:
                kw_.pop('label')
                labeled = True

        ax_K.set_xscale('log')
        ax_K.set_yscale('symlog')
        ax_K.set_xticks(Ks)
        ax_K.set_xticklabels(Ks, rotation=90)

    ax_K.set_ylabel('estimated similarity length', fontsize=10)
    ax_K.set_xlabel('true similarity length', fontsize=10)
    ax_K.plot(Ks, Ks, **truth_kw)
    ax_K.legend(loc='upper left', fontsize=8)

    # estimated ps for select Ks
    colors = color_code(select_Ks)
    kw_ = kw.copy()
    kw_['ls'] = '--'
    for K, color in zip(select_Ks, colors):
        K_idx = Ks.index(K)
        p_hats = sim_data['p_hat']['pos']
        plot_with_sd(ax_p, ps, p_hats[K_idx, :, :], axis=1,
                     color=color, label='K = %d' % K, **kw)

        p_hats = sim_data['p_hat']['neg']
        ax_p.plot(ps, p_hats[K_idx, :, :].mean(axis=1), color=color, **kw_)
        ax_p.set_xticks(ps)
        ax_p.set_xticklabels(ps)
    ax_p.set_ylabel('estimated match probability', fontsize=10)
    ax_p.set_xlabel('true match probability', fontsize=10)
    ax_p.plot(ps, ps, **truth_kw)
    ax_p.legend(loc='upper left', fontsize=8)

    # ======================================
    # estimated diagonal and antidiagonal position and band radius for select
    # match probabilities (select_ps), as a function of K
    d_mins = sim_data['d_min']['pos']
    d_maxs = sim_data['d_max']['pos']
    a_hats = sim_data['a_hat']['pos']

    colors = color_code(select_ps)
    for p, color in zip(select_ps, colors):
        label = 'p = %.2f' % p
        kw = {'color': color, 'alpha': .6, 'lw': 1,
              'marker': 'o', 'markersize': 3}
        p_idx = ps.index(p)

        d_ctrs = (d_mins[:, p_idx, :] + d_maxs[:, p_idx, :]) / 2
        # ax_d.plot(Ks, d_ctrs.mean(axis=1), label=label, **kw)
        plot_with_sd(ax_d, Ks, d_ctrs, axis=1, label=label, **kw)
        plot_with_sd(ax_a, Ks, a_hats[:, p_idx, :], axis=1, label=label, **kw)
        # ax_a.plot(Ks, a_hats[:, p_idx, :].mean(axis=1), label=label, **kw)
        for ax in [ax_d, ax_a]:
            ax.set_xscale('log')
            ax.set_xticks(Ks)
            ax.set_xticklabels(Ks)
            ax.set_xlabel('similarity length')
        ax_a.set_yscale('symlog')

    ax_d.set_xlabel('similarity length')
    ax_d.set_ylabel('estimated diagonal positions')
    ax_d.plot(Ks, [0] * len(Ks), **truth_kw)
    ax_d.legend(loc='upper left', fontsize=8)

    ax_a.plot(Ks, [n_seqs * K for K in Ks], **truth_kw)
    ax_a.set_xlabel('similarity length')
    ax_a.set_ylabel('estimated antidiagonal position')
    ax_a.legend(loc='upper left', fontsize=8)

    savefig(fig_hats, 'simulations_multiple[estimates]%s.png' % suffix)

    # ======================
    # time to score seeds as a function of K (for select ps) and p (for select
    # Ks)
    fig_t = plt.figure(figsize=(8, 4))
    ax_t_K = fig_t.add_subplot(1, 2, 1)
    ax_t_p = fig_t.add_subplot(1, 2, 2)

    kw = {'alpha': .5, 'lw': 1, 'marker': 'o', 'markersize': 2}

    # times as a function of p for select Ks
    colors = color_code(select_Ks)
    for K, color in zip(select_Ks, colors):
        K_idx = Ks.index(K)
        label = 'K = %d (wordlen = %d)' % (K, int(np.ceil(np.log(K))))
        plot_with_sd(ax_t_p, ps, 1000 * sim_data['time']['pos'][K_idx, :, :],
                     axis=1, color=color, ls='-', label=label, **kw)
    ax_t_p.set_xticks(ps)
    ax_t_p.set_xticklabels(ps)
    # NOTE make room for legend manually
    ax_t_p.set_ylim(None, 1400)
    ax_t_p.set_ylabel('time (ms) to score all seeds')
    ax_t_p.set_xlabel('true match probability')
    ax_t_p.legend(loc='upper left', fontsize=6)

    # times as a function of K for select ps
    colors = color_code(select_ps)
    for p_match, color in zip(select_ps, colors):
        p_idx = ps.index(p_match)
        label = 'p = %.2f' % p_match
        plot_with_sd(ax_t_K, Ks, 1000 * sim_data['time']['pos'][:, p_idx, :],
                     axis=1, color=color, ls='-', label=label, **kw)
    ax_t_K.set_xticks(Ks)
    ax_t_K.set_xticklabels(Ks, rotation=90, fontsize=8)
    # NOTE make room for legend manually
    ax_t_K.set_ylim(None, 1800)
    ax_t_K.set_ylabel('time (ms) to score all seeds')
    ax_t_K.set_xlabel('similarity lengths')
    ax_t_K.legend(loc='upper left')

    savefig(fig_t, 'simulations_multiple[times]%s.png' % suffix)


[docs]def exp_simulated_K_p():
    """Performance of Word-Blot and associated statistical scores for multiple
    local similarity search in *simulations*:

    * z-scores assigned to diagonal n-dimensional volumes with and without
      similarities under the corresponding limiting Normal distributions
      (:math:`H0, H1`), for varying similarity lengths and match probabilities.
    * Estimated coordinates, lengths and match probabilities of local
      similarities for varying similarity lengths and match probabilities. Same
      procedure is used as in pairwise comparison simulations (i.e. trials
      containing a similarity consists of multiple sequences of length
      :math:`2K` whose substrings at positions :math:`[\\frac{K}{2},
      \\frac{3K}{2}]` are homologies of length :math:`K` and match probability
      :math:`p` in the multiple sequence sense).
    * Word lengths are automatically adjusted to ensure effectively linear
      growth of complexity with respect to sequence lengths.

    **Supported Claims**

    * z-scores calculated by
      :func:`biseqt.blot.WordBlotMultiuple.score_num_seeds` against the
      limiting normal distributions for multiple sequences are stable and
      comparable across different similarity lengths and match probabilities;
      thus both scores are reliable statistics.
    * Local similarities found by Word-Blot as per
      :func:`biseqt.blot.WordBlotMultiple.similar_segments` are accurate in
      length, estimated match probability, and coordinates.
    * Adjusting kmer word lengths in logarithmic relationship with sequence
      length maintains effectively linear complexity and up to sequences of
      length 6.4 kbp behaves well.

    .. figure::
        https://www.dropbox.com/s/un7v14rnng0gxru/
        simulations_multiple%5Bscores%5D.png?raw=1
       :target:
        https://www.dropbox.com/s/un7v14rnng0gxru/
        simulations_multiple%5Bscores%5D.png?raw=1
       :alt: lightbox

       Z-scores of the number of seeds in diagonal volumes for related (solid
       lines) and unrelated (dashed lines) multiple sequence (N=6) segments of
       varying lengths, calculated against the limiting distribution for
       unrelated pairs of sequences (*left*) and related pairs (*right*) as a
       function of similarity length (*top*) and match probability (*bottom*),
       n=50 samples. Note that, as desired, H0 score is stable for unrelated
       sequences of any length and H1 score is stable for related sequences of
       any length or match probability.

    .. figure::
        https://www.dropbox.com/s/owyhm9ks1rt5v2c/
        simulations_multiple%5Bestimates%5D.png?raw=1
       :target:
        https://www.dropbox.com/s/owyhm9ks1rt5v2c/
        simulations_multiple%5Bestimates%5D.png?raw=1
       :alt: lightbox

       Estimated length (*top left*), match probability (*bottom left*),
       diagonal position (*top right*) and antidiagonal position (*bottom
       right*) of multiple (N=6) local similarities, word length is adapted to
       sequence lengths, n=20 samples, shaded regions indicate one standard
       deviation. Dashed black lines are ground truth for comparison.  For
       length estimates both the sum of the lengths of all reported similar
       segments (solid) and the length of the longest similar segment (dashed)
       are shown. Estimated match probablity and Length estimates for unrelated
       pairs are also shown (dotted). Estimated match probability, diagonal and
       antidiagonal coordinates correspond to the longest local similarity.

    .. figure::
        https://www.dropbox.com/s/ticgyshozt6epi0/
        simulations_multiple%5Btimes%5D.png?raw=1
       :target:
        https://www.dropbox.com/s/ticgyshozt6epi0/
        simulations_multiple%5Btimes%5D.png?raw=1
       :alt: lightbox

       Time to find and score all multiple sequence (N=6) seeds as a function
       of varying similarity length (*left*) and varying match probability
       (*right*), n=20 samples, shaded regions indicated one standard
       deviation. Word lengths are logarithmically adjusted to sequence length.
    """
    Ks = [100 * 2 ** i for i in range(1, 7)]
    select_Ks = Ks[0], Ks[2], Ks[4]

    ps = [round(1 - .06 * i, 2) for i in range(1, 8)]
    select_ps = ps[0], ps[3], ps[5]

    n_samples = 20
    n_seqs = 6

    suffix = ''
    dumpfile = 'simulations_multiple%s.txt' % suffix
    sim_data = sim_simulated_K_p(
        Ks, ps, n_samples=n_samples, n_seqs=n_seqs,
        dumpfile=dumpfile, ignore_existing=False)
    plot_simulated_K_p(sim_data, select_Ks, select_ps, suffix=suffix)


# ========================================================
# Simulated Rearrangements
# ========================================================
@with_dumpfile
def sim_rearranged_sequences(**kw):
    A = Alphabet('ACGT')
    region_len, subst, gap = kw['region_len'], kw['subst'], kw['gap']
    n_regions, n_indivs = kw['n_regions'], kw['n_indivs']
    wordlen = kw['wordlen']
    M = MutationProcess(A, subst_probs=subst, ge_prob=gap, go_prob=gap)

    # we represnet a sequence by a list of region copies. In each generation
    # random mutations are applied to each region copy and a random pair of
    # regions swap place.
    def _evolve(region_copies):
        new_gen = [None] * len(region_copies)
        for idx, region in enumerate(region_copies):
            new_gen[idx], _ = M.mutate(region)
            new_gen[idx], _ = M.mutate(region)
        i, j = np.random.choice(range(n_regions), 2, replace=False)
        new_gen[i], new_gen[j] = region_copies[j], region_copies[i]
        return new_gen

    generations = [[rand_seq(A, region_len) for _ in range(n_regions)]]
    for _ in range(n_indivs - 1):
        generations.append(_evolve(generations[-1]))
    # each individual is a sequence of region copies
    seqs = [sum(individual, A.parse('')) for individual in generations]

    # NOTE if g is too small we get massive fragmentation that grows
    # exponentially with the number of sequences. The same presumably happens
    # in pairwaise but less noticable.
    # if g is too large we avoid fragmentation but we start not seeing
    # similarities because volume increases (and then what?)
    WB_kw = {'g_max': .6, 'sensitivity': .9, 'alphabet': A,
             'wordlen': wordlen, 'path': ':memory:'}

    K_min = .9 * region_len
    p_min = .6
    sim_data = {
        'n_indivs': n_indivs,
        'n_regions': n_regions,
        'region_len': region_len,
        'generations': generations,
        'seqs': seqs,
        'WB_kw': WB_kw,
        'p_min': p_min,
        'K_min': K_min,
        'segments': [],
    }

    WB = WordBlotMultiple(*seqs, **WB_kw)
    for res in WB.similar_segments(K_min=K_min, p_min=p_min, score=False):
        std_ranges = WB.to_ij_coordinates_seg(res['segment'])
        p_hat = round(res['p'], 3)
        sim_data['segments'].append({'p': p_hat, 'segment': std_ranges})

    return sim_data


def plot_rearranged_sequences(sim_data):
    seqs = sim_data['seqs']
    n_indivs = len(seqs)
    width = .5 * n_indivs
    height = sim_data['n_regions'] * sim_data['region_len'] / 400.
    fig = plt.figure(figsize=(width, height))
    ax = fig.add_subplot(1, 1, 1)

    colors = color_code(range(len(sim_data['segments'])), cmap='tab20')

    for color, rec in zip(colors, sim_data['segments']):
        seg = rec['segment']
        ctrs = [sum(r) / 2 for r in seg]
        xs = range(n_indivs)
        for idx, x in enumerate(xs):
            # ys = [seg[idx][0], sum(seg[idx]) / 2, seg[idx][1]]
            ax.plot([x, x], seg[idx], c=color, lw=2, alpha=.4)
        ax.plot(xs, ctrs, lw=1, marker='o', markersize=5, c=color, alpha=.6)

    for idx, indiv in enumerate(sim_data['generations']):
        positions = np.cumsum([len(seq) for seq in indiv])
        for pos in positions:
            ax.plot([idx - .02, idx + .02], [pos, pos], c='k', alpha=.7)

    ax.set_xticks(range(n_indivs))
    ax.set_xticklabels(['seq %d' % (i + 1) for i in range(n_indivs)],
                       fontsize=8)
    ax.set_xlim(-1, n_indivs)

    fig.tight_layout()
    savefig(fig, 'multiple_rearrangement.png')


def exp_rearrangement():
    suffix = ''
    dumpfile = 'multiple_rearangements%s.txt' % suffix
    subst = .03
    gap = .02

    # NOTE adjust wordlen according to sequence length; good pairs: region_len
    # (100), wordlen (10). region_len(1000), wordlen(12)
    region_len = 200
    n_regions = 10
    wordlen = 8
    n_indivs = 10
    sim_data = sim_rearranged_sequences(
        region_len=region_len, n_regions=n_regions, n_indivs=n_indivs,
        subst=subst, gap=gap, wordlen=wordlen,
        dumpfile=dumpfile, ignore_existing=True
    )
    plot_rearranged_sequences(sim_data)


if __name__ == '__main__':
    exp_biological_multiple_sequences()
    exp_rearrangement()
    exp_simulated_K_p()