Source code for experiments.blot_overlaps

# -*- coding: utf-8 -*-
from itertools import combinations
import logging
import numpy as np
import sys
from matplotlib import pyplot as plt
from time import time

from biseqt.util import ProgressIndicator
from biseqt.blot import WordBlotOverlap, WordBlotOverlapRef
from biseqt.sequence import Alphabet
from biseqt.stochastics import rand_seq, MutationProcess

from util import plot_classifier, log, with_dumpfile, plot_cdf, savefig
from util import plot_with_sd


def load_mapped_reads(path, max_num=-1):
    A = Alphabet('ACGT')
    reads, mappings = [], []
    log('loading sequencing reads from %s' % path)
    count = 0
    reads, mappings = [], []
    with open(path) as f:
        while True:
            line = f.readline()
            if not line:
                break
            rc, coordinates = line[2:].split()
            assert rc in '+-'
            m0, m1 = coordinates.split(':')
            mappings.append((rc, int(m0), int(m1)))
            reads.append(A.parse(f.readline().strip()))
            count += 1
            if count == max_num:
                break
    return reads, mappings


@with_dumpfile
def sim_overlap_simulations(**kw):
    A = Alphabet('ACGT')
    wordlen, gap, subst, Ks = kw['wordlen'], kw['gap'], kw['subst'], kw['Ks']
    n_samples = kw['n_samples']
    g_max = 2 * gap
    WB_kw = {
        'g_max': g_max,
        'sensitivity': .99,
        'alphabet': A,
        'wordlen': wordlen,
        'log_level': logging.WARN,
    }
    p_match = (1 - gap) * (1 - subst)

    def _zero():
        return {key: np.zeros((len(Ks), n_samples))
                for key in ['d_true', 'p_true', 'p', 'score', 'd', 'r']}

    sim_data = {
        'results': {
            'overlap': _zero(),
            'noisy overlap': _zero(),
            'incomplete overlap': _zero(),
            'unrelated': _zero(),
        },
        'Ks': Ks,
        'gap': gap,
        'subst': subst,
        'n_samples': n_samples,
        'WB_kw': WB_kw,
    }

    M = MutationProcess(A, subst_probs=subst, ge_prob=gap, go_prob=gap)
    M_noisy = MutationProcess(
        A, subst_probs=subst * 2, ge_prob=gap * 2, go_prob=gap * 2
    )
    for K_idx, K in enumerate(Ks):
        log('K = %d' % K, newline=False)
        for idx in range(n_samples):
            sys.stderr.write('.')
            d_true = np.random.randint(max(Ks))

            def _flank(length): return rand_seq(A, length)

            overlap_seq = rand_seq(A, K)
            seq_pairs = {
                'overlap': (
                    _flank(d_true) + overlap_seq,
                    M.mutate(overlap_seq)[0] + _flank(min(Ks))
                ),
                'noisy overlap': (
                    _flank(d_true) + overlap_seq,
                    M_noisy.mutate(overlap_seq)[0] + _flank(min(Ks))
                ),
                'incomplete overlap': (
                    _flank(K / 2 + d_true) + overlap_seq + _flank(K / 2),
                    _flank(K / 2) + M.mutate(overlap_seq)[0] + _flank(K / 2)
                ),
                'unrelated': (rand_seq(A, K), rand_seq(A, K))
            }
            p_true = {
                'overlap': p_match,
                'noisy overlap': (1 - 2 * gap) * (1 - 2 * subst),
                'incomplete overlap': None,
                'unrelated': None,
            }

            for key, (S, T) in seq_pairs.items():
                WB = WordBlotOverlap(S, T, **WB_kw)
                res = WB.highest_scoring_overlap_band(p_match * .95)
                sim_data['results'][key]['d_true'][K_idx, idx] = d_true
                sim_data['results'][key]['p'][K_idx, idx] = res['p']
                sim_data['results'][key]['p_true'][K_idx, idx] = p_true[key]
                sim_data['results'][key]['d'][K_idx, idx] = \
                    sum(res['d_band']) / 2
                sim_data['results'][key]['score'][K_idx, idx] = res['score']
                sim_data['results'][key]['r'][K_idx, idx] = \
                    (res['d_band'][1] - res['d_band'][0]) / 2
        sys.stderr.write('\n')
    return sim_data


def plot_overlap_simulations(sim_data):
    Ks = sim_data['Ks']
    colors = {
        'overlap': 'g',
        'unrelated': 'r',
        'noisy overlap': 'b',
        'incomplete overlap': 'm'
    }

    fig = plt.figure(figsize=(8, 4))
    ax_p_hat = fig.add_subplot(1, 2, 1)
    ax_cdf = fig.add_subplot(1, 2, 2)

    errors = {}

    # simulation results
    for key, res in sim_data['results'].items():
        color = colors[key]
        plot_with_sd(ax_p_hat, Ks, res['p'], axis=1, alpha=.8, marker='o',
                     markersize=3, color=color, label=key.replace('_', ' '))

        if res['p_true'][0, 0] is not None:
            p_true = res['p_true'][0, 0]
            ax_p_hat.plot([min(Ks), max(Ks)], [p_true, p_true], lw=5,
                          alpha=.3, color=color, ls='--')

        if key != 'unrelated':
            color = colors[key]
            missed = (np.abs(res['d_true'] - res['d']) > res['r']).flatten()
            errors[key] = 1. * np.sum(missed) / len(missed)

        plot_cdf(ax_cdf, res['p'].flatten(), color=color, smooth_radius=10,
                 label=key)

    for ax in [ax_p_hat, ax_cdf]:
        ax.legend(loc='best', fontsize=8)
    ax_cdf.set_xlabel('estimated match probability')
    ax_cdf.set_ylabel('cumulative distribution')

    ax_p_hat.set_ylim(-.1, 1.1)
    ax_p_hat.set_xlabel('overlap length')
    ax_p_hat.set_ylabel('estimated match probability')

    for key, res in sim_data['results'].items():
        if key == 'unrelated':
            continue
        msg = 'missed diagonal band in %s pairs: %%%.2f' % \
            (key, 100 * errors[key])
        print msg

    fig.tight_layout()
    savefig(fig, 'overlaps_simulations.png')


[docs]def exp_overlap_simulations(): """Performance of Word-Blot in overlap discovery *simulations*. In each trial for overlap length :math:`K`, a pair of sequences are presented to :class:`biseqt.blot.WordBlotOverlap` which have either of: * an overlap similarity of length :math:`K` with gap and substitution probabilities both equal to 0.1 (*overlap* pair), * an overlap similarity of length :math:`K` with twice as much mutation probabilities (*noisy overlap* pair), * a similarity of length :math:`K` with 0.1 substitution and gap probabilities, flanked by two unrelated regions of length :math:`\\frac{K}{2}` such that it does not qualify as a proper overlap similarity (*incomplete overlap* pair), * no similarities. In each case the highest *overlap band match probability* reported by Word-Blot is used as the discriminating statistic. **Supported Claims** * In simulations Word-Blot accurately detects overlaps, estimates their diagonal position, and their match probability and thus can be used for quality-sensitive overlap detection and as a guide for further banded alignment. .. figure:: https://www.dropbox.com/s/ob4lv65l91o0tua/overlaps_simulations.png?raw=1 :target: https://www.dropbox.com/s/ob4lv65l91o0tua/overlaps_simulations.png?raw=1 :alt: lightbox Highest overlap band match probability reported by Word-Blot (*left*) for each of the four cases; word length 6, n=20, shaded regions indicate one standard deviation. For overlap and noisy overlap pairs the known truth is shown in thick dashed lines of corresponding color. For each case the cumulative probability distribution of the match probability reported by Word-Blot is shown (*right*), the clear separation of which indicates that Word-Blot is capable of detecting overlaps of a certain quality with high accuracy. Percentage of cases where the correct diagonal is not contained in reported diagonal band: %0.44 (noisy overlap), %0 (overlap), %0 (incomplete overlap). """ wordlen = 8 gap = .1 subst = .1 Ks = [i * 500 for i in range(2, 11)] n_samples = 50 dumpfile = 'overlap_simulations.txt' sim_data = sim_overlap_simulations( wordlen=wordlen, gap=gap, subst=subst, Ks=Ks, n_samples=n_samples, dumpfile=dumpfile, ignore_existing=False, ) plot_overlap_simulations(sim_data)
@with_dumpfile def sim_overlap_sequencing_reads(**kw): reads_path, wordlen, g_max = kw['reads_path'], kw['wordlen'], kw['g_max'] sim_data = { 'overlap_band': {}, 'wordlen': wordlen, 'reads_path': reads_path, 'avg_time': 0, 'avg_len': 0, } A = Alphabet('ACGT') WB_kw = { 'g_max': g_max, 'sensitivity': .9, 'alphabet': A, 'wordlen': wordlen, 'path': ':memory:', 'log_level': logging.WARN, } reads, mappings = load_mapped_reads(reads_path) sim_data['avg_len'] = int( 1. * sum(len(read) for read in reads) / len(reads) ) num_reads = len(reads) assert len(reads) == len(mappings) num_total = (num_reads * (num_reads - 1)) / 2 log('finding all overlapping pairs of reads') indic = ProgressIndicator(num_total=num_total, percentage=False) indic.start() t_start = time() for i in range(num_reads - 1): WB = WordBlotOverlapRef(reads[i], **WB_kw) for j in range(i + 1, num_reads): indic.progress() res = WB.highest_scoring_overlap_band(reads[j]) sim_data['overlap_band'][(i, j)] = res sim_data['avg_time'] = (time() - t_start) / num_total indic.finish() return sim_data def plot_sequencing_reads_overlap(sim_data, suffix=''): reads, mappings = load_mapped_reads(sim_data['reads_path']) def _overlaps(map0, map1): rc0, from0, to0 = map0 rc1, from1, to1 = map1 if rc0 != rc1: return False overlap_len = min(to0, to1) - max(from0, from1) if overlap_len > 50: return True else: return False pos, neg = [], [] for i, j in combinations(range(len(reads)), 2): if sim_data['overlap_band'][(i, j)] is None: p_hat = 0 else: p_hat = sim_data['overlap_band'][(i, j)]['p'] if _overlaps(mappings[i], mappings[j]): pos.append(p_hat) else: neg.append(p_hat) labels = ['overlapping', 'non-overlapping'] fig, _ = plot_classifier(pos, neg, labels=labels, mark_threshold=.8) print 'avg_time', sim_data['avg_time'], 'avg_len', sim_data['avg_len'] savefig(fig, 'overlaps[roc]%s.png' % suffix, comment='classifier plot')
[docs]def exp_overlap_sequencing_reads(): """Performance of Word-Blot in overlap discovery on *Pac Bio sequencing* reads. Pac Bio reads from chromosome 1 of Leishmenia Donovani ($n=1055$ reads) were mapped to a reference genome using BLASR which provides a ground truth for classification of reads into overlapping and non-overlapping pairs. Word-Blot overlap discovery is then applied to all pairs of reads and the performance of the highest overlap band match probability as a classifier is shown. **Supported Claims** * Word-Blot overlap discovery is a good classifier for overlapping/non-overlapping pairs among Pac Bio sequencing reads. .. figure:: https://www.dropbox.com/s/8cmsnr5x0oe5bbt/ overlaps%5Broc%5D%5Bw%3D10%5D.png?raw=1 :target: https://www.dropbox.com/s/8cmsnr5x0oe5bbt/ overlaps%5Broc%5D%5Bw%3D10%5D.png?raw=1 :alt: lightbox ROC curve for Word-Blot (word length = 10) overlap band match probability estimates as a classifier statistic for overlapping/non-overlapping pairs in Pac Bio sequencing reads. Average comparison time for reads of average length 7kbp is 60 ms. With match probability threshold 0.75 (indicated in all subplots in blue) we obtains %99 specificity and %98 negative predictive value (the crucial factors in overlap detection) and %45 sensitivity and %46 positive predictive value. """ wordlen = 10 suffix = '[w=%d]' % wordlen reads_path = 'data/leishmenia/reads_mapped.fa' dumpfile = 'overlaps%s.txt' % suffix sim_data = sim_overlap_sequencing_reads( reads_path=reads_path, wordlen=wordlen, g_max=.2, dumpfile=dumpfile, ignore_existing=False, ) plot_sequencing_reads_overlap(sim_data, suffix=suffix)
if __name__ == '__main__': exp_overlap_simulations() exp_overlap_sequencing_reads()