Source code for experiments.blot_sv

#!/usr/bin/env python
import logging
from itertools import combinations, product
import sys
import numpy as np
import igraph
from time import time
from matplotlib import pyplot as plt

from biseqt.blot import WordBlotLocalRef
from biseqt.sequence import Alphabet
from biseqt.stochastics import rand_seq, rand_read, MutationProcess

from util import plot_with_sd, savefig, with_dumpfile, log


def gen_data_set(**kw):
    alphabet, sv_type = kw['alphabet'], kw['sv_type']
    ref_len, sv_len, sv_pos = kw['ref_len'], kw['sv_len'], kw['sv_pos']
    gap, subst = kw['gap'], kw['subst']
    coverage, margin = kw['coverage'], kw['margin']
    sequencing_kw = {
        'len_mean': sv_len * 2,
        'len_sd': sv_len * .1,
        'expected_coverage': coverage,
    }

    ref = rand_seq(alphabet, ref_len)
    if sv_type == 'insertion':
        indiv = ref[:sv_pos] + rand_seq(alphabet, sv_len) + ref[sv_pos:]
    elif sv_type == 'deletion':
        indiv = ref[:sv_pos] + ref[sv_pos + sv_len:]
    elif sv_type == 'duplication':
        sv_src = kw['sv_src'], kw['sv_src'] + sv_len
        indiv = ref[:sv_pos] + ref[sv_src[0]:sv_src[1]] + ref[sv_pos:]
    else:
        raise ValueError('unknown SV type %s' % sv_type)

    M = MutationProcess(alphabet, subst_probs=subst, ge_prob=gap, go_prob=gap)
    reads = [(M.mutate(read)[0], pos)
             for read, pos in rand_read(indiv, **sequencing_kw)]
    labels = [False] * len(reads)
    for idx, (read, start_pos) in enumerate(reads):
        overlap_with_sv = min(start_pos + len(read), sv_pos + sv_len) - \
                          max(start_pos, sv_pos)
        if overlap_with_sv < margin:
            continue

        start_overlap = start_pos < sv_pos
        end_overlap = start_pos + len(read) > sv_pos + sv_len
        if sv_type == 'insertion':
            labels[idx] = start_overlap and end_overlap
        elif sv_type == 'deletion':
            labels[idx] = start_overlap
        elif sv_type == 'duplication':
            labels[idx] = start_overlap or end_overlap
        else:
            raise ValueError('unknown SV type %s' % sv_type)

    reads = [read for read, pos in reads]
    return {'ref': ref, 'indiv': indiv, 'reads': reads, 'labels': labels}


def chain_paths_on_read(read, sims, **kw):
    margin = kw['margin']
    seg_graph = igraph.Graph()
    for sim in sims:
        seg_graph.add_vertex(rec=sim)
    seg_graph.add_vertex(name='start')
    seg_graph.add_vertex(name='end')
    for idx, sim in enumerate(sims):
        if sim['read'][0] < margin:
            seg_graph.add_edge('start', idx)
        if sim['read'][1] > len(read) - margin:
            seg_graph.add_edge(idx, 'end')

    for (idx0, sim0), (idx1, sim1) in combinations(enumerate(sims), 2):
        from0, to0 = sim0['read']
        from1, to1 = sim1['read']
        overlap_len = min(to0, to1) - max(from0, from1)
        if overlap_len > -margin:
            # HACK what to do with direction?
            if from0 <= from1:
                seg_graph.add_edge(idx0, idx1)
            else:
                seg_graph.add_edge(idx1, idx0)
    return seg_graph.get_all_shortest_paths('start', to='end', mode='out')


def chain_paths_on_ref(read, sims, **kw):
    margin = kw['margin']
    seg_graph = igraph.Graph()
    for sim in sims:
        seg_graph.add_vertex(rec=sim)
    seg_graph.add_vertex(name='start')
    seg_graph.add_vertex(name='end')
    min_start = min(sim['ref'][0] for sim in sims)
    max_end = max(sim['ref'][1] for sim in sims)
    for idx, sim in enumerate(sims):
        if sim['ref'][0] == min_start:
            seg_graph.add_edge('start', idx)
        if sim['ref'][1] == max_end:
            seg_graph.add_edge(idx, 'end')

    for (idx0, sim0), (idx1, sim1) in combinations(enumerate(sims), 2):
        from0, to0 = sim0['ref']
        from1, to1 = sim1['ref']
        overlap_len = min(to0, to1) - max(from0, from1)
        if overlap_len > -margin:
            # HACK what to do with direction?
            if from0 <= from1:
                seg_graph.add_edge(idx0, idx1)
            else:
                seg_graph.add_edge(idx1, idx0)
    return seg_graph.get_all_shortest_paths('start', to='end', mode='out')


# mode tells us whether the path is on read or on ref
def sv_coords(sims, path, mode='read'):
    assert len(path) == 4
    segs_in_order = sorted(path[1:-1], key=lambda i: sims[i]['ref'][0])
    coord0 = sims[segs_in_order[0]]['ref']
    coord1 = sims[segs_in_order[1]]['ref']
    if mode == 'read':
        return coord0[1], coord1[0]
    else:
        return (coord0[1] + coord1[0]) / 2


[docs]def call_svs(ref, reads, margin=50, K_min=200, p_min=.8, **WB_kw): """Calls structural variants based on single read mappings. For each read, the algorithm proceeds as follows: .. figure:: https://www.dropbox.com/s/rjqobtkp60snte7/SV-schematic.png?raw=1 :target: https://www.dropbox.com/s/rjqobtkp60snte7/SV-schematic.png?raw=1 :alt: lightbox Schematic representation of the chain graphs used to call structural variants. Each read (vertical axis) is compared against the reference sequence (horizontal axis) and in each case two chain graphs are created on each of the read and reference axes where two similar segments are connected with an edge if their projections on the corresponding axis can be consistently chained. * all local similarities are found via Word-Blot (:func:`biseqt.blot.WordBlotLocalRef.similar_segments` with given :math:`K_{\min}, p_{\min}`). * Two chain graphs are built based on the projections of similarities on the read and on the reference genome, henceforth the *read graph* and the *reference graph*. * Reads containining SVs are identified using the following rules: * Normal reads are characterized by a shortest path in both the read and reference graphs with exactly two edges between the start and the end of projections. * deletions and duplications produce shortest paths with four edges in the read graph. * insertions produce shortest paths with four edges in the reference graph. """ WB = WordBlotLocalRef(ref, **WB_kw) label_hats = [{'I': [False, []], # insertion 'D': [False, []]} # deletion / duplication for _ in reads] for read_idx, read in enumerate(reads): sims = list( WB.similar_segments(read, K_min, p_min) ) for sim_idx, sim in enumerate(sims): ref_pos, read_pos = WB.to_ij_coordinates_seg(sim['segment']) sims[sim_idx]['read'] = read_pos sims[sim_idx]['ref'] = ref_pos # for duplication or deletion d_paths = chain_paths_on_read(read, sims, margin=margin) if d_paths: d_path = d_paths[0] label_hats[read_idx]['D'][0] = len(d_path) > 3 if len(d_path) == 4: label_hats[read_idx]['D'][1] = sv_coords(sims, d_path, mode='read') i_paths = chain_paths_on_ref(read, sims, margin=margin) if i_paths: i_path = i_paths[0] label_hats[read_idx]['I'][0] = len(i_path) > 3 if len(i_path) == 4: label_hats[read_idx]['I'][1] = sv_coords(sims, i_path, mode='ref') sys.stderr.write('.') return label_hats
@with_dumpfile def sim_structural_variants(**kw): sv_len, ps, wordlen = kw['sv_len'], kw['ps'], kw['wordlen'] n_samples, coverage, margin = kw['n_samples'], kw['coverage'], kw['margin'] sv_types = ['insertion', 'deletion', 'duplication'] # maps sv types to sv call types sv_type_dict = {'insertion': 'I', 'duplication': 'D', 'deletion': 'D'} A = Alphabet('ACGT') WB_kw = {'g_max': .2, 'sensitivity': .9, 'alphabet': A, 'wordlen': wordlen, 'log_level': logging.WARN} def _zero(): return np.zeros((len(ps), n_samples)) sim_data = { 'coverage': coverage, 'n_samples': n_samples, 'sv_len': sv_len, 'ps': ps, 'WB_kw': WB_kw, 'coords': {sv_type: {p_idx: {idx: [] for idx in range(n_samples)} for p_idx in range(len(ps))} for sv_type in sv_types}, 'stats': {stat: {sv_type: _zero() for sv_type in sv_types} for stat in ['tpr', 'fpr']}, 'times': _zero(), } for sv_type, (p_idx, p_match) in product(sv_types, enumerate(ps)): ref_len = sv_len * 5 sv_src = sv_len sv_pos = sv_len * 3 # distribute p_match evenly over gap and subst subst = gap = 1 - np.sqrt(p_match) log('SV: %s, p = %.2f (n=%d rounds)' % (sv_type, p_match, n_samples)) for sample_idx in range(n_samples): dataset = gen_data_set( sv_type=sv_type, alphabet=A, ref_len=ref_len, sv_len=sv_len, sv_src=sv_src, sv_pos=sv_pos, gap=gap, subst=subst, coverage=coverage, margin=margin, ) labels = dataset['labels'] K_min = 200 p_min = .6 t_start = time() label_hats = call_svs(dataset['ref'], dataset['reads'], K_min=K_min, p_min=p_min, **WB_kw) sim_data['times'][p_idx, sample_idx] = time() - t_start tp, fp, tn, fn = 0, 0, 0, 0 for label, label_hat in zip(labels, label_hats): key = sv_type_dict[sv_type] if label and label_hat[key][0]: tp += 1 elif label and not label_hat[key][0]: fn += 1 elif not label and label_hat[key][0]: fp += 1 elif not label and not label_hat[key][0]: tn += 1 if label_hat[key][0]: sim_data['coords'][sv_type][p_idx][sample_idx].append( label_hat[key][1]) # almost imposible that tn + fp = 0, but tp + fn can be zero: if tp + fn == 0: tpr = 1. else: tpr = 1. * tp / (tp + fn) fpr = 1. * fp / (tn + fp) sim_data['stats']['tpr'][sv_type][p_idx, sample_idx] = tpr sim_data['stats']['fpr'][sv_type][p_idx, sample_idx] = fpr sys.stderr.write(' tpr = %.2f, fpr = %.2f' % (tpr, fpr)) sys.stderr.write('\n') return sim_data def plot_structural_variants(sim_data, suffix=''): ps = sim_data['ps'] sv_types = ['duplication', 'deletion', 'insertion'] fig = plt.figure(figsize=(11, 4)) ax_stats = {} # ax_coord = {} for idx, sv_type in enumerate(sv_types): ax_stats[sv_type] = fig.add_subplot(1, 3, idx + 1) kw = {'markersize': 5, 'marker': 'o', 'alpha': .7} ls = {'tpr': '-', 'fpr': '--'} label = {'tpr': 'TPR (sensitivity)', 'fpr': 'FPR (1 - specificity)'} for sv_type in sv_types: for stat, values in sim_data['stats'].items(): plot_with_sd(ax_stats[sv_type], ps, sim_data['stats'][stat][sv_type][:, :], axis=1, ls=ls[stat], label=label[stat], **kw) ax_stats[sv_type].set_ylim(-.1, 1.3) ax_stats[sv_type].set_title(sv_type) ax_stats[sv_type].legend(loc='upper left') ax_stats[sv_type].set_xlabel('match probability') ax_stats[sv_type].set_xticks(ps) ax_stats[sv_type].set_xticklabels(ps) times = sim_data['times'].flatten() print 'time: %.2f s (%.2f)' % (times.mean(), np.sqrt(times.var())) savefig(fig, 'structural_variants%s.png' % suffix)
[docs]def exp_structural_variants(): """Performance of Word-Blot in detecting structural variations (SV) in *simulations*: * Given a single read containing part of a SV and a reference sequence, one can deduce the presence of a SV from the topological arrangement of local similarities. The simple algorithm used here, :func:`call_svs`, simply pays attention to shortest paths in either of two directed graphs obtained by chaining projected similarities on either the reference or the read sequences. * For each of three copy number variation SVs, *insertion, deletion, duplication*, similar segments detected by Word-Blot are used to detect whether each read contains an SV. In each case the true positive rate and false positive rate are plotted as a function of similarity between individual and reference sequences. **Supported Claims** * The simple topological algorithm in :func:`call_svs` can accurately distinguish normal reads from those containing a SV. * The crux of our simple, single-read SV calling algorithm is that, roughly, SVs can be detected in a single read whenever there are more than one read/reference local similarities that, when chained, span the entire read (for duplication and deletion) or a whole region on reference (for insertion). This shows that Word-Blot accurately recovers local similarities at their maximal length without producing unnecessary fragmentation. * Since Word-Blot is a general purpose local similarity search and thus makes no assumption of collinearity it can accurately identify simple SVs (copy number variations) by simply looking at mapping of individual reads to a reference sequence. .. figure:: https://www.dropbox.com/s/ase6z7wzlbc6dy0/ structural_variants%5Bw%3D8%5D.png?raw=1 :target: https://www.dropbox.com/s/ase6z7wzlbc6dy0/ structural_variants%5Bw%3D8%5D.png?raw=1 :alt: lightbox True positive rate (solid) and false positive rate (dashed) for discriminating normal reads from those containing evidence for SV, duplications (*left*), deletion (*middle*), insertion (*right*) using the simple algorithm based on Word-Blot local similarities. SV length is 500nt, reference sequence length is 2500nt, sequencing coverage is 10, word length is 8, and each condition (match probability and SV type) is repeated n=5 times, average computation time for each read is 0.8 seconds. """ wordlen = 8 # kept in memory; don't go too high up sv_len = 500 margin = 50 ps = [round(.98 - .03 * i, 2) for i in range(6)] coverage = 10 n_samples = 5 suffix = '[w=%d]' % wordlen dumpfile = 'sv_calling%s.txt' % suffix sim_data = sim_structural_variants( sv_len=sv_len, ps=ps, wordlen=wordlen, n_samples=n_samples, margin=margin, coverage=coverage, dumpfile=dumpfile, ignore_existing=False, ) plot_structural_variants(sim_data, suffix=suffix)
if __name__ == '__main__': exp_structural_variants()