import numpy as np
import sys
import logging
from time import time
from matplotlib import pyplot as plt
from biseqt.pw import Aligner, BANDED_MODE, B_LOCAL
from biseqt.blot import WordBlotLocalRef
from biseqt.sequence import Alphabet
from util import log, savefig
from util import fill_in_unknown
from util import load_fasta, with_dumpfile
@with_dumpfile
def sim_ig_genotyping(reads, genes, **kw):
wordlens, p_min = kw['wordlens'], kw['p_min']
mismatch_scores = kw['mismatch_scores']
gap_open_score = kw['gap_open_score']
gap_extend_score = kw['gap_extend_score']
minlens = kw['minlens']
A = Alphabet('ACGT')
WB_kw = {'g_max': .5, 'sensitivity': .9, 'alphabet': A,
'log_level': logging.WARN}
sim_data = {
'genes': genes,
'wordlens': wordlens,
'minlens': minlens,
'reads': reads,
'mappings': {
read_name: {'V': None, 'D': None, 'J': None,
'D_start': None, 'D_end': None,
'time': 0}
for read_name in reads
},
'p_min': p_min,
'WB_kw': WB_kw,
'mismatch_scores': mismatch_scores,
'gap_open_score': gap_open_score,
'gap_extend_score': gap_extend_score,
}
def _matches_in_seg(similar_segment):
p_hat, seg = similar_segment['p'], similar_segment['segment']
seglen = seg[1][1] - seg[1][0]
return p_hat * seglen
def _map_gene_type(read_, gene_type):
t_start = time()
assert gene_type in 'VJD'
WB_kw['wordlen'] = wordlens[gene_type]
WB = WordBlotLocalRef(read_, **WB_kw)
candidates = {}
for idx, (gene, gene_rec) in enumerate(genes[gene_type].items()):
gene_len = len(gene_rec['seq'])
K_min = gene_len / 2
K_min = minlens[gene_type]
similarities = list(
WB.similar_segments(gene_rec['seq'], K_min, p_min)
)
if not similarities:
continue
res = max(similarities, key=lambda rec: _matches_in_seg(rec))
candidates[gene] = res
if not candidates:
sys.stderr.write(' no Ig %s gene found!\n' % gene_type)
return None, time() - t_start
chosen_genes = dict(
sorted(candidates.items(),
key=lambda rec: -_matches_in_seg(rec[1]))[:3]
)
return chosen_genes, time() - t_start
def _add_aln(read_, gene_seq, gene_type, rec):
d_band = rec['segment'][0]
aligner_kw = {
'match_score': 1,
'mismatch_score': mismatch_scores[gene_type],
'ge_score': gap_extend_score,
'go_score': gap_open_score,
'alnmode': BANDED_MODE,
'alntype': B_LOCAL,
'diag_range': (int(d_band[0]), int(d_band[1])),
}
with Aligner(read_, gene_seq, **aligner_kw) as aligner:
aligner.solve()
alignment = aligner.traceback()
tx = alignment.transcript
len_on_gene = sum(tx.count(op) for op in 'MSI')
num_matches = tx.count('M')
p_aln = 1. * num_matches / len_on_gene
rec['p_aln'] = round(p_aln, 2)
rec['len_aln'] = len_on_gene
rec['alignment'] = alignment
return rec
for read_idx, (read_name, read_rec) in enumerate(reads.items()):
read_seq = read_rec['seq']
sys.stderr.write('%d/%d %s\n' % (read_idx + 1, len(reads), read_name))
start_pos = 0
for gene_type in 'VDJ':
mapped_genes, t_elapsed = _map_gene_type(read_seq[start_pos:],
gene_type)
sim_data['mappings'][read_name]['time'] += t_elapsed
igblast = read_rec['igblast'][gene_type]
if not igblast:
# if igblast didn't map a read to any genes don't bother;
# we don't have a ground truth.
continue
min_igblast_p = round(min(rec['p'] for rec in igblast.values()), 2)
min_igblast_L = min(rec['length'] for rec in igblast.values())
min_igblast_m = min(rec['num_matches'] for rec in igblast.values())
sim_data['mappings'][read_name][gene_type] = {
'min_igblast_p': min_igblast_p,
'min_igblast_L': min_igblast_L,
'min_igblast_m': min_igblast_m,
}
if not mapped_genes:
continue
for gene in mapped_genes:
# run overlap NW for all chosen genes
gene_seq = genes[gene_type][gene]['seq']
# print gene
mapped_genes[gene] = _add_aln(read_seq[start_pos:], gene_seq,
gene_type, mapped_genes[gene])
aln = mapped_genes[gene]['alignment']
mapped_genes[gene]['start_pos'] = start_pos
mapped_genes[gene]['end_pos'] = start_pos \
+ aln.origin_start \
+ aln.projected_len(aln.transcript, on='origin')
true_pos = gene in reads[read_name]['igblast'][gene_type]
ours_m = aln.transcript.count('M')
p_aln = mapped_genes[gene]['p_aln']
higher_p = p_aln >= min_igblast_p and \
ours_m >= min_igblast_m - 2
higher_m = ours_m >= min_igblast_m and \
p_aln >= min_igblast_p - .01
true_pos_forgiving = higher_p or higher_m
sys.stderr.write(' %s: %s(%s) ' %
(gene_type,
'+' if true_pos else '-',
'+' if true_pos_forgiving else '-'))
sys.stderr.write(
'%s match=%d (%d),p=%.2f(%.2f)\n' %
(gene, ours_m, min_igblast_m, p_aln, min_igblast_p)
)
start_pos = min(rec['end_pos']
for gene, rec in mapped_genes.items())
sim_data['mappings'][read_name][gene_type].update(mapped_genes)
sys.stderr.write(' * %.2f s\n' %
(sim_data['mappings'][read_name]['time']))
return sim_data
def plot_ig_genotyping(sim_data, suffix=''):
reads = sim_data['reads']
comparison = {
'p': {'V': [], 'D': [], 'J': []},
'K': {'V': [], 'D': [], 'J': []},
'num_match': {'V': [], 'D': [], 'J': []},
}
accuracy = {
'strict': {'V': 0, 'D': 0, 'J': 0},
'forgiving': {'V': 0, 'D': 0, 'J': 0},
}
elapsed_times = []
for read, mappings in sim_data['mappings'].items():
elapsed_times.append(mappings['time'])
for gene_type in 'VDJ':
num_agreements_strict = 0
num_agreements_forgiving = 0
total_predictions = 0
for read, mappings in sim_data['mappings'].items():
if mappings[gene_type] is None:
continue
# pop these metrics so we can iterate over genes
min_igblast_p = mappings[gene_type].pop('min_igblast_p')
min_igblast_L = mappings[gene_type].pop('min_igblast_L')
min_igblast_m = mappings[gene_type].pop('min_igblast_m')
for gene, rec in mappings[gene_type].items():
total_predictions += 1
# NOTE we're duplicating this logic to allow reevaluating
# without redo-ing everything; eventually merge it into the
# sim_* function. Note that we need to distinguish between
# literal agreements between ours and igblast and those cases
# where we argue our match has comparable quality to those of
# igblast.
color = 'r'
if gene in reads[read]['igblast'][gene_type]:
num_agreements_strict += 1
num_agreements_forgiving += 1
color = 'g'
else:
p_aln = rec['p_aln']
ours_m = rec['alignment'].transcript.count('M')
higher_p = p_aln >= min_igblast_p and \
ours_m >= min_igblast_m - 1
higher_m = ours_m >= min_igblast_m and \
p_aln >= min_igblast_p - .01
if higher_p or higher_m:
num_agreements_forgiving += 1
color = 'g'
comparison['p'][gene_type].append(
(rec['p_aln'], min_igblast_p, color)
)
comparison['K'][gene_type].append(
(rec['len_aln'], min_igblast_L, color)
)
accuracy['strict'][gene_type] = \
100. * num_agreements_strict / total_predictions
accuracy['forgiving'][gene_type] = \
100. * num_agreements_forgiving / total_predictions
sys.stderr.write('%s %.2f %.2f' % (gene_type,
accuracy['strict'][gene_type],
accuracy['forgiving'][gene_type]))
avg_time = sum(elapsed_times) / len(sim_data['mappings'])
print 't', avg_time
def _extract_with_noise(mode, gene_type_):
mag = {'p': .005, 'K': .5, 'num_match': .5}[mode]
xs = [rec[0] for rec in comparison[mode][gene_type_]]
xs += np.random.randn(len(comparison[mode][gene_type_])) * mag
ys = [rec[1] for rec in comparison[mode][gene_type_]]
ys += np.random.randn(len(comparison[mode][gene_type_])) * mag
colors = [rec[2] for rec in comparison[mode][gene_type_]]
return xs, ys, colors
# probability of ours vs igblast
fig = plt.figure(figsize=(11, 8))
ax_p_V = fig.add_subplot(2, 3, 1)
ax_p_D = fig.add_subplot(2, 3, 2)
ax_p_J = fig.add_subplot(2, 3, 3)
ax_K_V = fig.add_subplot(2, 3, 4)
ax_K_D = fig.add_subplot(2, 3, 5)
ax_K_J = fig.add_subplot(2, 3, 6)
for gene_type, ax in zip('VDJ', [ax_p_V, ax_p_D, ax_p_J]):
xs, ys, colors = _extract_with_noise('p', gene_type)
ax.scatter(xs, ys, c=colors, alpha=.6, s=20, lw=0)
ax.set_title('%s genes: \\%%%.2f (\\%%%.2f)' %
(gene_type,
accuracy['strict'][gene_type],
accuracy['forgiving'][gene_type]))
ax.set_xlabel('WordBlot similarity')
ax.set_ylabel('IgBlast similarity')
# NOTE this can mislead if output is not checked already
ax.set_xlim(.7, 1.05)
ax.set_ylim(.7, 1.05)
ax.set_aspect('equal')
ax.plot([0, 1], [0, 1], c='k', lw=1, ls='--', alpha=.6)
print 'avg time', avg_time
for gene_type, ax in zip('VDJ', [ax_K_V, ax_K_D, ax_K_J]):
xs, ys, colors = _extract_with_noise('K', gene_type)
ax.scatter(xs, ys, c=colors, alpha=.4, s=20, lw=0)
ax.set_xlabel('WordBlot aligned length')
ax.set_ylabel('IgBlast aligned length')
ax.set_aspect('equal')
x_range, y_range = ax.get_xlim(), ax.get_ylim()
xy_range = (min(x_range[0], y_range[0]), max(x_range[1], y_range[1]))
ax.set_xlim(*xy_range)
ax.set_ylim(*xy_range)
ax.plot(ax.get_xlim(), ax.get_ylim(), c='k', lw=1, ls='--', alpha=.6)
savefig(fig, 'ig_genotyping%s.png' % suffix)
[docs]def exp_ig_genotyping():
"""Shows the performance of Word-Blot in genotyping Immunoglobin heavy
chain genotyping. A thousand reads (average length 240nt) covering the
mature VDJ region of chromosome 14 from the stanford S22 dataset are
genotyped by Word-Blot against the IMGT list of Immunoglobin heavy chain
genes:
* 358 genes and variants for V with average length 292nt,
* 44 genes and variants for D with average length 24nt,
* 13 genes and variants for J with average length 53nt.
For each read and each gene type (V, D, or J) 3 candidates are offered by
Word-Blot which are then compared against the same output of IgBlast.
For each gene type we consider two measures of accuracy:
* the percentage of Word-Blot top 3 candidates that are also IgBlast
top 3 candidates.
* a more forgiving measure which accepts the following candidates as "at
least as good as IgBlast candidates":
* those genes reported by Word-Blot that have a *longer* alignment than
the shortest IgBlast candidate alignment while their percent identity
is at most 1 point below that of the worst IgBlast candidate,
* those genes with a *higher percent identity* than the worst IgBlast
candidate with an aligned length at most 2 nucleotides shorter than
the shortest IgBlast candidate.
The relevant measures for Word-Blot candidates (exact value for percent
identity and aligned length) are obtained by *banded local* alignment in
the diagonal strip suggested by Word-Blot. To ensure comparability of
results the same alignment scores are used as those of IgBlast (gap open
-5, gap extend -2, and mismatch -1, -3, and -2 for V, D, and J
respectively).
**Supported Claims**
* Despite relying on statistical properties of kmers, by using short word
lengths Word-Blot performs well and acceptably fast even for small
sequence lengths and for discriminating highly similar sequences.
.. figure::
https://www.dropbox.com/s/k69jw7w8mwh2biv/
ig_genotyping_first_1000.png?raw=1
:target:
https://www.dropbox.com/s/k69jw7w8mwh2biv/
ig_genotyping_first_1000.png?raw=1
:alt: lightbox
For each of the gene types, V (*left*, word length 8), D (*middle*, word
length 4), and J (*right*, word length 6),
alignment lengths (*top*) and match probabilities (*bottom*) of top 3
candidates reported by Word-Blot and IgBlast are compared. For each
Word-Blot candidate the corresponding IgBlast coordinate is that of the
worst of the three reported by IgBlast for the same read. The two
measures of accuracy for each gene type are shown (more forgiving
measure in parentheses). Each gene mapping is color coded, greens are
those that are in some way "at least as good as IgBlast candidates" and
reds are the rest. Average mapping time for each read is 0.7 second.
"""
p_min = .8
wordlens = {'J': 6, 'V': 8, 'D': 4}
# cf. https://ncbiinsights.ncbi.nlm.nih.gov/tag/igblast/
# https://www.ncbi.nlm.nih.gov/books/NBK279684/
# note: I'm forcing these scores on blast as well:
mismatch_scores = {'V': -1, 'D': -3, 'J': -2}
minlens = {'V': 100, 'D': 10, 'J': 10}
gap_open_score = -5
gap_extend_score = -2
suffix = '_first_1000'
dumpfile = 'igh_s22%s.txt' % suffix
A = Alphabet('ACGT')
reads_file = 'data/igh-s22/s22%s.fa' % suffix
# TODO incorporate the commands in igblast_notes.md in here (as docs or
# automated code).
igblast_file = 'data/igh-s22/igblast%s_clean.out' % suffix
log('loading reads')
reads = {}
with open(reads_file) as f:
for raw_seq, name, _ in load_fasta(f, num_seqs=-1):
read = A.parse(fill_in_unknown(raw_seq, A))
reads[name] = {
'seq': read,
'igblast': {'V': {}, 'D': {}, 'J': {}},
}
with open(igblast_file) as f:
for line in f.readlines():
rec = dict(zip(['gene_type', 'read', 'gene', 'p', 'length'],
line.strip().split()))
gene, gene_type, name = rec['gene'], rec['gene_type'], rec['read']
num_m, length = rec['length'].split('/')
reads[name]['igblast'][gene_type][gene] = {
'p': float(rec['p']) / 100,
'length': int(length),
'num_matches': int(num_m),
}
genes = {'V': {}, 'D': {}, 'J': {}}
repertoire_prefix = 'data/igh-s22/imgt/'
for key in genes:
with open(repertoire_prefix + key + '.fa') as f:
for raw_seq, name, _ in load_fasta(f):
seq = A.parse(fill_in_unknown(raw_seq.upper(), A))
genes[key][name] = {
'seq': seq,
}
log('running Ig genotyping...')
sim_data = sim_ig_genotyping(
reads, genes, wordlens=wordlens, p_min=p_min, minlens=minlens,
mismatch_scores=mismatch_scores,
gap_open_score=gap_open_score,
gap_extend_score=gap_extend_score,
dumpfile=dumpfile, ignore_existing=False
)
plot_ig_genotyping(sim_data, suffix)
if __name__ == '__main__':
exp_ig_genotyping()