Mercurial > repos > sanbi-uwc > vcf_to_alignment
comparison vcf_to_msa.py @ 0:cc255feec53b draft
planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
author | sanbi-uwc |
---|---|
date | Wed, 01 Feb 2017 06:56:24 -0500 |
parents | |
children | a0c85f2d74a5 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:cc255feec53b |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse | |
4 import sys | |
5 from Bio import SeqIO | |
6 from Bio.SeqRecord import SeqRecord | |
7 from Bio.Seq import Seq | |
8 from Bio.Alphabet import IUPAC | |
9 import os.path | |
10 import vcf | |
11 import intervaltree | |
12 from operator import itemgetter | |
13 | |
14 difference = lambda x,y: 0 if x == y else 1 | |
15 | |
16 string_difference = lambda query, target, query_len: sum((difference(query[i], target[i])) for i in range(query_len)) | |
17 | |
18 def fuzzysearch(query, target): | |
19 query_len = len(query) | |
20 target_len = len(target) | |
21 assert query_len <= target_len, "query cannot be longer than target" | |
22 min_distance = string_difference(query, target, query_len) | |
23 best_pos = 0 | |
24 for i in range(0, target_len - query_len + 1): | |
25 distance = string_difference(query, target[i:i+query_len], query_len) | |
26 if distance < min_distance: | |
27 (min_distance, best_pos) = (distance, i) | |
28 return best_pos | |
29 | |
30 parser = argparse.ArgumentParser() | |
31 parser.add_argument('--vcf_files', nargs="+") | |
32 parser.add_argument('--reference_file', type=argparse.FileType()) | |
33 parser.add_argument('--output_file', type=argparse.FileType('w')) | |
34 args = parser.parse_args() | |
35 | |
36 do_inserts = False | |
37 do_deletes = False | |
38 do_snps = True | |
39 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq) | |
40 # print(reference, file=open('/tmp/reference.txt', 'w')) | |
41 # vcf_files_dir = os.path.expanduser("~/Data/vcf") | |
42 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)] | |
43 # print(vcf_files) | |
44 reference_seq = SeqIO.read(args.reference_file, "fasta") | |
45 reference = str(reference_seq.seq) | |
46 # output_file = open(os.path.join(os.path.expanduser("~/Data/fasta/vcf_to_msa"), 'output.fasta'), 'w') | |
47 insertions = {} | |
48 insertion_sites = [] | |
49 tree = intervaltree.IntervalTree() | |
50 sequence_names = [] | |
51 sequences = {} | |
52 for i, vcf_descriptor in enumerate(args.vcf_files): | |
53 # seqname = os.path.splitext(os.path.basename(vcf_filename))[0] | |
54 (seqname,vcf_filename) = vcf_descriptor.split('^^^') | |
55 sequence_names.append(seqname) | |
56 sequence = list(reference) | |
57 sequences[seqname] = sequence | |
58 print(seqname) | |
59 # tsv_filename = vcf_filename.replace(".vcf", ".tsv") | |
60 # output = open(tsv_filename, "wb") | |
61 insertions[seqname] = [] | |
62 count = 0 | |
63 for record in vcf.VCFReader(filename=vcf_filename): | |
64 type="unknown" | |
65 if record.is_snp and do_snps: | |
66 type="snp" | |
67 try: | |
68 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele | |
69 except IndexError as e: | |
70 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) | |
71 count += 1 | |
72 elif record.is_indel: | |
73 length = record.affected_end - record.affected_start | |
74 if record.is_deletion and do_deletes: | |
75 type="del" | |
76 try: | |
77 sequence[record.affected_start:record.affected_end] = ['-'] * length | |
78 except IndexError as e: | |
79 print("del: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) | |
80 count += 1 | |
81 else: | |
82 if do_inserts: | |
83 print("Warning: insert processing from VCF is dangerously broken", file=sys.stderr) | |
84 type="ins" | |
85 # insertions[seqname].append(record) | |
86 ref = str(record.alleles[0]) | |
87 alt = str(record.alleles[1]) | |
88 # print("ins", alt.startswith(ref), fuzzysearch(ref, alt), ref, alt, record.affected_start, record.affected_end, len(alt) - len(ref), len(alt), len(ref), record.affected_end - record.affected_start + 1) | |
89 alt_sequence = alt[len(ref) - 1:] if alt.startswith(ref) else alt | |
90 insertion_sites.append((record.affected_start, record.affected_end, alt_sequence, seqname)) | |
91 # interval = intervaltree.Interval(record.affected_start, record.affected_start + length, data=[seqname]) | |
92 # if interval in tree: | |
93 # existing_interval = tree[interval.begin:interval.end + 1] | |
94 # start = min([existing_interval.begin, interval.begin]) | |
95 # end = max([existing_interval.end, interval.end]) | |
96 # tree.remove(existing_interval) | |
97 # new_interval = intervaltree.Interval(start, end, existing_interval.data + interval.data) | |
98 # tree.add(new_interval) | |
99 | |
100 SeqIO.write(reference_seq, args.output_file, "fasta") | |
101 offset = 0 | |
102 for name in sequence_names: | |
103 sequence = sequences[name] | |
104 for site in sorted(insertion_sites, key=itemgetter(0)): | |
105 (start, end, allele, seqname) = site | |
106 # print(start, allele, seqname) | |
107 length = len(allele) | |
108 # start += offset | |
109 # end += offset | |
110 # offset += length | |
111 try: | |
112 if name == seqname: | |
113 sequence[start:end] = list(str(allele)) | |
114 else: | |
115 sequence[start:end] = ['-'] * length | |
116 except IndexError as e: | |
117 print("ins: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr) | |
118 SeqIO.write(SeqRecord(Seq(''.join(sequence), alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") | |
119 | |
120 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii")) | |
121 # output.close() | |
122 | |
123 args.output_file.close() |