comparison vcf_to_msa.py @ 0:cc255feec53b draft

planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
author sanbi-uwc
date Wed, 01 Feb 2017 06:56:24 -0500
parents
children a0c85f2d74a5
comparison
equal deleted inserted replaced
-1:000000000000 0:cc255feec53b
1 #!/usr/bin/env python
2
3 import argparse
4 import sys
5 from Bio import SeqIO
6 from Bio.SeqRecord import SeqRecord
7 from Bio.Seq import Seq
8 from Bio.Alphabet import IUPAC
9 import os.path
10 import vcf
11 import intervaltree
12 from operator import itemgetter
13
14 difference = lambda x,y: 0 if x == y else 1
15
16 string_difference = lambda query, target, query_len: sum((difference(query[i], target[i])) for i in range(query_len))
17
18 def fuzzysearch(query, target):
19 query_len = len(query)
20 target_len = len(target)
21 assert query_len <= target_len, "query cannot be longer than target"
22 min_distance = string_difference(query, target, query_len)
23 best_pos = 0
24 for i in range(0, target_len - query_len + 1):
25 distance = string_difference(query, target[i:i+query_len], query_len)
26 if distance < min_distance:
27 (min_distance, best_pos) = (distance, i)
28 return best_pos
29
30 parser = argparse.ArgumentParser()
31 parser.add_argument('--vcf_files', nargs="+")
32 parser.add_argument('--reference_file', type=argparse.FileType())
33 parser.add_argument('--output_file', type=argparse.FileType('w'))
34 args = parser.parse_args()
35
36 do_inserts = False
37 do_deletes = False
38 do_snps = True
39 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq)
40 # print(reference, file=open('/tmp/reference.txt', 'w'))
41 # vcf_files_dir = os.path.expanduser("~/Data/vcf")
42 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)]
43 # print(vcf_files)
44 reference_seq = SeqIO.read(args.reference_file, "fasta")
45 reference = str(reference_seq.seq)
46 # output_file = open(os.path.join(os.path.expanduser("~/Data/fasta/vcf_to_msa"), 'output.fasta'), 'w')
47 insertions = {}
48 insertion_sites = []
49 tree = intervaltree.IntervalTree()
50 sequence_names = []
51 sequences = {}
52 for i, vcf_descriptor in enumerate(args.vcf_files):
53 # seqname = os.path.splitext(os.path.basename(vcf_filename))[0]
54 (seqname,vcf_filename) = vcf_descriptor.split('^^^')
55 sequence_names.append(seqname)
56 sequence = list(reference)
57 sequences[seqname] = sequence
58 print(seqname)
59 # tsv_filename = vcf_filename.replace(".vcf", ".tsv")
60 # output = open(tsv_filename, "wb")
61 insertions[seqname] = []
62 count = 0
63 for record in vcf.VCFReader(filename=vcf_filename):
64 type="unknown"
65 if record.is_snp and do_snps:
66 type="snp"
67 try:
68 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele
69 except IndexError as e:
70 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr)
71 count += 1
72 elif record.is_indel:
73 length = record.affected_end - record.affected_start
74 if record.is_deletion and do_deletes:
75 type="del"
76 try:
77 sequence[record.affected_start:record.affected_end] = ['-'] * length
78 except IndexError as e:
79 print("del: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr)
80 count += 1
81 else:
82 if do_inserts:
83 print("Warning: insert processing from VCF is dangerously broken", file=sys.stderr)
84 type="ins"
85 # insertions[seqname].append(record)
86 ref = str(record.alleles[0])
87 alt = str(record.alleles[1])
88 # print("ins", alt.startswith(ref), fuzzysearch(ref, alt), ref, alt, record.affected_start, record.affected_end, len(alt) - len(ref), len(alt), len(ref), record.affected_end - record.affected_start + 1)
89 alt_sequence = alt[len(ref) - 1:] if alt.startswith(ref) else alt
90 insertion_sites.append((record.affected_start, record.affected_end, alt_sequence, seqname))
91 # interval = intervaltree.Interval(record.affected_start, record.affected_start + length, data=[seqname])
92 # if interval in tree:
93 # existing_interval = tree[interval.begin:interval.end + 1]
94 # start = min([existing_interval.begin, interval.begin])
95 # end = max([existing_interval.end, interval.end])
96 # tree.remove(existing_interval)
97 # new_interval = intervaltree.Interval(start, end, existing_interval.data + interval.data)
98 # tree.add(new_interval)
99
100 SeqIO.write(reference_seq, args.output_file, "fasta")
101 offset = 0
102 for name in sequence_names:
103 sequence = sequences[name]
104 for site in sorted(insertion_sites, key=itemgetter(0)):
105 (start, end, allele, seqname) = site
106 # print(start, allele, seqname)
107 length = len(allele)
108 # start += offset
109 # end += offset
110 # offset += length
111 try:
112 if name == seqname:
113 sequence[start:end] = list(str(allele))
114 else:
115 sequence[start:end] = ['-'] * length
116 except IndexError as e:
117 print("ins: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr)
118 SeqIO.write(SeqRecord(Seq(''.join(sequence), alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta")
119
120 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii"))
121 # output.close()
122
123 args.output_file.close()