annotate vcf_to_msa.py @ 2:a0c85f2d74a5 draft

planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 9612f06b8c60520dc0a047ec072ced317c7796e4
author sanbi-uwc
date Wed, 01 Feb 2017 08:45:12 -0500
parents cc255feec53b
children 62fbd3f96b30
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
1 #!/usr/bin/env python
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
2
2
a0c85f2d74a5 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 9612f06b8c60520dc0a047ec072ced317c7796e4
sanbi-uwc
parents: 0
diff changeset
3 from __future__ import print_function
0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
4 import argparse
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
5 import sys
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
6 from Bio import SeqIO
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
7 from Bio.SeqRecord import SeqRecord
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
8 from Bio.Seq import Seq
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
9 from Bio.Alphabet import IUPAC
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
10 import os.path
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
11 import vcf
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
12 import intervaltree
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
13 from operator import itemgetter
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
14
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
def difference(x, y):
    """Return 0 when ``x == y`` and 1 otherwise."""
    return 0 if x == y else 1


def string_difference(query, target, query_len):
    """Count the mismatching positions among the first ``query_len`` items.

    ``query`` and ``target`` are compared index by index for indices
    ``0 .. query_len - 1``; each differing pair contributes 1 to the total.

    Raises:
        IndexError: if either sequence is shorter than ``query_len``.
    """
    return sum(difference(query[i], target[i]) for i in range(query_len))
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
18
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
def fuzzysearch(query, target):
    """Return the offset in ``target`` where ``query`` matches best.

    Every window of ``len(query)`` items in ``target`` is compared with
    ``query`` position by position; the window with the fewest mismatches
    wins. When several windows tie, the leftmost one is returned.

    Args:
        query: sequence to look for.
        target: sequence to search in; must be at least as long as ``query``.

    Returns:
        The 0-based start index of the best-matching window.

    Raises:
        ValueError: if ``query`` is longer than ``target``.
    """
    query_len = len(query)
    target_len = len(target)
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if query_len > target_len:
        raise ValueError("query cannot be longer than target")

    best_pos = 0
    min_distance = None
    for pos in range(target_len - query_len + 1):
        window = target[pos:pos + query_len]
        distance = sum(1 for q, t in zip(query, window) if q != t)
        # Strict comparison keeps the leftmost window on ties.
        if min_distance is None or distance < min_distance:
            min_distance, best_pos = distance, pos
            if distance == 0:
                # An exact match cannot be beaten; stop scanning.
                break
    return best_pos
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
30
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
# Build a multi-FASTA file from a reference sequence and a set of VCF files:
# the reference is written first, followed by one record per VCF in which the
# selected variant classes have been applied to a copy of the reference.
parser = argparse.ArgumentParser()
# The script cannot run without any of these, so let argparse report a missing
# option instead of failing later on a None value.
parser.add_argument('--vcf_files', nargs="+", required=True)
parser.add_argument('--reference_file', type=argparse.FileType(), required=True)
parser.add_argument('--output_file', type=argparse.FileType('w'), required=True)
args = parser.parse_args()

# Switches selecting which variant classes are applied to the reference.
do_inserts = False
do_deletes = False
do_snps = True

# SeqIO.read() is handed an already-open handle; close it once parsed.
with args.reference_file:
    reference_seq = SeqIO.read(args.reference_file, "fasta")
reference = str(reference_seq.seq)

insertion_sites = []  # (start, end, inserted bases, sequence name) tuples
sequence_names = []   # output order of the per-VCF sequences
sequences = {}        # sequence name -> mutable list of bases
for vcf_descriptor in args.vcf_files:
    # Each descriptor has the form "<sequence name>^^^<path to VCF file>".
    seqname, separator, vcf_filename = vcf_descriptor.partition('^^^')
    if not separator:
        parser.error("expected NAME^^^VCF_PATH, got: {}".format(vcf_descriptor))
    sequence_names.append(seqname)
    sequence = list(reference)
    sequences[seqname] = sequence
    print(seqname)
    for record in vcf.VCFReader(filename=vcf_filename):
        # NOTE(review): affected_start/affected_end are used directly as list
        # indices, so they are presumably 0-based, end-exclusive coordinates
        # - confirm against the PyVCF documentation.
        start = record.affected_start
        end = record.affected_end
        if record.is_snp and do_snps:
            try:
                # SNP: overwrite the reference base with the first ALT allele
                # (any further ALT alleles are ignored).
                sequence[start] = str(record.alleles[1])
            except IndexError as e:
                print("snp: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr)
        elif record.is_indel:
            if record.is_deletion and do_deletes:
                # Slice assignment never raises IndexError (it would silently
                # grow the list), so out-of-range records are rejected here.
                if end > len(sequence):
                    print("del: Error assigning to {}:{}: {}".format(start, end, "list assignment index out of range"), file=sys.stderr)
                else:
                    sequence[start:end] = ['-'] * (end - start)
            elif do_inserts and not record.is_deletion:
                # Deletions must not fall through to this branch when
                # do_deletes is off; only genuine insertions are collected.
                print("Warning: insert processing from VCF is dangerously broken", file=sys.stderr)
                ref = str(record.alleles[0])
                alt = str(record.alleles[1])
                alt_sequence = alt[len(ref) - 1:] if alt.startswith(ref) else alt
                insertion_sites.append((start, end, alt_sequence, seqname))

# The site order does not depend on the sequence being written: sort once.
sorted_sites = sorted(insertion_sites, key=itemgetter(0))

# The context manager closes the output file even if writing fails part-way.
with args.output_file:
    SeqIO.write(reference_seq, args.output_file, "fasta")
    for name in sequence_names:
        sequence = sequences[name]
        for (start, end, allele, seqname) in sorted_sites:
            if end > len(sequence):
                print("ins: Error assigning to {}:{}: {}".format(start, end, "list assignment index out of range"), file=sys.stderr)
                continue
            if name == seqname:
                # The sequence that carries the insertion gets the bases...
                sequence[start:end] = list(str(allele))
            else:
                # ...every other sequence gets gap characters of equal length.
                sequence[start:end] = ['-'] * len(allele)
        # NOTE(review): the alphabet argument relies on Bio.Alphabet, which
        # newer Biopython releases no longer provide - confirm the pinned
        # Biopython version before upgrading.
        SeqIO.write(SeqRecord(Seq(''.join(sequence), alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta")