annotate vcf_to_msa.py @ 3:62fbd3f96b30 draft

planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
author sanbi-uwc
date Fri, 03 Feb 2017 05:56:11 -0500
parents a0c85f2d74a5
children f58178c0f00d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
1 #!/usr/bin/env python
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
2
2
a0c85f2d74a5 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 9612f06b8c60520dc0a047ec072ced317c7796e4
sanbi-uwc
parents: 0
diff changeset
3 from __future__ import print_function
0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
4 import argparse
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
5 import sys
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
6 from Bio import SeqIO
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
7 from Bio.SeqRecord import SeqRecord
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
8 from Bio.Seq import Seq
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
9 from Bio.Alphabet import IUPAC
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
10 import os.path
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
11 import vcf
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
12 import intervaltree
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
13 from operator import itemgetter
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
14
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
15 difference = lambda x,y: 0 if x == y else 1
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
16
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
17 string_difference = lambda query, target, query_len: sum((difference(query[i], target[i])) for i in range(query_len))
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
18
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
19 def fuzzysearch(query, target):
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
20 query_len = len(query)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
21 target_len = len(target)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
22 assert query_len <= target_len, "query cannot be longer than target"
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
23 min_distance = string_difference(query, target, query_len)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
24 best_pos = 0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
25 for i in range(0, target_len - query_len + 1):
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
26 distance = string_difference(query, target[i:i+query_len], query_len)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
27 if distance < min_distance:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
28 (min_distance, best_pos) = (distance, i)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
29 return best_pos
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
30
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
31 parser = argparse.ArgumentParser()
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
32 parser.add_argument('--vcf_files', nargs="+")
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
33 parser.add_argument('--reference_file', type=argparse.FileType())
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
34 parser.add_argument('--output_file', type=argparse.FileType('w'))
3
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
35 parser.add_argument('--remove_invariant', action='store_true', default=False)
0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
36 args = parser.parse_args()
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
37
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
38 do_inserts = False
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
39 do_deletes = False
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
40 do_snps = True
3
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
41 if (do_inserts or do_deletes) and args.remove_invariant:
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
42 exit("Cannot combine indel processing with 'remove_invariant' argument")
0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
43 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
44 # print(reference, file=open('/tmp/reference.txt', 'w'))
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
45 # vcf_files_dir = os.path.expanduser("~/Data/vcf")
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
46 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)]
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
47 # print(vcf_files)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
48 reference_seq = SeqIO.read(args.reference_file, "fasta")
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
49 reference = str(reference_seq.seq)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
50 # output_file = open(os.path.join(os.path.expanduser("~/Data/fasta/vcf_to_msa"), 'output.fasta'), 'w')
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
51 insertions = {}
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
52 insertion_sites = []
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
53 tree = intervaltree.IntervalTree()
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
54 sequence_names = []
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
55 sequences = {}
3
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
56 if args.remove_invariant:
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
57 variant_sites = set()
0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
58 for i, vcf_descriptor in enumerate(args.vcf_files):
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
59 # seqname = os.path.splitext(os.path.basename(vcf_filename))[0]
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
60 (seqname,vcf_filename) = vcf_descriptor.split('^^^')
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
61 sequence_names.append(seqname)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
62 sequence = list(reference)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
63 sequences[seqname] = sequence
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
64 print(seqname)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
65 # tsv_filename = vcf_filename.replace(".vcf", ".tsv")
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
66 # output = open(tsv_filename, "wb")
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
67 insertions[seqname] = []
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
68 count = 0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
69 for record in vcf.VCFReader(filename=vcf_filename):
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
70 type="unknown"
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
71 if record.is_snp and do_snps:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
72 type="snp"
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
73 try:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
74 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
75 except IndexError as e:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
76 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr)
3
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
77 if args.remove_invariant:
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
78 variant_sites.add(record.affected_start)
0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
79 count += 1
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
80 elif record.is_indel:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
81 length = record.affected_end - record.affected_start
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
82 if record.is_deletion and do_deletes:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
83 type="del"
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
84 try:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
85 sequence[record.affected_start:record.affected_end] = ['-'] * length
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
86 except IndexError as e:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
87 print("del: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
88 count += 1
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
89 else:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
90 if do_inserts:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
91 print("Warning: insert processing from VCF is dangerously broken", file=sys.stderr)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
92 type="ins"
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
93 # insertions[seqname].append(record)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
94 ref = str(record.alleles[0])
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
95 alt = str(record.alleles[1])
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
96 # print("ins", alt.startswith(ref), fuzzysearch(ref, alt), ref, alt, record.affected_start, record.affected_end, len(alt) - len(ref), len(alt), len(ref), record.affected_end - record.affected_start + 1)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
97 alt_sequence = alt[len(ref) - 1:] if alt.startswith(ref) else alt
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
98 insertion_sites.append((record.affected_start, record.affected_end, alt_sequence, seqname))
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
99 # interval = intervaltree.Interval(record.affected_start, record.affected_start + length, data=[seqname])
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
100 # if interval in tree:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
101 # existing_interval = tree[interval.begin:interval.end + 1]
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
102 # start = min([existing_interval.begin, interval.begin])
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
103 # end = max([existing_interval.end, interval.end])
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
104 # tree.remove(existing_interval)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
105 # new_interval = intervaltree.Interval(start, end, existing_interval.data + interval.data)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
106 # tree.add(new_interval)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
107
3
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
108 if args.remove_invariant:
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
109 reference_str = ''.join([c for (i, c) in enumerate(reference) if i in variant_sites])
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
110 reference_seq_variant = SeqRecord(Seq(reference_str), id=reference_seq.id, description=reference_seq.description)
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
111 SeqIO.write(reference_seq_variant, args.output_file, "fasta")
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
112 else:
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
113 SeqIO.write(reference_seq, args.output_file, "fasta")
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
114
0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
115 offset = 0
3
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
116 sequence_length = 0
0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
117 for name in sequence_names:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
118 sequence = sequences[name]
3
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
119 sequence_length = len(sequence)
0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
120 for site in sorted(insertion_sites, key=itemgetter(0)):
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
121 (start, end, allele, seqname) = site
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
122 # print(start, allele, seqname)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
123 length = len(allele)
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
124 # start += offset
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
125 # end += offset
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
126 # offset += length
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
127 try:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
128 if name == seqname:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
129 sequence[start:end] = list(str(allele))
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
130 else:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
131 sequence[start:end] = ['-'] * length
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
132 except IndexError as e:
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
133 print("ins: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr)
3
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
134
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
135 if args.remove_invariant:
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
136 seq_str = ''.join([c for (i, c) in enumerate(sequence) if i in variant_sites])
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
137 else:
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
138 seq_str = ''.join(sequence)
62fbd3f96b30 planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
sanbi-uwc
parents: 2
diff changeset
139 SeqIO.write(SeqRecord(Seq(seq_str, alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta")
0
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
140
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
141 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii"))
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
142 # output.close()
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
143
cc255feec53b planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
sanbi-uwc
parents:
diff changeset
144 args.output_file.close()