diff vcf_to_msa.py @ 0:cc255feec53b draft

planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
author sanbi-uwc
date Wed, 01 Feb 2017 06:56:24 -0500
parents
children a0c85f2d74a5
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_to_msa.py	Wed Feb 01 06:56:24 2017 -0500
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+
+import argparse
+import sys
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+import os.path
+import vcf
+import intervaltree
+from operator import itemgetter
+
+difference = lambda x,y: 0 if x == y else 1
+
+string_difference = lambda query, target, query_len: sum((difference(query[i], target[i])) for i in range(query_len))
+
+def fuzzysearch(query, target):
+    query_len = len(query)
+    target_len = len(target)
+    assert query_len <= target_len, "query cannot be longer than target"
+    min_distance = string_difference(query, target, query_len)
+    best_pos = 0
+    for i in range(0, target_len - query_len + 1):
+        distance = string_difference(query, target[i:i+query_len], query_len)
+        if distance < min_distance:
+            (min_distance, best_pos) = (distance, i)
+    return best_pos
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--vcf_files', nargs="+")
+parser.add_argument('--reference_file', type=argparse.FileType())
+parser.add_argument('--output_file', type=argparse.FileType('w'))
+args = parser.parse_args()
+
+do_inserts = False
+do_deletes = False
+do_snps = True
+# reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq)
+# print(reference, file=open('/tmp/reference.txt', 'w'))
+# vcf_files_dir = os.path.expanduser("~/Data/vcf")
+# vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)]
+# print(vcf_files)
+reference_seq  = SeqIO.read(args.reference_file, "fasta")
+reference = str(reference_seq.seq)
+# output_file = open(os.path.join(os.path.expanduser("~/Data/fasta/vcf_to_msa"), 'output.fasta'), 'w')
+insertions = {}
+insertion_sites = []
+tree = intervaltree.IntervalTree()
+sequence_names = []
+sequences = {}
+for i, vcf_descriptor in enumerate(args.vcf_files):
+    # seqname = os.path.splitext(os.path.basename(vcf_filename))[0]
+    (seqname,vcf_filename) = vcf_descriptor.split('^^^')
+    sequence_names.append(seqname)
+    sequence = list(reference)
+    sequences[seqname] = sequence
+    print(seqname)
+    # tsv_filename = vcf_filename.replace(".vcf", ".tsv")
+    # output = open(tsv_filename, "wb")
+    insertions[seqname] = []
+    count = 0
+    for record in vcf.VCFReader(filename=vcf_filename):
+        type="unknown"
+        if record.is_snp and do_snps:
+            type="snp"
+            try:
+                sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele
+            except IndexError as e:
+                print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr)
+            count += 1
+        elif record.is_indel:
+            length = record.affected_end - record.affected_start
+            if record.is_deletion and do_deletes:
+                type="del"
+                try:
+                    sequence[record.affected_start:record.affected_end] = ['-'] * length
+                except IndexError as e:
+                    print("del: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr)
+                count += 1
+            else:
+                if do_inserts:
+                    print("Warning: insert processing from VCF is dangerously broken", file=sys.stderr)
+                    type="ins"
+                    # insertions[seqname].append(record)
+                    ref = str(record.alleles[0])
+                    alt = str(record.alleles[1])
+                    # print("ins", alt.startswith(ref), fuzzysearch(ref, alt), ref, alt, record.affected_start, record.affected_end, len(alt) - len(ref), len(alt), len(ref), record.affected_end - record.affected_start + 1)
+                    alt_sequence = alt[len(ref) - 1:] if alt.startswith(ref) else alt
+                    insertion_sites.append((record.affected_start, record.affected_end, alt_sequence, seqname))
+                    # interval = intervaltree.Interval(record.affected_start, record.affected_start + length, data=[seqname])
+                    # if interval in tree:
+                    #     existing_interval = tree[interval.begin:interval.end + 1]
+                    #     start = min([existing_interval.begin, interval.begin])
+                    #     end = max([existing_interval.end, interval.end])
+                    #     tree.remove(existing_interval)
+                    #     new_interval = intervaltree.Interval(start, end, existing_interval.data + interval.data)
+                    #     tree.add(new_interval)
+
+SeqIO.write(reference_seq, args.output_file, "fasta")
+offset = 0
+for name in sequence_names:
+    sequence = sequences[name]
+    for site in sorted(insertion_sites, key=itemgetter(0)):
+        (start, end, allele, seqname) = site
+        # print(start, allele, seqname)
+        length = len(allele)
+        # start += offset
+        # end += offset
+        # offset += length
+        try:
+            if name == seqname:
+                sequence[start:end] = list(str(allele))
+            else:
+                sequence[start:end] = ['-'] * length
+        except IndexError as e:
+            print("ins: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr)
+    SeqIO.write(SeqRecord(Seq(''.join(sequence), alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta")
+
+        # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii"))
+    # output.close()
+
+args.output_file.close()