Mercurial > repos > sanbi-uwc > vcf_to_alignment
comparison vcf_to_msa.py @ 0:cc255feec53b draft
planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit bc8fd85986b54f9d000e7d5869876fc9e479b6eb
| author | sanbi-uwc |
|---|---|
| date | Wed, 01 Feb 2017 06:56:24 -0500 |
| parents | |
| children | a0c85f2d74a5 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:cc255feec53b |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import argparse | |
| 4 import sys | |
| 5 from Bio import SeqIO | |
| 6 from Bio.SeqRecord import SeqRecord | |
| 7 from Bio.Seq import Seq | |
| 8 from Bio.Alphabet import IUPAC | |
| 9 import os.path | |
| 10 import vcf | |
| 11 import intervaltree | |
| 12 from operator import itemgetter | |
| 13 | |
| 14 difference = lambda x,y: 0 if x == y else 1 | |
| 15 | |
| 16 string_difference = lambda query, target, query_len: sum((difference(query[i], target[i])) for i in range(query_len)) | |
| 17 | |
| 18 def fuzzysearch(query, target): | |
| 19 query_len = len(query) | |
| 20 target_len = len(target) | |
| 21 assert query_len <= target_len, "query cannot be longer than target" | |
| 22 min_distance = string_difference(query, target, query_len) | |
| 23 best_pos = 0 | |
| 24 for i in range(0, target_len - query_len + 1): | |
| 25 distance = string_difference(query, target[i:i+query_len], query_len) | |
| 26 if distance < min_distance: | |
| 27 (min_distance, best_pos) = (distance, i) | |
| 28 return best_pos | |
| 29 | |
| 30 parser = argparse.ArgumentParser() | |
| 31 parser.add_argument('--vcf_files', nargs="+") | |
| 32 parser.add_argument('--reference_file', type=argparse.FileType()) | |
| 33 parser.add_argument('--output_file', type=argparse.FileType('w')) | |
| 34 args = parser.parse_args() | |
| 35 | |
| 36 do_inserts = False | |
| 37 do_deletes = False | |
| 38 do_snps = True | |
| 39 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq) | |
| 40 # print(reference, file=open('/tmp/reference.txt', 'w')) | |
| 41 # vcf_files_dir = os.path.expanduser("~/Data/vcf") | |
| 42 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)] | |
| 43 # print(vcf_files) | |
| 44 reference_seq = SeqIO.read(args.reference_file, "fasta") | |
| 45 reference = str(reference_seq.seq) | |
| 46 # output_file = open(os.path.join(os.path.expanduser("~/Data/fasta/vcf_to_msa"), 'output.fasta'), 'w') | |
| 47 insertions = {} | |
| 48 insertion_sites = [] | |
| 49 tree = intervaltree.IntervalTree() | |
| 50 sequence_names = [] | |
| 51 sequences = {} | |
| 52 for i, vcf_descriptor in enumerate(args.vcf_files): | |
| 53 # seqname = os.path.splitext(os.path.basename(vcf_filename))[0] | |
| 54 (seqname,vcf_filename) = vcf_descriptor.split('^^^') | |
| 55 sequence_names.append(seqname) | |
| 56 sequence = list(reference) | |
| 57 sequences[seqname] = sequence | |
| 58 print(seqname) | |
| 59 # tsv_filename = vcf_filename.replace(".vcf", ".tsv") | |
| 60 # output = open(tsv_filename, "wb") | |
| 61 insertions[seqname] = [] | |
| 62 count = 0 | |
| 63 for record in vcf.VCFReader(filename=vcf_filename): | |
| 64 type="unknown" | |
| 65 if record.is_snp and do_snps: | |
| 66 type="snp" | |
| 67 try: | |
| 68 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele | |
| 69 except IndexError as e: | |
| 70 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) | |
| 71 count += 1 | |
| 72 elif record.is_indel: | |
| 73 length = record.affected_end - record.affected_start | |
| 74 if record.is_deletion and do_deletes: | |
| 75 type="del" | |
| 76 try: | |
| 77 sequence[record.affected_start:record.affected_end] = ['-'] * length | |
| 78 except IndexError as e: | |
| 79 print("del: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) | |
| 80 count += 1 | |
| 81 else: | |
| 82 if do_inserts: | |
| 83 print("Warning: insert processing from VCF is dangerously broken", file=sys.stderr) | |
| 84 type="ins" | |
| 85 # insertions[seqname].append(record) | |
| 86 ref = str(record.alleles[0]) | |
| 87 alt = str(record.alleles[1]) | |
| 88 # print("ins", alt.startswith(ref), fuzzysearch(ref, alt), ref, alt, record.affected_start, record.affected_end, len(alt) - len(ref), len(alt), len(ref), record.affected_end - record.affected_start + 1) | |
| 89 alt_sequence = alt[len(ref) - 1:] if alt.startswith(ref) else alt | |
| 90 insertion_sites.append((record.affected_start, record.affected_end, alt_sequence, seqname)) | |
| 91 # interval = intervaltree.Interval(record.affected_start, record.affected_start + length, data=[seqname]) | |
| 92 # if interval in tree: | |
| 93 # existing_interval = tree[interval.begin:interval.end + 1] | |
| 94 # start = min([existing_interval.begin, interval.begin]) | |
| 95 # end = max([existing_interval.end, interval.end]) | |
| 96 # tree.remove(existing_interval) | |
| 97 # new_interval = intervaltree.Interval(start, end, existing_interval.data + interval.data) | |
| 98 # tree.add(new_interval) | |
| 99 | |
| 100 SeqIO.write(reference_seq, args.output_file, "fasta") | |
| 101 offset = 0 | |
| 102 for name in sequence_names: | |
| 103 sequence = sequences[name] | |
| 104 for site in sorted(insertion_sites, key=itemgetter(0)): | |
| 105 (start, end, allele, seqname) = site | |
| 106 # print(start, allele, seqname) | |
| 107 length = len(allele) | |
| 108 # start += offset | |
| 109 # end += offset | |
| 110 # offset += length | |
| 111 try: | |
| 112 if name == seqname: | |
| 113 sequence[start:end] = list(str(allele)) | |
| 114 else: | |
| 115 sequence[start:end] = ['-'] * length | |
| 116 except IndexError as e: | |
| 117 print("ins: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr) | |
| 118 SeqIO.write(SeqRecord(Seq(''.join(sequence), alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") | |
| 119 | |
| 120 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii")) | |
| 121 # output.close() | |
| 122 | |
| 123 args.output_file.close() |
