comparison vcf_to_msa.py @ 3:62fbd3f96b30 draft

planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
author sanbi-uwc
date Fri, 03 Feb 2017 05:56:11 -0500
parents a0c85f2d74a5
children f58178c0f00d
comparison
equal deleted inserted replaced
2:a0c85f2d74a5 3:62fbd3f96b30
30 30
31 parser = argparse.ArgumentParser() 31 parser = argparse.ArgumentParser()
32 parser.add_argument('--vcf_files', nargs="+") 32 parser.add_argument('--vcf_files', nargs="+")
33 parser.add_argument('--reference_file', type=argparse.FileType()) 33 parser.add_argument('--reference_file', type=argparse.FileType())
34 parser.add_argument('--output_file', type=argparse.FileType('w')) 34 parser.add_argument('--output_file', type=argparse.FileType('w'))
35 parser.add_argument('--remove_invariant', action='store_true', default=False)
35 args = parser.parse_args() 36 args = parser.parse_args()
36 37
37 do_inserts = False 38 do_inserts = False
38 do_deletes = False 39 do_deletes = False
39 do_snps = True 40 do_snps = True
41 if (do_inserts or do_deletes) and args.remove_invariant:
42 exit("Cannot combine indel processing with 'remove_invariant' argument")
40 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq) 43 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq)
41 # print(reference, file=open('/tmp/reference.txt', 'w')) 44 # print(reference, file=open('/tmp/reference.txt', 'w'))
42 # vcf_files_dir = os.path.expanduser("~/Data/vcf") 45 # vcf_files_dir = os.path.expanduser("~/Data/vcf")
43 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)] 46 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)]
44 # print(vcf_files) 47 # print(vcf_files)
48 insertions = {} 51 insertions = {}
49 insertion_sites = [] 52 insertion_sites = []
50 tree = intervaltree.IntervalTree() 53 tree = intervaltree.IntervalTree()
51 sequence_names = [] 54 sequence_names = []
52 sequences = {} 55 sequences = {}
56 if args.remove_invariant:
57 variant_sites = set()
53 for i, vcf_descriptor in enumerate(args.vcf_files): 58 for i, vcf_descriptor in enumerate(args.vcf_files):
54 # seqname = os.path.splitext(os.path.basename(vcf_filename))[0] 59 # seqname = os.path.splitext(os.path.basename(vcf_filename))[0]
55 (seqname,vcf_filename) = vcf_descriptor.split('^^^') 60 (seqname,vcf_filename) = vcf_descriptor.split('^^^')
56 sequence_names.append(seqname) 61 sequence_names.append(seqname)
57 sequence = list(reference) 62 sequence = list(reference)
67 type="snp" 72 type="snp"
68 try: 73 try:
69 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele 74 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele
70 except IndexError as e: 75 except IndexError as e:
71 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) 76 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr)
77 if args.remove_invariant:
78 variant_sites.add(record.affected_start)
72 count += 1 79 count += 1
73 elif record.is_indel: 80 elif record.is_indel:
74 length = record.affected_end - record.affected_start 81 length = record.affected_end - record.affected_start
75 if record.is_deletion and do_deletes: 82 if record.is_deletion and do_deletes:
76 type="del" 83 type="del"
96 # end = max([existing_interval.end, interval.end]) 103 # end = max([existing_interval.end, interval.end])
97 # tree.remove(existing_interval) 104 # tree.remove(existing_interval)
98 # new_interval = intervaltree.Interval(start, end, existing_interval.data + interval.data) 105 # new_interval = intervaltree.Interval(start, end, existing_interval.data + interval.data)
99 # tree.add(new_interval) 106 # tree.add(new_interval)
100 107
101 SeqIO.write(reference_seq, args.output_file, "fasta") 108 if args.remove_invariant:
109 reference_str = ''.join([c for (i, c) in enumerate(reference) if i in variant_sites])
110 reference_seq_variant = SeqRecord(Seq(reference_str), id=reference_seq.id, description=reference_seq.description)
111 SeqIO.write(reference_seq_variant, args.output_file, "fasta")
112 else:
113 SeqIO.write(reference_seq, args.output_file, "fasta")
114
102 offset = 0 115 offset = 0
116 sequence_length = 0
103 for name in sequence_names: 117 for name in sequence_names:
104 sequence = sequences[name] 118 sequence = sequences[name]
119 sequence_length = len(sequence)
105 for site in sorted(insertion_sites, key=itemgetter(0)): 120 for site in sorted(insertion_sites, key=itemgetter(0)):
106 (start, end, allele, seqname) = site 121 (start, end, allele, seqname) = site
107 # print(start, allele, seqname) 122 # print(start, allele, seqname)
108 length = len(allele) 123 length = len(allele)
109 # start += offset 124 # start += offset
114 sequence[start:end] = list(str(allele)) 129 sequence[start:end] = list(str(allele))
115 else: 130 else:
116 sequence[start:end] = ['-'] * length 131 sequence[start:end] = ['-'] * length
117 except IndexError as e: 132 except IndexError as e:
118 print("ins: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr) 133 print("ins: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr)
119 SeqIO.write(SeqRecord(Seq(''.join(sequence), alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") 134
135 if args.remove_invariant:
136 seq_str = ''.join([c for (i, c) in enumerate(sequence) if i in variant_sites])
137 else:
138 seq_str = ''.join(sequence)
139 SeqIO.write(SeqRecord(Seq(seq_str, alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta")
120 140
121 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii")) 141 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii"))
122 # output.close() 142 # output.close()
123 143
124 args.output_file.close() 144 args.output_file.close()