Mercurial > repos > sanbi-uwc > vcf_to_alignment
comparison vcf_to_msa.py @ 3:62fbd3f96b30 draft
planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 8a59904b63da8bcb647c8afbc2a88070c51a697e
author | sanbi-uwc |
---|---|
date | Fri, 03 Feb 2017 05:56:11 -0500 |
parents | a0c85f2d74a5 |
children | f58178c0f00d |
comparison
equal
deleted
inserted
replaced
2:a0c85f2d74a5 | 3:62fbd3f96b30 |
---|---|
30 | 30 |
31 parser = argparse.ArgumentParser() | 31 parser = argparse.ArgumentParser() |
32 parser.add_argument('--vcf_files', nargs="+") | 32 parser.add_argument('--vcf_files', nargs="+") |
33 parser.add_argument('--reference_file', type=argparse.FileType()) | 33 parser.add_argument('--reference_file', type=argparse.FileType()) |
34 parser.add_argument('--output_file', type=argparse.FileType('w')) | 34 parser.add_argument('--output_file', type=argparse.FileType('w')) |
35 parser.add_argument('--remove_invariant', action='store_true', default=False) | |
35 args = parser.parse_args() | 36 args = parser.parse_args() |
36 | 37 |
37 do_inserts = False | 38 do_inserts = False |
38 do_deletes = False | 39 do_deletes = False |
39 do_snps = True | 40 do_snps = True |
41 if (do_inserts or do_deletes) and args.remove_invariant: | |
42 exit("Cannot combine indel processing with 'remove_invariant' argument") | |
40 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq) | 43 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq) |
41 # print(reference, file=open('/tmp/reference.txt', 'w')) | 44 # print(reference, file=open('/tmp/reference.txt', 'w')) |
42 # vcf_files_dir = os.path.expanduser("~/Data/vcf") | 45 # vcf_files_dir = os.path.expanduser("~/Data/vcf") |
43 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)] | 46 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)] |
44 # print(vcf_files) | 47 # print(vcf_files) |
48 insertions = {} | 51 insertions = {} |
49 insertion_sites = [] | 52 insertion_sites = [] |
50 tree = intervaltree.IntervalTree() | 53 tree = intervaltree.IntervalTree() |
51 sequence_names = [] | 54 sequence_names = [] |
52 sequences = {} | 55 sequences = {} |
56 if args.remove_invariant: | |
57 variant_sites = set() | |
53 for i, vcf_descriptor in enumerate(args.vcf_files): | 58 for i, vcf_descriptor in enumerate(args.vcf_files): |
54 # seqname = os.path.splitext(os.path.basename(vcf_filename))[0] | 59 # seqname = os.path.splitext(os.path.basename(vcf_filename))[0] |
55 (seqname,vcf_filename) = vcf_descriptor.split('^^^') | 60 (seqname,vcf_filename) = vcf_descriptor.split('^^^') |
56 sequence_names.append(seqname) | 61 sequence_names.append(seqname) |
57 sequence = list(reference) | 62 sequence = list(reference) |
67 type="snp" | 72 type="snp" |
68 try: | 73 try: |
69 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele | 74 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele |
70 except IndexError as e: | 75 except IndexError as e: |
71 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) | 76 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) |
77 if args.remove_invariant: | |
78 variant_sites.add(record.affected_start) | |
72 count += 1 | 79 count += 1 |
73 elif record.is_indel: | 80 elif record.is_indel: |
74 length = record.affected_end - record.affected_start | 81 length = record.affected_end - record.affected_start |
75 if record.is_deletion and do_deletes: | 82 if record.is_deletion and do_deletes: |
76 type="del" | 83 type="del" |
96 # end = max([existing_interval.end, interval.end]) | 103 # end = max([existing_interval.end, interval.end]) |
97 # tree.remove(existing_interval) | 104 # tree.remove(existing_interval) |
98 # new_interval = intervaltree.Interval(start, end, existing_interval.data + interval.data) | 105 # new_interval = intervaltree.Interval(start, end, existing_interval.data + interval.data) |
99 # tree.add(new_interval) | 106 # tree.add(new_interval) |
100 | 107 |
101 SeqIO.write(reference_seq, args.output_file, "fasta") | 108 if args.remove_invariant: |
109 reference_str = ''.join([c for (i, c) in enumerate(reference) if i in variant_sites]) | |
110 reference_seq_variant = SeqRecord(Seq(reference_str), id=reference_seq.id, description=reference_seq.description) | |
111 SeqIO.write(reference_seq_variant, args.output_file, "fasta") | |
112 else: | |
113 SeqIO.write(reference_seq, args.output_file, "fasta") | |
114 | |
102 offset = 0 | 115 offset = 0 |
116 sequence_length = 0 | |
103 for name in sequence_names: | 117 for name in sequence_names: |
104 sequence = sequences[name] | 118 sequence = sequences[name] |
119 sequence_length = len(sequence) | |
105 for site in sorted(insertion_sites, key=itemgetter(0)): | 120 for site in sorted(insertion_sites, key=itemgetter(0)): |
106 (start, end, allele, seqname) = site | 121 (start, end, allele, seqname) = site |
107 # print(start, allele, seqname) | 122 # print(start, allele, seqname) |
108 length = len(allele) | 123 length = len(allele) |
109 # start += offset | 124 # start += offset |
114 sequence[start:end] = list(str(allele)) | 129 sequence[start:end] = list(str(allele)) |
115 else: | 130 else: |
116 sequence[start:end] = ['-'] * length | 131 sequence[start:end] = ['-'] * length |
117 except IndexError as e: | 132 except IndexError as e: |
118 print("ins: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr) | 133 print("ins: Error assigning to {}:{}: {}".format(start, end, str(e)), file=sys.stderr) |
119 SeqIO.write(SeqRecord(Seq(''.join(sequence), alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") | 134 |
135 if args.remove_invariant: | |
136 seq_str = ''.join([c for (i, c) in enumerate(sequence) if i in variant_sites]) | |
137 else: | |
138 seq_str = ''.join(sequence) | |
139 SeqIO.write(SeqRecord(Seq(seq_str, alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") | |
120 | 140 |
121 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii")) | 141 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii")) |
122 # output.close() | 142 # output.close() |
123 | 143 |
124 args.output_file.close() | 144 args.output_file.close() |