Mercurial > repos > sanbi-uwc > vcf_to_alignment
comparison vcf_to_msa.py @ 4:f58178c0f00d draft
planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 3e5c71977e50ec920d4f45be809d2528e55bff76
author | sanbi-uwc |
---|---|
date | Mon, 15 Oct 2018 00:34:44 -0400 |
parents | 62fbd3f96b30 |
children |
comparison
equal
deleted
inserted
replaced
3:62fbd3f96b30 | 4:f58178c0f00d |
---|---|
9 from Bio.Alphabet import IUPAC | 9 from Bio.Alphabet import IUPAC |
10 import os.path | 10 import os.path |
11 import vcf | 11 import vcf |
12 import intervaltree | 12 import intervaltree |
13 from operator import itemgetter | 13 from operator import itemgetter |
14 from pathlib import Path | |
14 | 15 |
15 difference = lambda x,y: 0 if x == y else 1 | |
16 | 16 |
17 string_difference = lambda query, target, query_len: sum((difference(query[i], target[i])) for i in range(query_len)) | 17 def difference(x, y): |
18 return 0 if x == y else 1 | |
19 | |
20 | |
21 def string_difference(query, target, query_len): | |
22 return sum((difference(query[i], target[i])) for i in range(query_len)) | |
23 | |
18 | 24 |
19 def fuzzysearch(query, target): | 25 def fuzzysearch(query, target): |
20 query_len = len(query) | 26 query_len = len(query) |
21 target_len = len(target) | 27 target_len = len(target) |
22 assert query_len <= target_len, "query cannot be longer than target" | 28 assert query_len <= target_len, "query cannot be longer than target" |
23 min_distance = string_difference(query, target, query_len) | 29 min_distance = string_difference(query, target, query_len) |
24 best_pos = 0 | 30 best_pos = 0 |
25 for i in range(0, target_len - query_len + 1): | 31 for i in range(0, target_len - query_len + 1): |
26 distance = string_difference(query, target[i:i+query_len], query_len) | 32 distance = string_difference(query, target[i:i + query_len], query_len) |
27 if distance < min_distance: | 33 if distance < min_distance: |
28 (min_distance, best_pos) = (distance, i) | 34 (min_distance, best_pos) = (distance, i) |
29 return best_pos | 35 return best_pos |
30 | 36 |
37 | |
38 class readable_dir(argparse.Action): | |
39 def __call__(self, parser, namespace, values, option_string=None): | |
40 prospective_dir = values | |
41 if not os.path.isdir(prospective_dir): | |
42 raise argparse.ArgumentTypeError("readable_dir:{0} is not a valid path".format(prospective_dir)) | |
43 if os.access(prospective_dir, os.R_OK): | |
44 setattr(namespace, self.dest, prospective_dir) | |
45 else: | |
46 raise argparse.ArgumentTypeError("readable_dir:{0} is not a readable dir".format(prospective_dir)) | |
47 | |
48 | |
31 parser = argparse.ArgumentParser() | 49 parser = argparse.ArgumentParser() |
32 parser.add_argument('--vcf_files', nargs="+") | 50 parser.add_argument('--vcf_files', nargs="+") |
33 parser.add_argument('--reference_file', type=argparse.FileType()) | 51 parser.add_argument('-d', '--vcf_dir', action=readable_dir, help="VCF directory ") |
34 parser.add_argument('--output_file', type=argparse.FileType('w')) | 52 parser.add_argument('--reference_file', required=True, type=argparse.FileType()) |
53 parser.add_argument('--output_file', required=True, type=argparse.FileType('w')) | |
35 parser.add_argument('--remove_invariant', action='store_true', default=False) | 54 parser.add_argument('--remove_invariant', action='store_true', default=False) |
55 parser.add_argument('--exclude', type=argparse.FileType(), help='BED format file of regions to exclude from variant calling') | |
36 args = parser.parse_args() | 56 args = parser.parse_args() |
57 | |
58 exclude_trees = {} | |
59 if args.exclude is not None: | |
60 for line in args.exclude: | |
61 # all of BED format that we care about is chromosome, start, end | |
62 fields = line.strip().split('\t') | |
63 if len(fields) < 3: | |
64 continue | |
65 chrom = fields[0] | |
66 start = int(fields[1]) | |
67 end = int(fields[2]) | |
68 if chrom not in exclude_trees: | |
69 tree = intervaltree.IntervalTree() | |
70 exclude_trees[chrom] = tree | |
71 else: | |
72 tree = exclude_trees[chrom] | |
73 tree[start:end] = True | |
37 | 74 |
38 do_inserts = False | 75 do_inserts = False |
39 do_deletes = False | 76 do_deletes = False |
40 do_snps = True | 77 do_snps = True |
41 if (do_inserts or do_deletes) and args.remove_invariant: | 78 if (do_inserts or do_deletes) and args.remove_invariant: |
43 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq) | 80 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq) |
44 # print(reference, file=open('/tmp/reference.txt', 'w')) | 81 # print(reference, file=open('/tmp/reference.txt', 'w')) |
45 # vcf_files_dir = os.path.expanduser("~/Data/vcf") | 82 # vcf_files_dir = os.path.expanduser("~/Data/vcf") |
46 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)] | 83 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)] |
47 # print(vcf_files) | 84 # print(vcf_files) |
48 reference_seq = SeqIO.read(args.reference_file, "fasta") | 85 reference_seq = SeqIO.read(args.reference_file, "fasta") |
49 reference = str(reference_seq.seq) | 86 reference = str(reference_seq.seq) |
50 # output_file = open(os.path.join(os.path.expanduser("~/Data/fasta/vcf_to_msa"), 'output.fasta'), 'w') | 87 # output_file = open(os.path.join(os.path.expanduser("~/Data/fasta/vcf_to_msa"), 'output.fasta'), 'w') |
51 insertions = {} | 88 insertions = {} |
52 insertion_sites = [] | 89 insertion_sites = [] |
53 tree = intervaltree.IntervalTree() | 90 tree = intervaltree.IntervalTree() |
54 sequence_names = [] | 91 sequence_names = [] |
55 sequences = {} | 92 sequences = {} |
56 if args.remove_invariant: | 93 if args.remove_invariant: |
57 variant_sites = set() | 94 variant_sites = set() |
58 for i, vcf_descriptor in enumerate(args.vcf_files): | 95 |
59 # seqname = os.path.splitext(os.path.basename(vcf_filename))[0] | 96 vcf_list = [] |
60 (seqname,vcf_filename) = vcf_descriptor.split('^^^') | 97 if args.vcf_dir: |
98 pathlist = Path(args.vcf_dir).glob('*.vcf') | |
99 for path in pathlist: | |
100 vcf_list.append(str(path)) | |
101 elif args.vcf_files: | |
102 vcf_list = args.vcf_files | |
103 | |
104 for i, vcf_descriptor in enumerate(vcf_list): | |
105 # print(os.path.basename(vcf_descriptor)) | |
106 if '^^^' in vcf_descriptor: | |
107 (seqname, vcf_filename) = vcf_descriptor.split('^^^') | |
108 else: | |
109 seqname = str(os.path.basename(vcf_descriptor)).rsplit('.vcf', 1)[0] | |
110 vcf_filename = vcf_descriptor | |
61 sequence_names.append(seqname) | 111 sequence_names.append(seqname) |
62 sequence = list(reference) | 112 sequence = list(reference) |
63 sequences[seqname] = sequence | 113 sequences[seqname] = sequence |
64 print(seqname) | 114 |
65 # tsv_filename = vcf_filename.replace(".vcf", ".tsv") | |
66 # output = open(tsv_filename, "wb") | |
67 insertions[seqname] = [] | 115 insertions[seqname] = [] |
68 count = 0 | 116 count = 0 |
69 for record in vcf.VCFReader(filename=vcf_filename): | 117 for record in vcf.VCFReader(filename=vcf_filename): |
70 type="unknown" | 118 if args.exclude: |
119 if record.CHROM in exclude_trees: | |
120 tree = exclude_trees[record.CHROM] | |
121 if tree.overlaps(record.affected_start, record.affected_end): | |
122 continue | |
123 type = "unknown" | |
71 if record.is_snp and do_snps: | 124 if record.is_snp and do_snps: |
72 type="snp" | 125 type = "snp" |
73 try: | 126 try: |
74 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele | 127 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele |
75 except IndexError as e: | 128 except IndexError as e: |
76 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) | 129 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) |
77 if args.remove_invariant: | 130 if args.remove_invariant: |
78 variant_sites.add(record.affected_start) | 131 variant_sites.add(record.affected_start) |
79 count += 1 | 132 count += 1 |
80 elif record.is_indel: | 133 elif record.is_indel: |
81 length = record.affected_end - record.affected_start | 134 length = record.affected_end - record.affected_start |
82 if record.is_deletion and do_deletes: | 135 if record.is_deletion and do_deletes: |
83 type="del" | 136 type = "del" |
84 try: | 137 try: |
85 sequence[record.affected_start:record.affected_end] = ['-'] * length | 138 sequence[record.affected_start:record.affected_end] = ['-'] * length |
86 except IndexError as e: | 139 except IndexError as e: |
87 print("del: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) | 140 print("del: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) |
88 count += 1 | 141 count += 1 |
89 else: | 142 else: |
90 if do_inserts: | 143 if do_inserts: |
91 print("Warning: insert processing from VCF is dangerously broken", file=sys.stderr) | 144 print("Warning: insert processing from VCF is dangerously broken", file=sys.stderr) |
92 type="ins" | 145 type = "ins" |
93 # insertions[seqname].append(record) | 146 # insertions[seqname].append(record) |
94 ref = str(record.alleles[0]) | 147 ref = str(record.alleles[0]) |
95 alt = str(record.alleles[1]) | 148 alt = str(record.alleles[1]) |
96 # print("ins", alt.startswith(ref), fuzzysearch(ref, alt), ref, alt, record.affected_start, record.affected_end, len(alt) - len(ref), len(alt), len(ref), record.affected_end - record.affected_start + 1) | 149 # print("ins", alt.startswith(ref), fuzzysearch(ref, alt), ref, alt, record.affected_start, record.affected_end, len(alt) - len(ref), len(alt), len(ref), record.affected_end - record.affected_start + 1) |
97 alt_sequence = alt[len(ref) - 1:] if alt.startswith(ref) else alt | 150 alt_sequence = alt[len(ref) - 1:] if alt.startswith(ref) else alt |
136 seq_str = ''.join([c for (i, c) in enumerate(sequence) if i in variant_sites]) | 189 seq_str = ''.join([c for (i, c) in enumerate(sequence) if i in variant_sites]) |
137 else: | 190 else: |
138 seq_str = ''.join(sequence) | 191 seq_str = ''.join(sequence) |
139 SeqIO.write(SeqRecord(Seq(seq_str, alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") | 192 SeqIO.write(SeqRecord(Seq(seq_str, alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") |
140 | 193 |
141 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii")) | 194 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii")) |
142 # output.close() | 195 # output.close() |
143 | 196 |
144 args.output_file.close() | 197 args.output_file.close() |