comparison vcf_to_msa.py @ 4:f58178c0f00d draft

planemo upload for repository https://github.com/sanbi-sa/tools-sanbi-uwc commit 3e5c71977e50ec920d4f45be809d2528e55bff76
author sanbi-uwc
date Mon, 15 Oct 2018 00:34:44 -0400
parents 62fbd3f96b30
children
comparison
equal deleted inserted replaced
3:62fbd3f96b30 4:f58178c0f00d
9 from Bio.Alphabet import IUPAC 9 from Bio.Alphabet import IUPAC
10 import os.path 10 import os.path
11 import vcf 11 import vcf
12 import intervaltree 12 import intervaltree
13 from operator import itemgetter 13 from operator import itemgetter
14 from pathlib import Path
14 15
15 difference = lambda x,y: 0 if x == y else 1
16 16
17 string_difference = lambda query, target, query_len: sum((difference(query[i], target[i])) for i in range(query_len)) 17 def difference(x, y):
18 return 0 if x == y else 1
19
20
21 def string_difference(query, target, query_len):
22 return sum((difference(query[i], target[i])) for i in range(query_len))
23
18 24
19 def fuzzysearch(query, target): 25 def fuzzysearch(query, target):
20 query_len = len(query) 26 query_len = len(query)
21 target_len = len(target) 27 target_len = len(target)
22 assert query_len <= target_len, "query cannot be longer than target" 28 assert query_len <= target_len, "query cannot be longer than target"
23 min_distance = string_difference(query, target, query_len) 29 min_distance = string_difference(query, target, query_len)
24 best_pos = 0 30 best_pos = 0
25 for i in range(0, target_len - query_len + 1): 31 for i in range(0, target_len - query_len + 1):
26 distance = string_difference(query, target[i:i+query_len], query_len) 32 distance = string_difference(query, target[i:i + query_len], query_len)
27 if distance < min_distance: 33 if distance < min_distance:
28 (min_distance, best_pos) = (distance, i) 34 (min_distance, best_pos) = (distance, i)
29 return best_pos 35 return best_pos
30 36
37
38 class readable_dir(argparse.Action):
39 def __call__(self, parser, namespace, values, option_string=None):
40 prospective_dir = values
41 if not os.path.isdir(prospective_dir):
42 raise argparse.ArgumentTypeError("readable_dir:{0} is not a valid path".format(prospective_dir))
43 if os.access(prospective_dir, os.R_OK):
44 setattr(namespace, self.dest, prospective_dir)
45 else:
46 raise argparse.ArgumentTypeError("readable_dir:{0} is not a readable dir".format(prospective_dir))
47
48
31 parser = argparse.ArgumentParser() 49 parser = argparse.ArgumentParser()
32 parser.add_argument('--vcf_files', nargs="+") 50 parser.add_argument('--vcf_files', nargs="+")
33 parser.add_argument('--reference_file', type=argparse.FileType()) 51 parser.add_argument('-d', '--vcf_dir', action=readable_dir, help="VCF directory ")
34 parser.add_argument('--output_file', type=argparse.FileType('w')) 52 parser.add_argument('--reference_file', required=True, type=argparse.FileType())
53 parser.add_argument('--output_file', required=True, type=argparse.FileType('w'))
35 parser.add_argument('--remove_invariant', action='store_true', default=False) 54 parser.add_argument('--remove_invariant', action='store_true', default=False)
55 parser.add_argument('--exclude', type=argparse.FileType(), help='BED format file of regions to exclude from variant calling')
36 args = parser.parse_args() 56 args = parser.parse_args()
57
58 exclude_trees = {}
59 if args.exclude is not None:
60 for line in args.exclude:
61 # all of BED format that we care about is chromosome, start, end
62 fields = line.strip().split('\t')
63 if len(fields) < 3:
64 continue
65 chrom = fields[0]
66 start = int(fields[1])
67 end = int(fields[2])
68 if chrom not in exclude_trees:
69 tree = intervaltree.IntervalTree()
70 exclude_trees[chrom] = tree
71 else:
72 tree = exclude_trees[chrom]
73 tree[start:end] = True
37 74
38 do_inserts = False 75 do_inserts = False
39 do_deletes = False 76 do_deletes = False
40 do_snps = True 77 do_snps = True
41 if (do_inserts or do_deletes) and args.remove_invariant: 78 if (do_inserts or do_deletes) and args.remove_invariant:
43 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq) 80 # reference = str(SeqIO.read(os.path.expanduser("~/Data/fasta/NC_000962.fna"), "fasta").seq)
44 # print(reference, file=open('/tmp/reference.txt', 'w')) 81 # print(reference, file=open('/tmp/reference.txt', 'w'))
45 # vcf_files_dir = os.path.expanduser("~/Data/vcf") 82 # vcf_files_dir = os.path.expanduser("~/Data/vcf")
46 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)] 83 # vcf_files = [os.path.join(vcf_files_dir, "vcf{}.vcf".format(num)) for num in range(1,4)]
47 # print(vcf_files) 84 # print(vcf_files)
48 reference_seq = SeqIO.read(args.reference_file, "fasta") 85 reference_seq = SeqIO.read(args.reference_file, "fasta")
49 reference = str(reference_seq.seq) 86 reference = str(reference_seq.seq)
50 # output_file = open(os.path.join(os.path.expanduser("~/Data/fasta/vcf_to_msa"), 'output.fasta'), 'w') 87 # output_file = open(os.path.join(os.path.expanduser("~/Data/fasta/vcf_to_msa"), 'output.fasta'), 'w')
51 insertions = {} 88 insertions = {}
52 insertion_sites = [] 89 insertion_sites = []
53 tree = intervaltree.IntervalTree() 90 tree = intervaltree.IntervalTree()
54 sequence_names = [] 91 sequence_names = []
55 sequences = {} 92 sequences = {}
56 if args.remove_invariant: 93 if args.remove_invariant:
57 variant_sites = set() 94 variant_sites = set()
58 for i, vcf_descriptor in enumerate(args.vcf_files): 95
59 # seqname = os.path.splitext(os.path.basename(vcf_filename))[0] 96 vcf_list = []
60 (seqname,vcf_filename) = vcf_descriptor.split('^^^') 97 if args.vcf_dir:
98 pathlist = Path(args.vcf_dir).glob('*.vcf')
99 for path in pathlist:
100 vcf_list.append(str(path))
101 elif args.vcf_files:
102 vcf_list = args.vcf_files
103
104 for i, vcf_descriptor in enumerate(vcf_list):
105 # print(os.path.basename(vcf_descriptor))
106 if '^^^' in vcf_descriptor:
107 (seqname, vcf_filename) = vcf_descriptor.split('^^^')
108 else:
109 seqname = str(os.path.basename(vcf_descriptor)).rsplit('.vcf', 1)[0]
110 vcf_filename = vcf_descriptor
61 sequence_names.append(seqname) 111 sequence_names.append(seqname)
62 sequence = list(reference) 112 sequence = list(reference)
63 sequences[seqname] = sequence 113 sequences[seqname] = sequence
64 print(seqname) 114
65 # tsv_filename = vcf_filename.replace(".vcf", ".tsv")
66 # output = open(tsv_filename, "wb")
67 insertions[seqname] = [] 115 insertions[seqname] = []
68 count = 0 116 count = 0
69 for record in vcf.VCFReader(filename=vcf_filename): 117 for record in vcf.VCFReader(filename=vcf_filename):
70 type="unknown" 118 if args.exclude:
119 if record.CHROM in exclude_trees:
120 tree = exclude_trees[record.CHROM]
121 if tree.overlaps(record.affected_start, record.affected_end):
122 continue
123 type = "unknown"
71 if record.is_snp and do_snps: 124 if record.is_snp and do_snps:
72 type="snp" 125 type = "snp"
73 try: 126 try:
74 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele 127 sequence[record.affected_start] = str(record.alleles[1]) # SNP, simply insert alt allele
75 except IndexError as e: 128 except IndexError as e:
76 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) 129 print("snp: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr)
77 if args.remove_invariant: 130 if args.remove_invariant:
78 variant_sites.add(record.affected_start) 131 variant_sites.add(record.affected_start)
79 count += 1 132 count += 1
80 elif record.is_indel: 133 elif record.is_indel:
81 length = record.affected_end - record.affected_start 134 length = record.affected_end - record.affected_start
82 if record.is_deletion and do_deletes: 135 if record.is_deletion and do_deletes:
83 type="del" 136 type = "del"
84 try: 137 try:
85 sequence[record.affected_start:record.affected_end] = ['-'] * length 138 sequence[record.affected_start:record.affected_end] = ['-'] * length
86 except IndexError as e: 139 except IndexError as e:
87 print("del: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr) 140 print("del: Error assigning to {}:{}: {}".format(record.affected_start, record.affected_end, str(e)), file=sys.stderr)
88 count += 1 141 count += 1
89 else: 142 else:
90 if do_inserts: 143 if do_inserts:
91 print("Warning: insert processing from VCF is dangerously broken", file=sys.stderr) 144 print("Warning: insert processing from VCF is dangerously broken", file=sys.stderr)
92 type="ins" 145 type = "ins"
93 # insertions[seqname].append(record) 146 # insertions[seqname].append(record)
94 ref = str(record.alleles[0]) 147 ref = str(record.alleles[0])
95 alt = str(record.alleles[1]) 148 alt = str(record.alleles[1])
96 # print("ins", alt.startswith(ref), fuzzysearch(ref, alt), ref, alt, record.affected_start, record.affected_end, len(alt) - len(ref), len(alt), len(ref), record.affected_end - record.affected_start + 1) 149 # print("ins", alt.startswith(ref), fuzzysearch(ref, alt), ref, alt, record.affected_start, record.affected_end, len(alt) - len(ref), len(alt), len(ref), record.affected_end - record.affected_start + 1)
97 alt_sequence = alt[len(ref) - 1:] if alt.startswith(ref) else alt 150 alt_sequence = alt[len(ref) - 1:] if alt.startswith(ref) else alt
136 seq_str = ''.join([c for (i, c) in enumerate(sequence) if i in variant_sites]) 189 seq_str = ''.join([c for (i, c) in enumerate(sequence) if i in variant_sites])
137 else: 190 else:
138 seq_str = ''.join(sequence) 191 seq_str = ''.join(sequence)
139 SeqIO.write(SeqRecord(Seq(seq_str, alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta") 192 SeqIO.write(SeqRecord(Seq(seq_str, alphabet=IUPAC.ambiguous_dna), id=name, description=""), args.output_file, "fasta")
140 193
141 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii")) 194 # output.write(bytes("\t".join([type, str(record.affected_start), str(record.affected_end), str(record.alleles[0]), str(record.alleles[1])])+"\n", encoding="ascii"))
142 # output.close() 195 # output.close()
143 196
144 args.output_file.close() 197 args.output_file.close()