Mercurial > repos > nick > duplex
comparison utils/fuzzy-match.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:836fa4fe9494 | 18:e4d75f9efb90 |
---|---|
1 #!/usr/bin/env python | |
2 from __future__ import division | |
3 from __future__ import print_function | |
4 import os | |
5 import sys | |
6 import logging | |
7 import argparse | |
8 import tempfile | |
9 import subprocess | |
10 import multiprocessing | |
11 import consensus | |
12 import swalign | |
13 | |
14 ARG_DEFAULTS = {'bar_len':24, 'win_len':5, 'shift':3, 'processes':1, 'loglevel':logging.ERROR} | |
15 USAGE = "%(prog)s [options]" | |
16 DESCRIPTION = """Try to match barcodes with sequencing errors. | |
17 Match based on a small window in the middle of each half of the barcode. | |
18 Then it will align all the unique barcodes which match and then print the similarity of each to the | |
19 consensus.""" | |
20 EPILOG = """This will print each kmer observed, the barcodes which contained it, and their | |
21 similarities. The output is 4 tab-delimited columns: 1. whether the kmer was in the first or second | |
22 half of the barcode (0 for first half, 1 for second) 2. the kmer 3. the barcode 4. its similarity to | |
23 the consensus""" | |
24 | |
25 # Algorithm from Paul Medvedev (email from 2015-12-16) | |
26 | |
27 def main(argv): | |
28 | |
29 parser = argparse.ArgumentParser(description=DESCRIPTION) | |
30 parser.set_defaults(**ARG_DEFAULTS) | |
31 | |
32 parser.add_argument('infile', metavar='families.tsv', nargs='?', | |
33 help='Input file (sorted output of make-barcodes.awk).') | |
34 parser.add_argument('-n', '--num-barcodes', type=int, | |
35 help='Only read in this many different barcodes.') | |
36 parser.add_argument('-c', '--consensus', action='store_true', | |
37 help='Include consensus sequences in the output. They will appear the same as normal barcodes, ' | |
38 'but they will be printed before each set of barcodes matching a kmer. (So you can filter ' | |
39 'them out by looking for when either column 1 or 2 change, then discard the line after ' | |
40 'the change.') | |
41 parser.add_argument('-b', '--bar-len', type=int, | |
42 help='Barcode length. Default: %(default)s') | |
43 parser.add_argument('-w', '--win-len', type=int, | |
44 help='Window (k-mer) size. Default: %(default)s') | |
45 parser.add_argument('-s', '--shift', type=int, | |
46 help='Bases to shift the window (number of k-mers to check). Default: %(default)s') | |
47 parser.add_argument('-q', '--quiet', dest='loglevel', action='store_const', const=logging.CRITICAL) | |
48 parser.add_argument('-v', '--verbose', dest='loglevel', action='store_const', const=logging.INFO) | |
49 parser.add_argument('--debug', dest='loglevel', action='store_const', const=logging.DEBUG) | |
50 parser.add_argument('-p', '--processes', type=int, | |
51 help='Number of worker processes to use. Default: %(default)s') | |
52 | |
53 args = parser.parse_args(argv[1:]) | |
54 | |
55 assert args.processes > 0, '-p must be greater than zero' | |
56 logging.basicConfig(stream=sys.stderr, level=args.loglevel, format='%(message)s') | |
57 | |
58 starts = calc_starts(args.bar_len, args.win_len, args.shift) | |
59 | |
60 if args.infile: | |
61 infile = open(args.infile) | |
62 else: | |
63 infile = sys.stdin | |
64 | |
65 logging.info('Beginning to read in data.') | |
66 # For each window sequence (kmer), build a set of barcodes which contained it, in any of the shift | |
67 # positions. Do this for both halves of the barcode (independently, at the moment). | |
68 kmer_dicts = [{}, {}] | |
69 last_barcode = None | |
70 barcode_count = 0 | |
71 for line in infile: | |
72 fields = line.rstrip('\r\n').split('\t') | |
73 if len(fields) != 8: | |
74 logging.warn('Line contains incorrect number of fields.') | |
75 continue | |
76 barcode = fields[0] | |
77 # Only do it for each unique barcode (in the sorted output, there will be runs of lines with | |
78 # the same barcode). | |
79 if barcode == last_barcode: | |
80 continue | |
81 barcode_count += 1 | |
82 # for each half of the barcode | |
83 for kmer_dict, start in zip(kmer_dicts, starts): | |
84 # for each shift position (trying kmers at each of args.shift offsets) | |
85 for i in range(args.shift): | |
86 kmer = barcode[start+i:start+i+args.win_len] | |
87 kmer_set = kmer_dict.get(kmer, set()) | |
88 kmer_set.add(barcode) | |
89 kmer_dict[kmer] = kmer_set | |
90 last_barcode = barcode | |
91 if args.num_barcodes and barcode_count >= args.num_barcodes: | |
92 break | |
93 | |
94 if infile is not sys.stdin: | |
95 infile.close() | |
96 | |
97 workers = open_workers(args.processes) | |
98 | |
99 # Analyze the groups of barcodes that contained each kmer: | |
100 # Multiple sequence align all the barcodes in a each, call a consensus, then smith-waterman | |
101 # align each barcode to that consensus to measure their similarity to it. | |
102 run_num = 0 | |
103 for dict_num, kmer_dict in enumerate(kmer_dicts): | |
104 # Each half of the barcode (one dict per). | |
105 for kmer, barcodes_set in kmer_dict.items(): | |
106 # Each set of barcodes which share a kmer. | |
107 barcodes = list(barcodes_set) | |
108 results = delegate(workers, run_num, dict_num, kmer, barcodes) | |
109 if results: | |
110 process_results(*results, print_consensus=args.consensus) | |
111 run_num += 1 | |
112 | |
113 # Do one last loop through the workers, reading the remaining results and stopping them. | |
114 # Start at the worker after the last one processed by the previous loop. | |
115 start_i = (run_num + 1) % len(workers) | |
116 for i in range(len(workers)): | |
117 worker_i = (start_i + i) % args.processes | |
118 worker = workers[worker_i] | |
119 results = worker.recv() | |
120 if results: | |
121 process_results(*results, print_consensus=args.consensus) | |
122 worker.send(None) | |
123 | |
124 | |
125 def calc_starts(bar_len, win_len, shift): | |
126 half_len = bar_len//2 | |
127 assert win_len < half_len, 'Window length must be less than half the barcode length.' | |
128 # Place the window right in the middle of the first half of the barcode. | |
129 # Offset is where it should start. | |
130 offset = (half_len-win_len)/2 | |
131 # Move it backward by half the shift length so that the average kmer start is at the offset | |
132 # calculated above. | |
133 start1 = int(offset - shift/2) | |
134 start2 = start1 + half_len | |
135 return start1, start2 | |
136 | |
137 | |
138 def process_results(dict_num, kmer, consensus_seq, barcodes, similarities, print_consensus=False): | |
139 if print_consensus: | |
140 print(dict_num, kmer, consensus_seq, 1.0, sep='\t') | |
141 for barcode, similarity in zip(barcodes, similarities): | |
142 print(dict_num, kmer, barcode, similarity, sep='\t') | |
143 | |
144 | |
145 def open_workers(num_workers): | |
146 """Open the required number of worker processes.""" | |
147 workers = [] | |
148 for i in range(num_workers): | |
149 parent_pipe, child_pipe = multiprocessing.Pipe() | |
150 process = multiprocessing.Process(target=worker_function, args=(child_pipe,)) | |
151 process.start() | |
152 workers.append(parent_pipe) | |
153 return workers | |
154 | |
155 | |
156 def delegate(workers, run_num, dict_num, kmer, barcodes): | |
157 worker_i = run_num % len(workers) | |
158 worker = workers[worker_i] | |
159 if run_num >= len(workers): | |
160 logging.info('Parent: Trying to receive results from worker..') | |
161 results = worker.recv() | |
162 else: | |
163 results = None | |
164 args = (dict_num, kmer, barcodes) | |
165 logging.info('Parent: Sending new data to worker..') | |
166 worker.send(args) | |
167 return results | |
168 | |
169 | |
170 ##### HAPPENS IN CHILD PROCESSES ##### | |
171 | |
172 def worker_function(child_pipe): | |
173 while True: | |
174 # logging.info('Worker: Listening for new data from parent..') | |
175 args = child_pipe.recv() | |
176 if args is None: | |
177 break | |
178 # logging.info('Worker: Sending results back to parent..') | |
179 child_pipe.send(process_barcodes(*args)) | |
180 | |
181 | |
182 def process_barcodes(dict_num, kmer, barcodes): | |
183 """Perform a multiple sequence alignment on a set of barcodes and parse the result. | |
184 Uses MAFFT.""" | |
185 # If there's only one barcode, we don't have to do an alignment. | |
186 if len(barcodes) == 1: | |
187 return dict_num, kmer, barcodes[0], barcodes, [1.0] | |
188 with tempfile.NamedTemporaryFile('w', delete=False, prefix='align.msa.') as family_file: | |
189 for i, barcode in enumerate(barcodes): | |
190 family_file.write('>{}\n'.format(i)) | |
191 family_file.write(barcode+'\n') | |
192 with open(os.devnull, 'w') as devnull: | |
193 try: | |
194 command = ['mafft', '--nuc', '--quiet', family_file.name] | |
195 output = subprocess.check_output(command, stderr=devnull) | |
196 except (OSError, subprocess.CalledProcessError): | |
197 return None | |
198 os.remove(family_file.name) | |
199 alignment = read_fasta(output, upper=True) | |
200 consensus_seq = consensus.get_consensus(alignment) | |
201 similarities = [] | |
202 for barcode in barcodes: | |
203 similarities.append(get_similarity(consensus_seq, barcode)) | |
204 return dict_num, kmer, consensus_seq, barcodes, similarities | |
205 | |
206 | |
207 def read_fasta(fasta, upper=False): | |
208 """Quick and dirty FASTA parser. Return a list of the sequences only (no names).""" | |
209 sequences = [] | |
210 sequence = '' | |
211 for line in fasta.splitlines(): | |
212 if line.startswith('>'): | |
213 if upper: | |
214 sequence = sequence.upper() | |
215 if sequence: | |
216 sequences.append(sequence) | |
217 sequence = '' | |
218 continue | |
219 sequence += line.strip() | |
220 if upper: | |
221 sequence = sequence.upper() | |
222 if sequence: | |
223 sequences.append(sequence) | |
224 return sequences | |
225 | |
226 | |
227 def get_similarity(seq1, seq2): | |
228 align = swalign.smith_waterman(seq1, seq2) | |
229 logging.debug(align.target+'\n'+align.query) | |
230 return align.matches / len(align.query) | |
231 | |
232 | |
233 def fail(message): | |
234 sys.stderr.write(message+"\n") | |
235 sys.exit(1) | |
236 | |
237 if __name__ == '__main__': | |
238 sys.exit(main(sys.argv)) |