comparison utils/fuzzy-match.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
comparison
equal deleted inserted replaced
17:836fa4fe9494 18:e4d75f9efb90
1 #!/usr/bin/env python
2 from __future__ import division
3 from __future__ import print_function
4 import os
5 import sys
6 import logging
7 import argparse
8 import tempfile
9 import subprocess
10 import multiprocessing
11 import consensus
12 import swalign
13
14 ARG_DEFAULTS = {'bar_len':24, 'win_len':5, 'shift':3, 'processes':1, 'loglevel':logging.ERROR}
15 USAGE = "%(prog)s [options]"
16 DESCRIPTION = """Try to match barcodes with sequencing errors.
17 Match based on a small window in the middle of each half of the barcode.
18 Then it will align all the unique barcodes which match and then print the similarity of each to the
19 consensus."""
20 EPILOG = """This will print each kmer observed, the barcodes which contained it, and their
21 similarities. The output is 4 tab-delimited columns: 1. whether the kmer was in the first or second
22 half of the barcode (0 for first half, 1 for second) 2. the kmer 3. the barcode 4. its similarity to
23 the consensus"""
24
25 # Algorithm from Paul Medvedev (email from 2015-12-16)
26
27 def main(argv):
28
29 parser = argparse.ArgumentParser(description=DESCRIPTION)
30 parser.set_defaults(**ARG_DEFAULTS)
31
32 parser.add_argument('infile', metavar='families.tsv', nargs='?',
33 help='Input file (sorted output of make-barcodes.awk).')
34 parser.add_argument('-n', '--num-barcodes', type=int,
35 help='Only read in this many different barcodes.')
36 parser.add_argument('-c', '--consensus', action='store_true',
37 help='Include consensus sequences in the output. They will appear the same as normal barcodes, '
38 'but they will be printed before each set of barcodes matching a kmer. (So you can filter '
39 'them out by looking for when either column 1 or 2 change, then discard the line after '
40 'the change.')
41 parser.add_argument('-b', '--bar-len', type=int,
42 help='Barcode length. Default: %(default)s')
43 parser.add_argument('-w', '--win-len', type=int,
44 help='Window (k-mer) size. Default: %(default)s')
45 parser.add_argument('-s', '--shift', type=int,
46 help='Bases to shift the window (number of k-mers to check). Default: %(default)s')
47 parser.add_argument('-q', '--quiet', dest='loglevel', action='store_const', const=logging.CRITICAL)
48 parser.add_argument('-v', '--verbose', dest='loglevel', action='store_const', const=logging.INFO)
49 parser.add_argument('--debug', dest='loglevel', action='store_const', const=logging.DEBUG)
50 parser.add_argument('-p', '--processes', type=int,
51 help='Number of worker processes to use. Default: %(default)s')
52
53 args = parser.parse_args(argv[1:])
54
55 assert args.processes > 0, '-p must be greater than zero'
56 logging.basicConfig(stream=sys.stderr, level=args.loglevel, format='%(message)s')
57
58 starts = calc_starts(args.bar_len, args.win_len, args.shift)
59
60 if args.infile:
61 infile = open(args.infile)
62 else:
63 infile = sys.stdin
64
65 logging.info('Beginning to read in data.')
66 # For each window sequence (kmer), build a set of barcodes which contained it, in any of the shift
67 # positions. Do this for both halves of the barcode (independently, at the moment).
68 kmer_dicts = [{}, {}]
69 last_barcode = None
70 barcode_count = 0
71 for line in infile:
72 fields = line.rstrip('\r\n').split('\t')
73 if len(fields) != 8:
74 logging.warn('Line contains incorrect number of fields.')
75 continue
76 barcode = fields[0]
77 # Only do it for each unique barcode (in the sorted output, there will be runs of lines with
78 # the same barcode).
79 if barcode == last_barcode:
80 continue
81 barcode_count += 1
82 # for each half of the barcode
83 for kmer_dict, start in zip(kmer_dicts, starts):
84 # for each shift position (trying kmers at each of args.shift offsets)
85 for i in range(args.shift):
86 kmer = barcode[start+i:start+i+args.win_len]
87 kmer_set = kmer_dict.get(kmer, set())
88 kmer_set.add(barcode)
89 kmer_dict[kmer] = kmer_set
90 last_barcode = barcode
91 if args.num_barcodes and barcode_count >= args.num_barcodes:
92 break
93
94 if infile is not sys.stdin:
95 infile.close()
96
97 workers = open_workers(args.processes)
98
99 # Analyze the groups of barcodes that contained each kmer:
100 # Multiple sequence align all the barcodes in a each, call a consensus, then smith-waterman
101 # align each barcode to that consensus to measure their similarity to it.
102 run_num = 0
103 for dict_num, kmer_dict in enumerate(kmer_dicts):
104 # Each half of the barcode (one dict per).
105 for kmer, barcodes_set in kmer_dict.items():
106 # Each set of barcodes which share a kmer.
107 barcodes = list(barcodes_set)
108 results = delegate(workers, run_num, dict_num, kmer, barcodes)
109 if results:
110 process_results(*results, print_consensus=args.consensus)
111 run_num += 1
112
113 # Do one last loop through the workers, reading the remaining results and stopping them.
114 # Start at the worker after the last one processed by the previous loop.
115 start_i = (run_num + 1) % len(workers)
116 for i in range(len(workers)):
117 worker_i = (start_i + i) % args.processes
118 worker = workers[worker_i]
119 results = worker.recv()
120 if results:
121 process_results(*results, print_consensus=args.consensus)
122 worker.send(None)
123
124
125 def calc_starts(bar_len, win_len, shift):
126 half_len = bar_len//2
127 assert win_len < half_len, 'Window length must be less than half the barcode length.'
128 # Place the window right in the middle of the first half of the barcode.
129 # Offset is where it should start.
130 offset = (half_len-win_len)/2
131 # Move it backward by half the shift length so that the average kmer start is at the offset
132 # calculated above.
133 start1 = int(offset - shift/2)
134 start2 = start1 + half_len
135 return start1, start2
136
137
138 def process_results(dict_num, kmer, consensus_seq, barcodes, similarities, print_consensus=False):
139 if print_consensus:
140 print(dict_num, kmer, consensus_seq, 1.0, sep='\t')
141 for barcode, similarity in zip(barcodes, similarities):
142 print(dict_num, kmer, barcode, similarity, sep='\t')
143
144
145 def open_workers(num_workers):
146 """Open the required number of worker processes."""
147 workers = []
148 for i in range(num_workers):
149 parent_pipe, child_pipe = multiprocessing.Pipe()
150 process = multiprocessing.Process(target=worker_function, args=(child_pipe,))
151 process.start()
152 workers.append(parent_pipe)
153 return workers
154
155
156 def delegate(workers, run_num, dict_num, kmer, barcodes):
157 worker_i = run_num % len(workers)
158 worker = workers[worker_i]
159 if run_num >= len(workers):
160 logging.info('Parent: Trying to receive results from worker..')
161 results = worker.recv()
162 else:
163 results = None
164 args = (dict_num, kmer, barcodes)
165 logging.info('Parent: Sending new data to worker..')
166 worker.send(args)
167 return results
168
169
170 ##### HAPPENS IN CHILD PROCESSES #####
171
172 def worker_function(child_pipe):
173 while True:
174 # logging.info('Worker: Listening for new data from parent..')
175 args = child_pipe.recv()
176 if args is None:
177 break
178 # logging.info('Worker: Sending results back to parent..')
179 child_pipe.send(process_barcodes(*args))
180
181
182 def process_barcodes(dict_num, kmer, barcodes):
183 """Perform a multiple sequence alignment on a set of barcodes and parse the result.
184 Uses MAFFT."""
185 # If there's only one barcode, we don't have to do an alignment.
186 if len(barcodes) == 1:
187 return dict_num, kmer, barcodes[0], barcodes, [1.0]
188 with tempfile.NamedTemporaryFile('w', delete=False, prefix='align.msa.') as family_file:
189 for i, barcode in enumerate(barcodes):
190 family_file.write('>{}\n'.format(i))
191 family_file.write(barcode+'\n')
192 with open(os.devnull, 'w') as devnull:
193 try:
194 command = ['mafft', '--nuc', '--quiet', family_file.name]
195 output = subprocess.check_output(command, stderr=devnull)
196 except (OSError, subprocess.CalledProcessError):
197 return None
198 os.remove(family_file.name)
199 alignment = read_fasta(output, upper=True)
200 consensus_seq = consensus.get_consensus(alignment)
201 similarities = []
202 for barcode in barcodes:
203 similarities.append(get_similarity(consensus_seq, barcode))
204 return dict_num, kmer, consensus_seq, barcodes, similarities
205
206
207 def read_fasta(fasta, upper=False):
208 """Quick and dirty FASTA parser. Return a list of the sequences only (no names)."""
209 sequences = []
210 sequence = ''
211 for line in fasta.splitlines():
212 if line.startswith('>'):
213 if upper:
214 sequence = sequence.upper()
215 if sequence:
216 sequences.append(sequence)
217 sequence = ''
218 continue
219 sequence += line.strip()
220 if upper:
221 sequence = sequence.upper()
222 if sequence:
223 sequences.append(sequence)
224 return sequences
225
226
227 def get_similarity(seq1, seq2):
228 align = swalign.smith_waterman(seq1, seq2)
229 logging.debug(align.target+'\n'+align.query)
230 return align.matches / len(align.query)
231
232
233 def fail(message):
234 sys.stderr.write(message+"\n")
235 sys.exit(1)
236
237 if __name__ == '__main__':
238 sys.exit(main(sys.argv))