Mercurial > repos > nick > duplex
comparison utils/fuzzy-match.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
| author | nick |
|---|---|
| date | Thu, 02 Feb 2017 18:44:31 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:836fa4fe9494 | 18:e4d75f9efb90 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 from __future__ import division | |
| 3 from __future__ import print_function | |
| 4 import os | |
| 5 import sys | |
| 6 import logging | |
| 7 import argparse | |
| 8 import tempfile | |
| 9 import subprocess | |
| 10 import multiprocessing | |
| 11 import consensus | |
| 12 import swalign | |
| 13 | |
| 14 ARG_DEFAULTS = {'bar_len':24, 'win_len':5, 'shift':3, 'processes':1, 'loglevel':logging.ERROR} | |
| 15 USAGE = "%(prog)s [options]" | |
| 16 DESCRIPTION = """Try to match barcodes with sequencing errors. | |
| 17 Match based on a small window in the middle of each half of the barcode. | |
| 18 Then it will align all the unique barcodes which match and then print the similarity of each to the | |
| 19 consensus.""" | |
| 20 EPILOG = """This will print each kmer observed, the barcodes which contained it, and their | |
| 21 similarities. The output is 4 tab-delimited columns: 1. whether the kmer was in the first or second | |
| 22 half of the barcode (0 for first half, 1 for second) 2. the kmer 3. the barcode 4. its similarity to | |
| 23 the consensus""" | |
| 24 | |
| 25 # Algorithm from Paul Medvedev (email from 2015-12-16) | |
| 26 | |
| 27 def main(argv): | |
| 28 | |
| 29 parser = argparse.ArgumentParser(description=DESCRIPTION) | |
| 30 parser.set_defaults(**ARG_DEFAULTS) | |
| 31 | |
| 32 parser.add_argument('infile', metavar='families.tsv', nargs='?', | |
| 33 help='Input file (sorted output of make-barcodes.awk).') | |
| 34 parser.add_argument('-n', '--num-barcodes', type=int, | |
| 35 help='Only read in this many different barcodes.') | |
| 36 parser.add_argument('-c', '--consensus', action='store_true', | |
| 37 help='Include consensus sequences in the output. They will appear the same as normal barcodes, ' | |
| 38 'but they will be printed before each set of barcodes matching a kmer. (So you can filter ' | |
| 39 'them out by looking for when either column 1 or 2 change, then discard the line after ' | |
| 40 'the change.') | |
| 41 parser.add_argument('-b', '--bar-len', type=int, | |
| 42 help='Barcode length. Default: %(default)s') | |
| 43 parser.add_argument('-w', '--win-len', type=int, | |
| 44 help='Window (k-mer) size. Default: %(default)s') | |
| 45 parser.add_argument('-s', '--shift', type=int, | |
| 46 help='Bases to shift the window (number of k-mers to check). Default: %(default)s') | |
| 47 parser.add_argument('-q', '--quiet', dest='loglevel', action='store_const', const=logging.CRITICAL) | |
| 48 parser.add_argument('-v', '--verbose', dest='loglevel', action='store_const', const=logging.INFO) | |
| 49 parser.add_argument('--debug', dest='loglevel', action='store_const', const=logging.DEBUG) | |
| 50 parser.add_argument('-p', '--processes', type=int, | |
| 51 help='Number of worker processes to use. Default: %(default)s') | |
| 52 | |
| 53 args = parser.parse_args(argv[1:]) | |
| 54 | |
| 55 assert args.processes > 0, '-p must be greater than zero' | |
| 56 logging.basicConfig(stream=sys.stderr, level=args.loglevel, format='%(message)s') | |
| 57 | |
| 58 starts = calc_starts(args.bar_len, args.win_len, args.shift) | |
| 59 | |
| 60 if args.infile: | |
| 61 infile = open(args.infile) | |
| 62 else: | |
| 63 infile = sys.stdin | |
| 64 | |
| 65 logging.info('Beginning to read in data.') | |
| 66 # For each window sequence (kmer), build a set of barcodes which contained it, in any of the shift | |
| 67 # positions. Do this for both halves of the barcode (independently, at the moment). | |
| 68 kmer_dicts = [{}, {}] | |
| 69 last_barcode = None | |
| 70 barcode_count = 0 | |
| 71 for line in infile: | |
| 72 fields = line.rstrip('\r\n').split('\t') | |
| 73 if len(fields) != 8: | |
| 74 logging.warn('Line contains incorrect number of fields.') | |
| 75 continue | |
| 76 barcode = fields[0] | |
| 77 # Only do it for each unique barcode (in the sorted output, there will be runs of lines with | |
| 78 # the same barcode). | |
| 79 if barcode == last_barcode: | |
| 80 continue | |
| 81 barcode_count += 1 | |
| 82 # for each half of the barcode | |
| 83 for kmer_dict, start in zip(kmer_dicts, starts): | |
| 84 # for each shift position (trying kmers at each of args.shift offsets) | |
| 85 for i in range(args.shift): | |
| 86 kmer = barcode[start+i:start+i+args.win_len] | |
| 87 kmer_set = kmer_dict.get(kmer, set()) | |
| 88 kmer_set.add(barcode) | |
| 89 kmer_dict[kmer] = kmer_set | |
| 90 last_barcode = barcode | |
| 91 if args.num_barcodes and barcode_count >= args.num_barcodes: | |
| 92 break | |
| 93 | |
| 94 if infile is not sys.stdin: | |
| 95 infile.close() | |
| 96 | |
| 97 workers = open_workers(args.processes) | |
| 98 | |
| 99 # Analyze the groups of barcodes that contained each kmer: | |
| 100 # Multiple sequence align all the barcodes in a each, call a consensus, then smith-waterman | |
| 101 # align each barcode to that consensus to measure their similarity to it. | |
| 102 run_num = 0 | |
| 103 for dict_num, kmer_dict in enumerate(kmer_dicts): | |
| 104 # Each half of the barcode (one dict per). | |
| 105 for kmer, barcodes_set in kmer_dict.items(): | |
| 106 # Each set of barcodes which share a kmer. | |
| 107 barcodes = list(barcodes_set) | |
| 108 results = delegate(workers, run_num, dict_num, kmer, barcodes) | |
| 109 if results: | |
| 110 process_results(*results, print_consensus=args.consensus) | |
| 111 run_num += 1 | |
| 112 | |
| 113 # Do one last loop through the workers, reading the remaining results and stopping them. | |
| 114 # Start at the worker after the last one processed by the previous loop. | |
| 115 start_i = (run_num + 1) % len(workers) | |
| 116 for i in range(len(workers)): | |
| 117 worker_i = (start_i + i) % args.processes | |
| 118 worker = workers[worker_i] | |
| 119 results = worker.recv() | |
| 120 if results: | |
| 121 process_results(*results, print_consensus=args.consensus) | |
| 122 worker.send(None) | |
| 123 | |
| 124 | |
| 125 def calc_starts(bar_len, win_len, shift): | |
| 126 half_len = bar_len//2 | |
| 127 assert win_len < half_len, 'Window length must be less than half the barcode length.' | |
| 128 # Place the window right in the middle of the first half of the barcode. | |
| 129 # Offset is where it should start. | |
| 130 offset = (half_len-win_len)/2 | |
| 131 # Move it backward by half the shift length so that the average kmer start is at the offset | |
| 132 # calculated above. | |
| 133 start1 = int(offset - shift/2) | |
| 134 start2 = start1 + half_len | |
| 135 return start1, start2 | |
| 136 | |
| 137 | |
| 138 def process_results(dict_num, kmer, consensus_seq, barcodes, similarities, print_consensus=False): | |
| 139 if print_consensus: | |
| 140 print(dict_num, kmer, consensus_seq, 1.0, sep='\t') | |
| 141 for barcode, similarity in zip(barcodes, similarities): | |
| 142 print(dict_num, kmer, barcode, similarity, sep='\t') | |
| 143 | |
| 144 | |
| 145 def open_workers(num_workers): | |
| 146 """Open the required number of worker processes.""" | |
| 147 workers = [] | |
| 148 for i in range(num_workers): | |
| 149 parent_pipe, child_pipe = multiprocessing.Pipe() | |
| 150 process = multiprocessing.Process(target=worker_function, args=(child_pipe,)) | |
| 151 process.start() | |
| 152 workers.append(parent_pipe) | |
| 153 return workers | |
| 154 | |
| 155 | |
| 156 def delegate(workers, run_num, dict_num, kmer, barcodes): | |
| 157 worker_i = run_num % len(workers) | |
| 158 worker = workers[worker_i] | |
| 159 if run_num >= len(workers): | |
| 160 logging.info('Parent: Trying to receive results from worker..') | |
| 161 results = worker.recv() | |
| 162 else: | |
| 163 results = None | |
| 164 args = (dict_num, kmer, barcodes) | |
| 165 logging.info('Parent: Sending new data to worker..') | |
| 166 worker.send(args) | |
| 167 return results | |
| 168 | |
| 169 | |
| 170 ##### HAPPENS IN CHILD PROCESSES ##### | |
| 171 | |
| 172 def worker_function(child_pipe): | |
| 173 while True: | |
| 174 # logging.info('Worker: Listening for new data from parent..') | |
| 175 args = child_pipe.recv() | |
| 176 if args is None: | |
| 177 break | |
| 178 # logging.info('Worker: Sending results back to parent..') | |
| 179 child_pipe.send(process_barcodes(*args)) | |
| 180 | |
| 181 | |
| 182 def process_barcodes(dict_num, kmer, barcodes): | |
| 183 """Perform a multiple sequence alignment on a set of barcodes and parse the result. | |
| 184 Uses MAFFT.""" | |
| 185 # If there's only one barcode, we don't have to do an alignment. | |
| 186 if len(barcodes) == 1: | |
| 187 return dict_num, kmer, barcodes[0], barcodes, [1.0] | |
| 188 with tempfile.NamedTemporaryFile('w', delete=False, prefix='align.msa.') as family_file: | |
| 189 for i, barcode in enumerate(barcodes): | |
| 190 family_file.write('>{}\n'.format(i)) | |
| 191 family_file.write(barcode+'\n') | |
| 192 with open(os.devnull, 'w') as devnull: | |
| 193 try: | |
| 194 command = ['mafft', '--nuc', '--quiet', family_file.name] | |
| 195 output = subprocess.check_output(command, stderr=devnull) | |
| 196 except (OSError, subprocess.CalledProcessError): | |
| 197 return None | |
| 198 os.remove(family_file.name) | |
| 199 alignment = read_fasta(output, upper=True) | |
| 200 consensus_seq = consensus.get_consensus(alignment) | |
| 201 similarities = [] | |
| 202 for barcode in barcodes: | |
| 203 similarities.append(get_similarity(consensus_seq, barcode)) | |
| 204 return dict_num, kmer, consensus_seq, barcodes, similarities | |
| 205 | |
| 206 | |
| 207 def read_fasta(fasta, upper=False): | |
| 208 """Quick and dirty FASTA parser. Return a list of the sequences only (no names).""" | |
| 209 sequences = [] | |
| 210 sequence = '' | |
| 211 for line in fasta.splitlines(): | |
| 212 if line.startswith('>'): | |
| 213 if upper: | |
| 214 sequence = sequence.upper() | |
| 215 if sequence: | |
| 216 sequences.append(sequence) | |
| 217 sequence = '' | |
| 218 continue | |
| 219 sequence += line.strip() | |
| 220 if upper: | |
| 221 sequence = sequence.upper() | |
| 222 if sequence: | |
| 223 sequences.append(sequence) | |
| 224 return sequences | |
| 225 | |
| 226 | |
| 227 def get_similarity(seq1, seq2): | |
| 228 align = swalign.smith_waterman(seq1, seq2) | |
| 229 logging.debug(align.target+'\n'+align.query) | |
| 230 return align.matches / len(align.query) | |
| 231 | |
| 232 | |
| 233 def fail(message): | |
| 234 sys.stderr.write(message+"\n") | |
| 235 sys.exit(1) | |
| 236 | |
| 237 if __name__ == '__main__': | |
| 238 sys.exit(main(sys.argv)) |
