duplex: utils/fuzzy-match.py comparison

comparison utils/fuzzy-match.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty

author	nick
date	Thu, 02 Feb 2017 18:44:31 -0500
parents
children

comparison

equal deleted inserted replaced

-:836fa4fe9494
+:e4d75f9efb90
+#!/usr/bin/env python
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+import logging
+import argparse
+import tempfile
+import subprocess
+import multiprocessing
+import consensus
+import swalign
+ARG_DEFAULTS = {'bar_len':24, 'win_len':5, 'shift':3, 'processes':1, 'loglevel':logging.ERROR}
+USAGE = "%(prog)s [options]"
+DESCRIPTION = """Try to match barcodes with sequencing errors.
+Match based on a small window in the middle of each half of the barcode.
+Then it will align all the unique barcodes which match and then print the similarity of each to the
+consensus."""
+EPILOG = """This will print each kmer observed, the barcodes which contained it, and their
+similarities. The output is 4 tab-delimited columns: 1. whether the kmer was in the first or second
+half of the barcode (0 for first half, 1 for second) 2. the kmer 3. the barcode 4. its similarity to
+the consensus"""
+# Algorithm from Paul Medvedev (email from 2015-12-16)
+def main(argv):
+parser = argparse.ArgumentParser(description=DESCRIPTION)
+parser.set_defaults(**ARG_DEFAULTS)
+parser.add_argument('infile', metavar='families.tsv', nargs='?',
+help='Input file (sorted output of make-barcodes.awk).')
+parser.add_argument('-n', '--num-barcodes', type=int,
+help='Only read in this many different barcodes.')
+parser.add_argument('-c', '--consensus', action='store_true',
+help='Include consensus sequences in the output. They will appear the same as normal barcodes, '
+'but they will be printed before each set of barcodes matching a kmer. (So you can filter '
+'them out by looking for when either column 1 or 2 change, then discard the line after '
+'the change.')
+parser.add_argument('-b', '--bar-len', type=int,
+help='Barcode length. Default: %(default)s')
+parser.add_argument('-w', '--win-len', type=int,
+help='Window (k-mer) size. Default: %(default)s')
+parser.add_argument('-s', '--shift', type=int,
+help='Bases to shift the window (number of k-mers to check). Default: %(default)s')
+parser.add_argument('-q', '--quiet', dest='loglevel', action='store_const', const=logging.CRITICAL)
+parser.add_argument('-v', '--verbose', dest='loglevel', action='store_const', const=logging.INFO)
+parser.add_argument('--debug', dest='loglevel', action='store_const', const=logging.DEBUG)
+parser.add_argument('-p', '--processes', type=int,
+help='Number of worker processes to use. Default: %(default)s')
+args = parser.parse_args(argv[1:])
+assert args.processes > 0, '-p must be greater than zero'
+logging.basicConfig(stream=sys.stderr, level=args.loglevel, format='%(message)s')
+starts = calc_starts(args.bar_len, args.win_len, args.shift)
+if args.infile:
+infile = open(args.infile)
+else:
+infile = sys.stdin
+logging.info('Beginning to read in data.')
+# For each window sequence (kmer), build a set of barcodes which contained it, in any of the shift
+# positions. Do this for both halves of the barcode (independently, at the moment).
+kmer_dicts = [{}, {}]
+last_barcode = None
+barcode_count = 0
+for line in infile:
+fields = line.rstrip('\r\n').split('\t')
+if len(fields) != 8:
+logging.warn('Line contains incorrect number of fields.')
+continue
+barcode = fields[0]
+# Only do it for each unique barcode (in the sorted output, there will be runs of lines with
+# the same barcode).
+if barcode == last_barcode:
+continue
+barcode_count += 1
+# for each half of the barcode
+for kmer_dict, start in zip(kmer_dicts, starts):
+# for each shift position (trying kmers at each of args.shift offsets)
+for i in range(args.shift):
+kmer = barcode[start+i:start+i+args.win_len]
+kmer_set = kmer_dict.get(kmer, set())
+kmer_set.add(barcode)
+kmer_dict[kmer] = kmer_set
+last_barcode = barcode
+if args.num_barcodes and barcode_count >= args.num_barcodes:
+break
+if infile is not sys.stdin:
+infile.close()
+workers = open_workers(args.processes)
+# Analyze the groups of barcodes that contained each kmer:
+# Multiple sequence align all the barcodes in a each, call a consensus, then smith-waterman
+# align each barcode to that consensus to measure their similarity to it.
+run_num = 0
+for dict_num, kmer_dict in enumerate(kmer_dicts):
+# Each half of the barcode (one dict per).
+for kmer, barcodes_set in kmer_dict.items():
+# Each set of barcodes which share a kmer.
+barcodes = list(barcodes_set)
+results = delegate(workers, run_num, dict_num, kmer, barcodes)
+if results:
+process_results(*results, print_consensus=args.consensus)
+run_num += 1
+# Do one last loop through the workers, reading the remaining results and stopping them.
+# Start at the worker after the last one processed by the previous loop.
+start_i = (run_num + 1) % len(workers)
+for i in range(len(workers)):
+worker_i = (start_i + i) % args.processes
+worker = workers[worker_i]
+results = worker.recv()
+if results:
+process_results(*results, print_consensus=args.consensus)
+worker.send(None)
+def calc_starts(bar_len, win_len, shift):
+half_len = bar_len//2
+assert win_len < half_len, 'Window length must be less than half the barcode length.'
+# Place the window right in the middle of the first half of the barcode.
+# Offset is where it should start.
+offset = (half_len-win_len)/2
+# Move it backward by half the shift length so that the average kmer start is at the offset
+# calculated above.
+start1 = int(offset - shift/2)
+start2 = start1 + half_len
+return start1, start2
+def process_results(dict_num, kmer, consensus_seq, barcodes, similarities, print_consensus=False):
+if print_consensus:
+print(dict_num, kmer, consensus_seq, 1.0, sep='\t')
+for barcode, similarity in zip(barcodes, similarities):
+print(dict_num, kmer, barcode, similarity, sep='\t')
+def open_workers(num_workers):
+"""Open the required number of worker processes."""
+workers = []
+for i in range(num_workers):
+parent_pipe, child_pipe = multiprocessing.Pipe()
+process = multiprocessing.Process(target=worker_function, args=(child_pipe,))
+process.start()
+workers.append(parent_pipe)
+return workers
+def delegate(workers, run_num, dict_num, kmer, barcodes):
+worker_i = run_num % len(workers)
+worker = workers[worker_i]
+if run_num >= len(workers):
+logging.info('Parent: Trying to receive results from worker..')
+results = worker.recv()
+else:
+results = None
+args = (dict_num, kmer, barcodes)
+logging.info('Parent: Sending new data to worker..')
+worker.send(args)
+return results
+##### HAPPENS IN CHILD PROCESSES #####
+def worker_function(child_pipe):
+while True:
+# logging.info('Worker: Listening for new data from parent..')
+args = child_pipe.recv()
+if args is None:
+break
+# logging.info('Worker: Sending results back to parent..')
+child_pipe.send(process_barcodes(*args))
+def process_barcodes(dict_num, kmer, barcodes):
+"""Perform a multiple sequence alignment on a set of barcodes and parse the result.
+Uses MAFFT."""
+# If there's only one barcode, we don't have to do an alignment.
+if len(barcodes) == 1:
+return dict_num, kmer, barcodes[0], barcodes, [1.0]
+with tempfile.NamedTemporaryFile('w', delete=False, prefix='align.msa.') as family_file:
+for i, barcode in enumerate(barcodes):
+family_file.write('>{}\n'.format(i))
+family_file.write(barcode+'\n')
+with open(os.devnull, 'w') as devnull:
+try:
+command = ['mafft', '--nuc', '--quiet', family_file.name]
+output = subprocess.check_output(command, stderr=devnull)
+except (OSError, subprocess.CalledProcessError):
+return None
+os.remove(family_file.name)
+alignment = read_fasta(output, upper=True)
+consensus_seq = consensus.get_consensus(alignment)
+similarities = []
+for barcode in barcodes:
+similarities.append(get_similarity(consensus_seq, barcode))
+return dict_num, kmer, consensus_seq, barcodes, similarities
+def read_fasta(fasta, upper=False):
+"""Quick and dirty FASTA parser. Return a list of the sequences only (no names)."""
+sequences = []
+sequence = ''
+for line in fasta.splitlines():
+if line.startswith('>'):
+if upper:
+sequence = sequence.upper()
+if sequence:
+sequences.append(sequence)
+sequence = ''
+continue
+sequence += line.strip()
+if upper:
+sequence = sequence.upper()
+if sequence:
+sequences.append(sequence)
+return sequences
+def get_similarity(seq1, seq2):
+align = swalign.smith_waterman(seq1, seq2)
+logging.debug(align.target+'\n'+align.query)
+return align.matches / len(align.query)
+def fail(message):
+sys.stderr.write(message+"\n")
+sys.exit(1)
+if __name__ == '__main__':
+sys.exit(main(sys.argv))

Mercurial > repos > nick > duplex

comparison utils/fuzzy-match.py @ 18:e4d75f9efb90 draft