Mercurial > repos > nick > duplex
diff utils/sim-label.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/sim-label.py Thu Feb 02 18:44:31 2017 -0500 @@ -0,0 +1,78 @@ +#!/usr/bin/env python +from __future__ import division +from __future__ import print_function +import sys +import argparse +import fastareader + +ARG_DEFAULTS = {} +USAGE = "%(prog)s [options]" +DESCRIPTION = """Convert the results of the duplex pipeline on simulated data to a single-line tsv +format, and label them with their original fragments.""" + + +def main(argv): + + parser = argparse.ArgumentParser(description=DESCRIPTION) + parser.set_defaults(**ARG_DEFAULTS) + + parser.add_argument('reads', type=fastareader.FastaReadGenerator, + help='Output fasta file from duplex pipeline. This script depends on their names being the ' + 'exact format produced by sim.py.') + parser.add_argument('families', type=argparse.FileType('r'), + help='Families tsv file (output of make-barcodes.awk).') + + args = parser.parse_args(argv[1:]) + + bars_to_frags = {} + for line in args.families: + barcode, order, read_name = line.rstrip('\r\n').split('\t')[:3] + if read_name.startswith('@'): + read_name = read_name[1:] + chrom, frag_id, read_num = read_name.split('-') + bars_to_frags[barcode] = (chrom, frag_id) + + reads = iter(args.reads) + while True: + try: + read = next(reads) + except StopIteration: + break + barcode = read.id + try: + chrom, frag_id = bars_to_frags[barcode] + except KeyError: + sys.stderr.write('Missing barcode: {}\n'.format(barcode)) + continue + try: + frag_num = int(frag_id, 16) + except ValueError: + sys.stderr.write('Invalid fragment id: {}\n'.format(frag_id)) + continue + fam1size, fam2size = get_famsizes(read.name) + print(chrom, frag_num, frag_id, barcode, fam1size, fam2size, read.seq, sep='\t') + + +def get_famsizes(read_name): + try: + faminfo = read_name.split()[1] + except IndexError: + faminfo = '' + famsizes = faminfo.split('-') + try: + fam1size = famsizes[0] + except IndexError: + fam1size = '' + try: + fam2size = famsizes[1] + except IndexError: + fam2size = '' + return fam1size, fam2size + + +def fail(message): + sys.stderr.write(message+"\n") + sys.exit(1) + +if __name__ == '__main__': + sys.exit(main(sys.argv))