Mercurial > repos > nick > duplex
view utils/outconv.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python from __future__ import division from __future__ import print_function from __future__ import absolute_import from __future__ import unicode_literals import sys import argparse ARG_DEFAULTS = {'input':sys.stdin} USAGE = "%(prog)s [options]" DESCRIPTION = """Split interleaved outputs of dunovo.py into pairs of paired-end files.""" def main(argv): parser = argparse.ArgumentParser(description=DESCRIPTION) parser.set_defaults(**ARG_DEFAULTS) parser.add_argument('input', metavar='consensuses.fa', type=argparse.FileType('r'), help='Interleaved consensus reads (DCS or SSCS) from dunovo.py.') parser.add_argument('-1', '--out1', metavar='out_1.fa', type=argparse.FileType('w'), required=True, help='Output filename for mate 1 reads. CAUTION: Will overwrite any existing file.') parser.add_argument('-2', '--out2', metavar='out_2.fa', type=argparse.FileType('w'), required=True, help='Output filename for mate 2 reads. CAUTION: Will overwrite any existing file.') args = parser.parse_args(argv[1:]) """ SSCS: >AAAAAAAAAAAACTAAAATACAAA.ab.1 15 ACTGATGAAAAGGCTGTTATTGTATCTGATGTGTAGTGTATGGCTAAGAAAAGACCTGTAATGATTTGGACTATTAGGCAGACTCCTAGAAGGGACCCAA >AAAAAAAAAAAACTAAAATACAAA.ab.2 15 ATCCAAACACAACCAACATCCCCCCTAAATAAATTAAAAAAACTATTAAACCTAAAAACGATCCACCAAACCCTAAAACCATTAAACAACCAACAAACCC >AAAAAAAAAAAAGACCACGTTTCT.ab.1 15 TGGAATTCAGCCTACTAGCAATTATCCCCATACTAATCAACAAAAAAAACCCACGATCAACTGAAGCAGCAACAAAATACTTCGTCACACAAGCAACAGC >AAAAAAAAAAAAGACCACGTTTCT.ba.2 18 TGGAATTCAGCCTACTAGCAATTATCCCCATACTAATCAACAAAAAAAACCCACGATCAACTGAAGCAGCAACAAAATACTTCGTCACACAAGCAACAGC >AAAAAAAAAAAAGACCACGTTTCT.ab.2 15 GTAGAGTTGAGTAGCGGGTAAATTTGAATTAAAATTGATAGGGGAGCAATTTTTTGTCATGTAAGAAGAATAAGTCCTATGTGCAGTGGGATCCCTTGAG >AAAAAAAAAAAAGACCACGTTTCT.ba.1 18 GTAGAGTTGAGTAGCGGGTAAATTTGAATTAAAATTGATAGGGGAGCAATTTTTTGTCATGTAAGAAGAATAAGTCCTATGTGCAGTGGGATCCCTTGAG DCS: >AAAAAAAACACCAAATACGCCTAC.1 28-39 GCTATGAATATAGGGGCTGTAAGAATAATATAGATTATGAGGTTGAGTAGAGTGAGGGATGGGTTGTAAGGAAGAATTGCTAATATTCATCCTATGTGGG >AAAAAAAACACCAAATACGCCTAC.2 28-39 TACTTCGTCACACAAGCAACAGCCTCAATAATTATCCTCCTGGCCATCGTACTCAACTATAAACAACTAGGAACATGAATATTTCAACAACAAACAAACG """ intype = None line_num = 0 sscs_buffer = {} for line_raw in args.input: line_num += 1 line = line_raw.rstrip('\r\n') if line.startswith('>'): # We're in a header line. barcode, strand, mate, famsizes = parse_header(line) if intype is None: if strand is None: intype = 'DCS' else: intype = 'SSCS' else: # We're in a sequence line. seq = line if intype == 'DCS': if mate == '1': args.out1.write('>{} {}\n{}\n'.format(barcode, famsizes, seq)) elif mate == '2': args.out2.write('>{} {}\n{}\n'.format(barcode, famsizes, seq)) else: fail('Error: Invalid mate "{}" on line {}.'.format(mate, line_num)) elif intype == 'SSCS': # Need to get SSCS in correct order. # Collect reads until they're properly paired, then print both. if (barcode, strand) in sscs_buffer: stored_read = sscs_buffer[(barcode, strand)] new_read = {'mate':mate, 'famsizes':famsizes, 'seq':seq} if stored_read['mate'] == '1' and mate == '2': read1, read2 = stored_read, new_read elif stored_read['mate'] == '2' and mate == '1': read1, read2 = new_read, stored_read args.out1.write('>{}.{} {famsizes}\n{seq}\n'.format(barcode, strand, **read1)) args.out2.write('>{}.{} {famsizes}\n{seq}\n'.format(barcode, strand, **read2)) del sscs_buffer[(barcode, strand)] else: sscs_buffer[(barcode, strand)] = {'mate':mate, 'famsizes':famsizes, 'seq':seq} def parse_header(header): read_id, famsizes = header.split() id_fields = read_id[1:].split('.') barcode = id_fields[0] strand = None mate = id_fields[-1] if len(id_fields) == 3: strand = id_fields[1] return barcode, strand, mate, famsizes def fail(message): sys.stderr.write(message+"\n") sys.exit(1) if __name__ == '__main__': sys.exit(main(sys.argv))