Mercurial > repos > nick > duplex
comparison utils/outconv.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
| author | nick |
|---|---|
| date | Thu, 02 Feb 2017 18:44:31 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:836fa4fe9494 | 18:e4d75f9efb90 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 from __future__ import division | |
| 3 from __future__ import print_function | |
| 4 from __future__ import absolute_import | |
| 5 from __future__ import unicode_literals | |
| 6 import sys | |
| 7 import argparse | |
| 8 | |
| 9 ARG_DEFAULTS = {'input':sys.stdin} | |
| 10 USAGE = "%(prog)s [options]" | |
| 11 DESCRIPTION = """Split interleaved outputs of dunovo.py into pairs of paired-end files.""" | |
| 12 | |
| 13 | |
| 14 def main(argv): | |
| 15 | |
| 16 parser = argparse.ArgumentParser(description=DESCRIPTION) | |
| 17 parser.set_defaults(**ARG_DEFAULTS) | |
| 18 | |
| 19 parser.add_argument('input', metavar='consensuses.fa', type=argparse.FileType('r'), | |
| 20 help='Interleaved consensus reads (DCS or SSCS) from dunovo.py.') | |
| 21 parser.add_argument('-1', '--out1', metavar='out_1.fa', type=argparse.FileType('w'), required=True, | |
| 22 help='Output filename for mate 1 reads. CAUTION: Will overwrite any existing file.') | |
| 23 parser.add_argument('-2', '--out2', metavar='out_2.fa', type=argparse.FileType('w'), required=True, | |
| 24 help='Output filename for mate 2 reads. CAUTION: Will overwrite any existing file.') | |
| 25 | |
| 26 args = parser.parse_args(argv[1:]) | |
| 27 | |
| 28 """ | |
| 29 SSCS: | |
| 30 >AAAAAAAAAAAACTAAAATACAAA.ab.1 15 | |
| 31 ACTGATGAAAAGGCTGTTATTGTATCTGATGTGTAGTGTATGGCTAAGAAAAGACCTGTAATGATTTGGACTATTAGGCAGACTCCTAGAAGGGACCCAA | |
| 32 >AAAAAAAAAAAACTAAAATACAAA.ab.2 15 | |
| 33 ATCCAAACACAACCAACATCCCCCCTAAATAAATTAAAAAAACTATTAAACCTAAAAACGATCCACCAAACCCTAAAACCATTAAACAACCAACAAACCC | |
| 34 >AAAAAAAAAAAAGACCACGTTTCT.ab.1 15 | |
| 35 TGGAATTCAGCCTACTAGCAATTATCCCCATACTAATCAACAAAAAAAACCCACGATCAACTGAAGCAGCAACAAAATACTTCGTCACACAAGCAACAGC | |
| 36 >AAAAAAAAAAAAGACCACGTTTCT.ba.2 18 | |
| 37 TGGAATTCAGCCTACTAGCAATTATCCCCATACTAATCAACAAAAAAAACCCACGATCAACTGAAGCAGCAACAAAATACTTCGTCACACAAGCAACAGC | |
| 38 >AAAAAAAAAAAAGACCACGTTTCT.ab.2 15 | |
| 39 GTAGAGTTGAGTAGCGGGTAAATTTGAATTAAAATTGATAGGGGAGCAATTTTTTGTCATGTAAGAAGAATAAGTCCTATGTGCAGTGGGATCCCTTGAG | |
| 40 >AAAAAAAAAAAAGACCACGTTTCT.ba.1 18 | |
| 41 GTAGAGTTGAGTAGCGGGTAAATTTGAATTAAAATTGATAGGGGAGCAATTTTTTGTCATGTAAGAAGAATAAGTCCTATGTGCAGTGGGATCCCTTGAG | |
| 42 | |
| 43 DCS: | |
| 44 >AAAAAAAACACCAAATACGCCTAC.1 28-39 | |
| 45 GCTATGAATATAGGGGCTGTAAGAATAATATAGATTATGAGGTTGAGTAGAGTGAGGGATGGGTTGTAAGGAAGAATTGCTAATATTCATCCTATGTGGG | |
| 46 >AAAAAAAACACCAAATACGCCTAC.2 28-39 | |
| 47 TACTTCGTCACACAAGCAACAGCCTCAATAATTATCCTCCTGGCCATCGTACTCAACTATAAACAACTAGGAACATGAATATTTCAACAACAAACAAACG | |
| 48 """ | |
| 49 | |
| 50 intype = None | |
| 51 line_num = 0 | |
| 52 sscs_buffer = {} | |
| 53 for line_raw in args.input: | |
| 54 line_num += 1 | |
| 55 line = line_raw.rstrip('\r\n') | |
| 56 if line.startswith('>'): | |
| 57 # We're in a header line. | |
| 58 barcode, strand, mate, famsizes = parse_header(line) | |
| 59 if intype is None: | |
| 60 if strand is None: | |
| 61 intype = 'DCS' | |
| 62 else: | |
| 63 intype = 'SSCS' | |
| 64 else: | |
| 65 # We're in a sequence line. | |
| 66 seq = line | |
| 67 if intype == 'DCS': | |
| 68 if mate == '1': | |
| 69 args.out1.write('>{} {}\n{}\n'.format(barcode, famsizes, seq)) | |
| 70 elif mate == '2': | |
| 71 args.out2.write('>{} {}\n{}\n'.format(barcode, famsizes, seq)) | |
| 72 else: | |
| 73 fail('Error: Invalid mate "{}" on line {}.'.format(mate, line_num)) | |
| 74 elif intype == 'SSCS': | |
| 75 # Need to get SSCS in correct order. | |
| 76 # Collect reads until they're properly paired, then print both. | |
| 77 if (barcode, strand) in sscs_buffer: | |
| 78 stored_read = sscs_buffer[(barcode, strand)] | |
| 79 new_read = {'mate':mate, 'famsizes':famsizes, 'seq':seq} | |
| 80 if stored_read['mate'] == '1' and mate == '2': | |
| 81 read1, read2 = stored_read, new_read | |
| 82 elif stored_read['mate'] == '2' and mate == '1': | |
| 83 read1, read2 = new_read, stored_read | |
| 84 args.out1.write('>{}.{} {famsizes}\n{seq}\n'.format(barcode, strand, **read1)) | |
| 85 args.out2.write('>{}.{} {famsizes}\n{seq}\n'.format(barcode, strand, **read2)) | |
| 86 del sscs_buffer[(barcode, strand)] | |
| 87 else: | |
| 88 sscs_buffer[(barcode, strand)] = {'mate':mate, 'famsizes':famsizes, 'seq':seq} | |
| 89 | |
| 90 | |
| 91 def parse_header(header): | |
| 92 read_id, famsizes = header.split() | |
| 93 id_fields = read_id[1:].split('.') | |
| 94 barcode = id_fields[0] | |
| 95 strand = None | |
| 96 mate = id_fields[-1] | |
| 97 if len(id_fields) == 3: | |
| 98 strand = id_fields[1] | |
| 99 return barcode, strand, mate, famsizes | |
| 100 | |
| 101 | |
| 102 def fail(message): | |
| 103 sys.stderr.write(message+"\n") | |
| 104 sys.exit(1) | |
| 105 | |
| 106 if __name__ == '__main__': | |
| 107 sys.exit(main(sys.argv)) |
