Mercurial > repos > nick > duplex
comparison utils/outconv.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:836fa4fe9494 | 18:e4d75f9efb90 |
---|---|
1 #!/usr/bin/env python | |
2 from __future__ import division | |
3 from __future__ import print_function | |
4 from __future__ import absolute_import | |
5 from __future__ import unicode_literals | |
6 import sys | |
7 import argparse | |
8 | |
9 ARG_DEFAULTS = {'input':sys.stdin} | |
10 USAGE = "%(prog)s [options]" | |
11 DESCRIPTION = """Split interleaved outputs of dunovo.py into pairs of paired-end files.""" | |
12 | |
13 | |
14 def main(argv): | |
15 | |
16 parser = argparse.ArgumentParser(description=DESCRIPTION) | |
17 parser.set_defaults(**ARG_DEFAULTS) | |
18 | |
19 parser.add_argument('input', metavar='consensuses.fa', type=argparse.FileType('r'), | |
20 help='Interleaved consensus reads (DCS or SSCS) from dunovo.py.') | |
21 parser.add_argument('-1', '--out1', metavar='out_1.fa', type=argparse.FileType('w'), required=True, | |
22 help='Output filename for mate 1 reads. CAUTION: Will overwrite any existing file.') | |
23 parser.add_argument('-2', '--out2', metavar='out_2.fa', type=argparse.FileType('w'), required=True, | |
24 help='Output filename for mate 2 reads. CAUTION: Will overwrite any existing file.') | |
25 | |
26 args = parser.parse_args(argv[1:]) | |
27 | |
28 """ | |
29 SSCS: | |
30 >AAAAAAAAAAAACTAAAATACAAA.ab.1 15 | |
31 ACTGATGAAAAGGCTGTTATTGTATCTGATGTGTAGTGTATGGCTAAGAAAAGACCTGTAATGATTTGGACTATTAGGCAGACTCCTAGAAGGGACCCAA | |
32 >AAAAAAAAAAAACTAAAATACAAA.ab.2 15 | |
33 ATCCAAACACAACCAACATCCCCCCTAAATAAATTAAAAAAACTATTAAACCTAAAAACGATCCACCAAACCCTAAAACCATTAAACAACCAACAAACCC | |
34 >AAAAAAAAAAAAGACCACGTTTCT.ab.1 15 | |
35 TGGAATTCAGCCTACTAGCAATTATCCCCATACTAATCAACAAAAAAAACCCACGATCAACTGAAGCAGCAACAAAATACTTCGTCACACAAGCAACAGC | |
36 >AAAAAAAAAAAAGACCACGTTTCT.ba.2 18 | |
37 TGGAATTCAGCCTACTAGCAATTATCCCCATACTAATCAACAAAAAAAACCCACGATCAACTGAAGCAGCAACAAAATACTTCGTCACACAAGCAACAGC | |
38 >AAAAAAAAAAAAGACCACGTTTCT.ab.2 15 | |
39 GTAGAGTTGAGTAGCGGGTAAATTTGAATTAAAATTGATAGGGGAGCAATTTTTTGTCATGTAAGAAGAATAAGTCCTATGTGCAGTGGGATCCCTTGAG | |
40 >AAAAAAAAAAAAGACCACGTTTCT.ba.1 18 | |
41 GTAGAGTTGAGTAGCGGGTAAATTTGAATTAAAATTGATAGGGGAGCAATTTTTTGTCATGTAAGAAGAATAAGTCCTATGTGCAGTGGGATCCCTTGAG | |
42 | |
43 DCS: | |
44 >AAAAAAAACACCAAATACGCCTAC.1 28-39 | |
45 GCTATGAATATAGGGGCTGTAAGAATAATATAGATTATGAGGTTGAGTAGAGTGAGGGATGGGTTGTAAGGAAGAATTGCTAATATTCATCCTATGTGGG | |
46 >AAAAAAAACACCAAATACGCCTAC.2 28-39 | |
47 TACTTCGTCACACAAGCAACAGCCTCAATAATTATCCTCCTGGCCATCGTACTCAACTATAAACAACTAGGAACATGAATATTTCAACAACAAACAAACG | |
48 """ | |
49 | |
50 intype = None | |
51 line_num = 0 | |
52 sscs_buffer = {} | |
53 for line_raw in args.input: | |
54 line_num += 1 | |
55 line = line_raw.rstrip('\r\n') | |
56 if line.startswith('>'): | |
57 # We're in a header line. | |
58 barcode, strand, mate, famsizes = parse_header(line) | |
59 if intype is None: | |
60 if strand is None: | |
61 intype = 'DCS' | |
62 else: | |
63 intype = 'SSCS' | |
64 else: | |
65 # We're in a sequence line. | |
66 seq = line | |
67 if intype == 'DCS': | |
68 if mate == '1': | |
69 args.out1.write('>{} {}\n{}\n'.format(barcode, famsizes, seq)) | |
70 elif mate == '2': | |
71 args.out2.write('>{} {}\n{}\n'.format(barcode, famsizes, seq)) | |
72 else: | |
73 fail('Error: Invalid mate "{}" on line {}.'.format(mate, line_num)) | |
74 elif intype == 'SSCS': | |
75 # Need to get SSCS in correct order. | |
76 # Collect reads until they're properly paired, then print both. | |
77 if (barcode, strand) in sscs_buffer: | |
78 stored_read = sscs_buffer[(barcode, strand)] | |
79 new_read = {'mate':mate, 'famsizes':famsizes, 'seq':seq} | |
80 if stored_read['mate'] == '1' and mate == '2': | |
81 read1, read2 = stored_read, new_read | |
82 elif stored_read['mate'] == '2' and mate == '1': | |
83 read1, read2 = new_read, stored_read | |
84 args.out1.write('>{}.{} {famsizes}\n{seq}\n'.format(barcode, strand, **read1)) | |
85 args.out2.write('>{}.{} {famsizes}\n{seq}\n'.format(barcode, strand, **read2)) | |
86 del sscs_buffer[(barcode, strand)] | |
87 else: | |
88 sscs_buffer[(barcode, strand)] = {'mate':mate, 'famsizes':famsizes, 'seq':seq} | |
89 | |
90 | |
91 def parse_header(header): | |
92 read_id, famsizes = header.split() | |
93 id_fields = read_id[1:].split('.') | |
94 barcode = id_fields[0] | |
95 strand = None | |
96 mate = id_fields[-1] | |
97 if len(id_fields) == 3: | |
98 strand = id_fields[1] | |
99 return barcode, strand, mate, famsizes | |
100 | |
101 | |
102 def fail(message): | |
103 sys.stderr.write(message+"\n") | |
104 sys.exit(1) | |
105 | |
106 if __name__ == '__main__': | |
107 sys.exit(main(sys.argv)) |