Mercurial > repos > nick > duplex
annotate utils/sim-label.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
rev | line source |
---|---|
18
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
1 #!/usr/bin/env python |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
2 from __future__ import division |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
3 from __future__ import print_function |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
4 import sys |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
5 import argparse |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
6 import fastareader |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
7 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
8 ARG_DEFAULTS = {} |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
9 USAGE = "%(prog)s [options]" |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
10 DESCRIPTION = """Convert the results of the duplex pipeline on simulated data to a single-line tsv |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
11 format, and label them with their original fragments.""" |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
12 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
13 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
14 def main(argv): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
15 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
16 parser = argparse.ArgumentParser(description=DESCRIPTION) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
17 parser.set_defaults(**ARG_DEFAULTS) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
18 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
19 parser.add_argument('reads', type=fastareader.FastaReadGenerator, |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
20 help='Output fasta file from duplex pipeline. This script depends on their names being the ' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
21 'exact format produced by sim.py.') |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
22 parser.add_argument('families', type=argparse.FileType('r'), |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
23 help='Families tsv file (output of make-barcodes.awk).') |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
24 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
25 args = parser.parse_args(argv[1:]) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
26 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
27 bars_to_frags = {} |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
28 for line in args.families: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
29 barcode, order, read_name = line.rstrip('\r\n').split('\t')[:3] |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
30 if read_name.startswith('@'): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
31 read_name = read_name[1:] |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
32 chrom, frag_id, read_num = read_name.split('-') |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
33 bars_to_frags[barcode] = (chrom, frag_id) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
34 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
35 reads = iter(args.reads) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
36 while True: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
37 try: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
38 read = next(reads) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
39 except StopIteration: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
40 break |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
41 barcode = read.id |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
42 try: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
43 chrom, frag_id = bars_to_frags[barcode] |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
44 except KeyError: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
45 sys.stderr.write('Missing barcode: {}\n'.format(barcode)) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
46 continue |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
47 try: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
48 frag_num = int(frag_id, 16) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
49 except ValueError: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
50 sys.stderr.write('Invalid fragment id: {}\n'.format(frag_id)) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
51 continue |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
52 fam1size, fam2size = get_famsizes(read.name) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
53 print(chrom, frag_num, frag_id, barcode, fam1size, fam2size, read.seq, sep='\t') |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
54 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
55 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
56 def get_famsizes(read_name): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
57 try: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
58 faminfo = read_name.split()[1] |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
59 except IndexError: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
60 faminfo = '' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
61 famsizes = faminfo.split('-') |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
62 try: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
63 fam1size = famsizes[0] |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
64 except IndexError: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
65 fam1size = '' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
66 try: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
67 fam2size = famsizes[1] |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
68 except IndexError: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
69 fam2size = '' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
70 return fam1size, fam2size |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
71 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
72 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
73 def fail(message): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
74 sys.stderr.write(message+"\n") |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
75 sys.exit(1) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
76 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
77 if __name__ == '__main__': |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
78 sys.exit(main(sys.argv)) |