comparison utils/correct-simple.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
comparison
equal deleted inserted replaced
17:836fa4fe9494 18:e4d75f9efb90
1 #!/usr/bin/env python
2 from __future__ import division
3 from __future__ import print_function
4 from __future__ import absolute_import
5 from __future__ import unicode_literals
6 import sys
7 import errno
8 import logging
9 import argparse
10 import subprocess
11
12 ARG_DEFAULTS = {'nbarcodes':20, 'mapq_thres':25, 'log':sys.stderr, 'volume':logging.ERROR}
13 DESCRIPTION = """"""
14
15
16 def main(argv):
17
18 parser = argparse.ArgumentParser(description=DESCRIPTION)
19 parser.set_defaults(**ARG_DEFAULTS)
20
21 parser.add_argument('nbarcodes', metavar='barcodes to try', type=int, nargs='?',
22 help='')
23 parser.add_argument('-s', '--summary', action='store_true',
24 help='Only print the summary of how many families were rescued.')
25 parser.add_argument('-m', '--mapq', type=int)
26 parser.add_argument('-r', '--random', action='store_true')
27 parser.add_argument('-l', '--log', type=argparse.FileType('w'),
28 help='Print log messages to this file instead of to stderr. Warning: Will overwrite the file.')
29 parser.add_argument('-q', '--quiet', dest='volume', action='store_const', const=logging.CRITICAL)
30 parser.add_argument('-v', '--verbose', dest='volume', action='store_const', const=logging.INFO)
31 parser.add_argument('-D', '--debug', dest='volume', action='store_const', const=logging.DEBUG)
32
33 args = parser.parse_args(argv[1:])
34
35 logging.basicConfig(stream=args.log, level=args.volume, format='%(message)s')
36 tone_down_logger()
37
38 logging.info('Reading random barcodes from border-families.txt..')
39 rand_arg = '--random-source=border-families.txt'
40 if args.random:
41 rand_arg = ''
42 pipeline = 'cat border-families.txt | paste - - | shuf {} | head -n {}'.format(rand_arg,
43 args.nbarcodes)
44 commands = [cmd.split() for cmd in pipeline.split('|')]
45 process = make_pipeline(*commands)
46 families_by_barcode = {}
47 for line_raw in process.stdout:
48 line = line_raw.rstrip('\r\n')
49 fields = line.split()
50 family = {}
51 count1, barcode, order1, count2, barcode2, order2 = fields
52 assert barcode == barcode2, (barcode, barcode2)
53 if order1 == 'ab':
54 assert order2 == 'ba', barcode
55 elif order1 == 'ba':
56 assert order2 == 'ab', barcode
57 count1, count2 = count2, count1
58 else:
59 fail(order1, order2, barcode)
60 family['count1'] = int(count1)
61 family['count2'] = int(count2)
62 family['barcode'] = barcode
63 families_by_barcode[barcode] = family
64
65 logging.info('Reading barcodes.fq to find read names..')
66 hits = 0
67 families_by_read_name = {}
68 line_num = 0
69 with open('barcodes.fq', 'rU') as barcodes_fq:
70 for line in barcodes_fq:
71 line_num += 1
72 if line_num % 4 == 1:
73 read_name = line[1:].rstrip('\r\n')
74 elif line_num % 4 == 2:
75 seq = line.rstrip('\r\n')
76 family = families_by_barcode.get(seq)
77 if family:
78 hits += 1
79 family['read_name'] = read_name
80 families_by_read_name[read_name] = family
81 logging.info('hits: {}'.format(hits))
82
83 logging.info('Reading barcodes.bam to find similar barcodes..')
84 hits = 0
85 neighbors_by_read_name = {}
86 # samtools view -f 256 barcodes.bam | awk '$1 == '$read_name' && $5 > 25 {print $3}'
87 process = subprocess.Popen(('samtools', 'view', '-f', '256', 'barcodes.bam'), stdout=subprocess.PIPE)
88 for line in process.stdout:
89 fields = line.split()
90 mapq = int(fields[4])
91 if mapq >= args.mapq_thres:
92 read_name = fields[0]
93 family = families_by_read_name.get(read_name)
94 if family:
95 hits += 1
96 read_name2 = fields[2]
97 neighbor = {'read_name':read_name2}
98 neighbors = family.get('neighbors', [])
99 neighbors.append(neighbor)
100 family['neighbors'] = neighbors
101 neighbors_by_read_name[read_name2] = neighbor
102 logging.info('hits: {}'.format(hits))
103
104 logging.info('Reading barcodes.fq to find sequences of neighbors..')
105 hits = 0
106 line_num = 0
107 neighbors_by_barcode = {}
108 with open('barcodes.fq', 'rU') as barcodes_fq:
109 for line in barcodes_fq:
110 line_num += 1
111 if line_num % 4 == 1:
112 read_name = line[1:].rstrip('\r\n')
113 neighbor = neighbors_by_read_name.get(read_name)
114 if line_num % 4 == 2 and neighbor:
115 seq = line.rstrip('\r\n')
116 neighbor['barcode'] = seq
117 neighbors_by_barcode[seq] = neighbor
118 logging.info('hits: {}'.format(hits))
119
120 logging.info('Reading families.uniq.txt to get counts of neighbors..')
121 hits = 0
122 with open('families.uniq.txt', 'rU') as families_uniq:
123 for line in families_uniq:
124 fields = line.split()
125 barcode = fields[1]
126 neighbor = neighbors_by_barcode.get(barcode)
127 if neighbor:
128 hits += 1
129 count = int(fields[0])
130 order = fields[2].rstrip('\r\n')
131 alpha = barcode[:len(seq)//2]
132 beta = barcode[len(seq)//2:]
133 swap = alpha >= beta
134 if (not swap and order == 'ab') or (swap and order == 'ba'):
135 neighbor['count1'] = count
136 neighbor['count2'] = 0
137 elif (not swap and order == 'ba') or (swap and order == 'ab'):
138 neighbor['count1'] = 0
139 neighbor['count2'] = count
140 else:
141 fail(order, barcode, swap)
142 logging.info('hits: {}'.format(hits))
143
144 logging.info('Printing results..')
145 total = 0
146 passing = 0
147 for family in families_by_barcode.values():
148 total += 1
149 count1 = family['count1']
150 count2 = family['count2']
151 if not args.summary:
152 print('{barcode}\t{count1}\t{count2}\t{read_name}'.format(**family))
153 neighbors = family.get('neighbors')
154 if neighbors:
155 for neighbor in neighbors:
156 if not args.summary:
157 print('{barcode}\t{count1}\t{count2}\t{read_name}'.format(**neighbor))
158 count1 += neighbor['count1']
159 count2 += neighbor['count2']
160 if count1 >= 3 and count2 >= 3:
161 if not args.summary:
162 print('PASS!')
163 passing += 1
164 elif not args.summary:
165 print('fail')
166
167 print('{} families rescued out of {} ({:0.2f}%)'.format(passing, total, 100*passing/total))
168
169
170 def make_pipeline(*commands):
171 processes = []
172 for command in commands:
173 if not processes:
174 processes.append(subprocess.Popen(command, stdout=subprocess.PIPE))
175 else:
176 processes.append(subprocess.Popen(command, stdin=processes[-1].stdout, stdout=subprocess.PIPE))
177 processes[0].stdout.close()
178 return processes[-1]
179
180
181 def tone_down_logger():
182 """Change the logging level names from all-caps to capitalized lowercase.
183 E.g. "WARNING" -> "Warning" (turn down the volume a bit in your log files)"""
184 for level in (logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG):
185 level_name = logging.getLevelName(level)
186 logging.addLevelName(level, level_name.capitalize())
187
188
189 def fail(message):
190 sys.stderr.write(message+"\n")
191 sys.exit(1)
192
193
194 if __name__ == '__main__':
195 try:
196 sys.exit(main(sys.argv))
197 except IOError as ioe:
198 if ioe.errno != errno.EPIPE:
199 raise