Mercurial > repos > nick > duplex
comparison utils/correct-simple.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
| author | nick |
|---|---|
| date | Thu, 02 Feb 2017 18:44:31 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:836fa4fe9494 | 18:e4d75f9efb90 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 from __future__ import division | |
| 3 from __future__ import print_function | |
| 4 from __future__ import absolute_import | |
| 5 from __future__ import unicode_literals | |
| 6 import sys | |
| 7 import errno | |
| 8 import logging | |
| 9 import argparse | |
| 10 import subprocess | |
| 11 | |
| 12 ARG_DEFAULTS = {'nbarcodes':20, 'mapq_thres':25, 'log':sys.stderr, 'volume':logging.ERROR} | |
| 13 DESCRIPTION = """""" | |
| 14 | |
| 15 | |
| 16 def main(argv): | |
| 17 | |
| 18 parser = argparse.ArgumentParser(description=DESCRIPTION) | |
| 19 parser.set_defaults(**ARG_DEFAULTS) | |
| 20 | |
| 21 parser.add_argument('nbarcodes', metavar='barcodes to try', type=int, nargs='?', | |
| 22 help='') | |
| 23 parser.add_argument('-s', '--summary', action='store_true', | |
| 24 help='Only print the summary of how many families were rescued.') | |
| 25 parser.add_argument('-m', '--mapq', type=int) | |
| 26 parser.add_argument('-r', '--random', action='store_true') | |
| 27 parser.add_argument('-l', '--log', type=argparse.FileType('w'), | |
| 28 help='Print log messages to this file instead of to stderr. Warning: Will overwrite the file.') | |
| 29 parser.add_argument('-q', '--quiet', dest='volume', action='store_const', const=logging.CRITICAL) | |
| 30 parser.add_argument('-v', '--verbose', dest='volume', action='store_const', const=logging.INFO) | |
| 31 parser.add_argument('-D', '--debug', dest='volume', action='store_const', const=logging.DEBUG) | |
| 32 | |
| 33 args = parser.parse_args(argv[1:]) | |
| 34 | |
| 35 logging.basicConfig(stream=args.log, level=args.volume, format='%(message)s') | |
| 36 tone_down_logger() | |
| 37 | |
| 38 logging.info('Reading random barcodes from border-families.txt..') | |
| 39 rand_arg = '--random-source=border-families.txt' | |
| 40 if args.random: | |
| 41 rand_arg = '' | |
| 42 pipeline = 'cat border-families.txt | paste - - | shuf {} | head -n {}'.format(rand_arg, | |
| 43 args.nbarcodes) | |
| 44 commands = [cmd.split() for cmd in pipeline.split('|')] | |
| 45 process = make_pipeline(*commands) | |
| 46 families_by_barcode = {} | |
| 47 for line_raw in process.stdout: | |
| 48 line = line_raw.rstrip('\r\n') | |
| 49 fields = line.split() | |
| 50 family = {} | |
| 51 count1, barcode, order1, count2, barcode2, order2 = fields | |
| 52 assert barcode == barcode2, (barcode, barcode2) | |
| 53 if order1 == 'ab': | |
| 54 assert order2 == 'ba', barcode | |
| 55 elif order1 == 'ba': | |
| 56 assert order2 == 'ab', barcode | |
| 57 count1, count2 = count2, count1 | |
| 58 else: | |
| 59 fail(order1, order2, barcode) | |
| 60 family['count1'] = int(count1) | |
| 61 family['count2'] = int(count2) | |
| 62 family['barcode'] = barcode | |
| 63 families_by_barcode[barcode] = family | |
| 64 | |
| 65 logging.info('Reading barcodes.fq to find read names..') | |
| 66 hits = 0 | |
| 67 families_by_read_name = {} | |
| 68 line_num = 0 | |
| 69 with open('barcodes.fq', 'rU') as barcodes_fq: | |
| 70 for line in barcodes_fq: | |
| 71 line_num += 1 | |
| 72 if line_num % 4 == 1: | |
| 73 read_name = line[1:].rstrip('\r\n') | |
| 74 elif line_num % 4 == 2: | |
| 75 seq = line.rstrip('\r\n') | |
| 76 family = families_by_barcode.get(seq) | |
| 77 if family: | |
| 78 hits += 1 | |
| 79 family['read_name'] = read_name | |
| 80 families_by_read_name[read_name] = family | |
| 81 logging.info('hits: {}'.format(hits)) | |
| 82 | |
| 83 logging.info('Reading barcodes.bam to find similar barcodes..') | |
| 84 hits = 0 | |
| 85 neighbors_by_read_name = {} | |
| 86 # samtools view -f 256 barcodes.bam | awk '$1 == '$read_name' && $5 > 25 {print $3}' | |
| 87 process = subprocess.Popen(('samtools', 'view', '-f', '256', 'barcodes.bam'), stdout=subprocess.PIPE) | |
| 88 for line in process.stdout: | |
| 89 fields = line.split() | |
| 90 mapq = int(fields[4]) | |
| 91 if mapq >= args.mapq_thres: | |
| 92 read_name = fields[0] | |
| 93 family = families_by_read_name.get(read_name) | |
| 94 if family: | |
| 95 hits += 1 | |
| 96 read_name2 = fields[2] | |
| 97 neighbor = {'read_name':read_name2} | |
| 98 neighbors = family.get('neighbors', []) | |
| 99 neighbors.append(neighbor) | |
| 100 family['neighbors'] = neighbors | |
| 101 neighbors_by_read_name[read_name2] = neighbor | |
| 102 logging.info('hits: {}'.format(hits)) | |
| 103 | |
| 104 logging.info('Reading barcodes.fq to find sequences of neighbors..') | |
| 105 hits = 0 | |
| 106 line_num = 0 | |
| 107 neighbors_by_barcode = {} | |
| 108 with open('barcodes.fq', 'rU') as barcodes_fq: | |
| 109 for line in barcodes_fq: | |
| 110 line_num += 1 | |
| 111 if line_num % 4 == 1: | |
| 112 read_name = line[1:].rstrip('\r\n') | |
| 113 neighbor = neighbors_by_read_name.get(read_name) | |
| 114 if line_num % 4 == 2 and neighbor: | |
| 115 seq = line.rstrip('\r\n') | |
| 116 neighbor['barcode'] = seq | |
| 117 neighbors_by_barcode[seq] = neighbor | |
| 118 logging.info('hits: {}'.format(hits)) | |
| 119 | |
| 120 logging.info('Reading families.uniq.txt to get counts of neighbors..') | |
| 121 hits = 0 | |
| 122 with open('families.uniq.txt', 'rU') as families_uniq: | |
| 123 for line in families_uniq: | |
| 124 fields = line.split() | |
| 125 barcode = fields[1] | |
| 126 neighbor = neighbors_by_barcode.get(barcode) | |
| 127 if neighbor: | |
| 128 hits += 1 | |
| 129 count = int(fields[0]) | |
| 130 order = fields[2].rstrip('\r\n') | |
| 131 alpha = barcode[:len(seq)//2] | |
| 132 beta = barcode[len(seq)//2:] | |
| 133 swap = alpha >= beta | |
| 134 if (not swap and order == 'ab') or (swap and order == 'ba'): | |
| 135 neighbor['count1'] = count | |
| 136 neighbor['count2'] = 0 | |
| 137 elif (not swap and order == 'ba') or (swap and order == 'ab'): | |
| 138 neighbor['count1'] = 0 | |
| 139 neighbor['count2'] = count | |
| 140 else: | |
| 141 fail(order, barcode, swap) | |
| 142 logging.info('hits: {}'.format(hits)) | |
| 143 | |
| 144 logging.info('Printing results..') | |
| 145 total = 0 | |
| 146 passing = 0 | |
| 147 for family in families_by_barcode.values(): | |
| 148 total += 1 | |
| 149 count1 = family['count1'] | |
| 150 count2 = family['count2'] | |
| 151 if not args.summary: | |
| 152 print('{barcode}\t{count1}\t{count2}\t{read_name}'.format(**family)) | |
| 153 neighbors = family.get('neighbors') | |
| 154 if neighbors: | |
| 155 for neighbor in neighbors: | |
| 156 if not args.summary: | |
| 157 print('{barcode}\t{count1}\t{count2}\t{read_name}'.format(**neighbor)) | |
| 158 count1 += neighbor['count1'] | |
| 159 count2 += neighbor['count2'] | |
| 160 if count1 >= 3 and count2 >= 3: | |
| 161 if not args.summary: | |
| 162 print('PASS!') | |
| 163 passing += 1 | |
| 164 elif not args.summary: | |
| 165 print('fail') | |
| 166 | |
| 167 print('{} families rescued out of {} ({:0.2f}%)'.format(passing, total, 100*passing/total)) | |
| 168 | |
| 169 | |
| 170 def make_pipeline(*commands): | |
| 171 processes = [] | |
| 172 for command in commands: | |
| 173 if not processes: | |
| 174 processes.append(subprocess.Popen(command, stdout=subprocess.PIPE)) | |
| 175 else: | |
| 176 processes.append(subprocess.Popen(command, stdin=processes[-1].stdout, stdout=subprocess.PIPE)) | |
| 177 processes[0].stdout.close() | |
| 178 return processes[-1] | |
| 179 | |
| 180 | |
| 181 def tone_down_logger(): | |
| 182 """Change the logging level names from all-caps to capitalized lowercase. | |
| 183 E.g. "WARNING" -> "Warning" (turn down the volume a bit in your log files)""" | |
| 184 for level in (logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG): | |
| 185 level_name = logging.getLevelName(level) | |
| 186 logging.addLevelName(level, level_name.capitalize()) | |
| 187 | |
| 188 | |
| 189 def fail(message): | |
| 190 sys.stderr.write(message+"\n") | |
| 191 sys.exit(1) | |
| 192 | |
| 193 | |
| 194 if __name__ == '__main__': | |
| 195 try: | |
| 196 sys.exit(main(sys.argv)) | |
| 197 except IOError as ioe: | |
| 198 if ioe.errno != errno.EPIPE: | |
| 199 raise |
