Mercurial > repos > nick > duplex
annotate correct.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
rev | line source |
---|---|
18
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
1 #!/usr/bin/env python |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
2 from __future__ import division |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
3 from __future__ import print_function |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
4 import os |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
5 import sys |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
6 import gzip |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
7 import logging |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
8 import argparse |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
9 import resource |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
10 import subprocess |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
11 import networkx |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
12 import swalign |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
13 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
# Custom logging level halfway between logging.DEBUG and logging.INFO; main()
# selects it with the -v/--verbose flag.
VERBOSE = (logging.DEBUG+logging.INFO)//2
# Defaults installed on the argument parser through parser.set_defaults() in
# main(). Keys must match argparse destination names to have any effect.
#   'mapq'      - default for -m/--mapq. Previously missing, so the option
#                 defaulted to None even though its help text advertises a
#                 default threshold.
#   'qual'      - matches no argument in main(); kept only so any code that
#                 reads ARG_DEFAULTS['qual'] keeps working.
#   'visualize' - 0 is the "-V was not given" sentinel; main() tests
#                 args.visualize != 0 because -V with no filename yields None.
ARG_DEFAULTS = {'sam':sys.stdin, 'qual':20, 'mapq':20, 'pos':2, 'dist':1, 'choose_by':'reads',
                'output':True, 'visualize':0, 'viz_format':'png', 'log':sys.stderr,
                'volume':logging.WARNING}
USAGE = "%(prog)s [options]"
DESCRIPTION = """Correct barcodes using an alignment of all barcodes to themselves. Reads the
alignment in SAM format and corrects the barcodes in an input "families" file (the output of
make-barcodes.awk). It will print the "families" file to stdout with barcodes (and orders)
corrected."""
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
22 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
23 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def main(argv):
    """Command-line entry point: correct barcodes in a families file.

    Parses `argv` (the program name in argv[0] is skipped), configures
    logging, then runs the pipeline in this order: map read names to barcodes
    from the fasta/q file, build the barcode graph from the SAM alignment,
    count families, optionally print/visualize the unique structures, build
    the correction table, and finally re-read the families file to print the
    corrected output.
    """
    cli = argparse.ArgumentParser(description=DESCRIPTION)
    cli.set_defaults(**ARG_DEFAULTS)

    # Positional inputs.
    cli.add_argument(
        'families', type=open_as_text_or_gzip,
        help='The sorted output of make-barcodes.awk. The important part is that it\'s a tab-delimited '
             'file with at least 2 columns: the barcode sequence and order, and it must be sorted in '
             'the same order as the "reads" in the SAM file.')
    cli.add_argument(
        'reads', type=open_as_text_or_gzip,
        help='The fasta/q file given to the aligner. Used to get barcode sequences from read names.')
    cli.add_argument(
        'sam', type=argparse.FileType('r'), nargs='?',
        help='Barcode alignment, in SAM format. Omit to read from stdin. The read names must be '
             'integers, representing the (1-based) order they appear in the families file.')

    # Output layout and alignment filters.
    cli.add_argument(
        '-P', '--prepend', action='store_true',
        help='Prepend the corrected barcodes and orders to the original columns.')
    cli.add_argument(
        '-d', '--dist', type=int,
        help='NM edit distance threshold. Default: %(default)s')
    cli.add_argument(
        '-m', '--mapq', type=int,
        help='MAPQ threshold. Default: %(default)s')
    cli.add_argument(
        '-p', '--pos', type=int,
        help='POS tolerance. Alignments will be ignored if abs(POS - 1) is greater than this value. '
             'Set to greater than the barcode length for no threshold. Default: %(default)s')
    cli.add_argument(
        '-t', '--tag-len', type=int,
        help='Length of each half of the barcode. If not given, it will be determined from the first '
             'barcode in the families file.')
    cli.add_argument('-c', '--choose-by', choices=('reads', 'connectivity'))
    cli.add_argument(
        '--limit', type=int,
        help='Limit the number of lines that will be read from each input file, for testing purposes.')

    # Structure reporting / visualization.
    cli.add_argument(
        '-S', '--structures', action='store_true',
        help='Print a list of the unique isoforms')
    cli.add_argument('--struct-human', action='store_true')
    cli.add_argument(
        '-V', '--visualize', nargs='?',
        help='Produce a visualization of the unique structures write the image to this file. '
             'If you omit a filename, it will be displayed in a window.')
    cli.add_argument('-F', '--viz-format', choices=('dot', 'graphviz', 'png'))
    cli.add_argument('-n', '--no-output', dest='output', action='store_false')

    # Logging destination and verbosity. All volume flags write the same dest.
    cli.add_argument(
        '-l', '--log', type=argparse.FileType('w'),
        help='Print log messages to this file instead of to stderr. Warning: Will overwrite the file.')
    volume_flags = (
        ('-q', '--quiet', logging.CRITICAL, None),
        ('-i', '--info', logging.INFO, None),
        ('-v', '--verbose', VERBOSE, None),
        ('-D', '--debug', logging.DEBUG, 'Print debug messages (very verbose).'),
    )
    for short_flag, long_flag, level, help_text in volume_flags:
        cli.add_argument(short_flag, long_flag, dest='volume', action='store_const',
                         const=level, help=help_text)

    opts = cli.parse_args(argv[1:])

    logging.basicConfig(stream=opts.log, level=opts.volume, format='%(message)s')
    tone_down_logger()

    logging.info('Reading the fasta/q to map read names to barcodes..')
    barcode_by_name = map_names_to_barcodes(opts.reads, opts.limit)

    logging.info('Reading the SAM to build the graph of barcode relationships..')
    alignment_result = read_alignments(opts.sam, barcode_by_name, opts.pos, opts.mapq,
                                       opts.dist, opts.limit)
    barcode_graph, flipped_barcodes = alignment_result
    num_flipped = len(flipped_barcodes)
    logging.info('{} reversed barcodes'.format(num_flipped))

    logging.info('Reading the families.tsv to get the counts of each family..')
    counts_by_family = get_family_counts(opts.families, opts.limit)

    if opts.structures:
        logging.info('Counting the unique barcode networks..')
        unique_structures = count_structures(barcode_graph, counts_by_family)
        print_structures(unique_structures, opts.struct_human)
        # 0 is the "flag absent" default; -V without a filename gives None,
        # which must still trigger the visualization.
        if opts.visualize != 0:
            logging.info('Generating a visualization of barcode networks..')
            structure_graphs = [entry['graph'] for entry in unique_structures]
            visualize(structure_graphs, opts.visualize, opts.viz_format)

    logging.info('Building the correction table from the graph..')
    correction_table = make_correction_table(barcode_graph, counts_by_family, opts.choose_by)

    logging.info('Reading the families.tsv again to print corrected output..')
    # Second pass over the same path: a fresh handle is opened by name.
    families_again = open_as_text_or_gzip(opts.families.name)
    print_corrected_output(families_again, correction_table, flipped_barcodes, opts.prepend,
                           opts.limit, opts.output)

    # NOTE(review): ru_maxrss units are platform-dependent; the "MB" label
    # assumes the value is reported in kilobytes - confirm for your platform.
    peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    logging.info('Max memory usage: {:0.2f}MB'.format(peak_rss/1024))
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
103 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
104 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def detect_format(reads_file, max_lines=7):
    """Guess whether an open reads file is FASTA or FASTQ from its first lines.

    Lines are numbered from 1. On lines 1, 5, 9, ... a leading '@' counts as a
    vote for FASTQ; on lines 3, 7, 11, ... a leading '+' does. On either kind
    of line a leading '>' counts as a vote for FASTA. Reading stops once
    `max_lines` lines have been seen (at least one line is always examined if
    the file has any), and the file is rewound with seek(0) before returning.

    Returns 'fasta' or 'fastq' for a clear majority, or None on a tie
    (including an empty file).
    """
    votes = {'fasta': 0, 'fastq': 0}
    for position, text in enumerate(reads_file, 1):
        phase = position % 4
        # Which character marks a FASTQ record at this position, if any.
        if phase == 1:
            fastq_marker = '@'
        elif phase == 3:
            fastq_marker = '+'
        else:
            fastq_marker = None
        if fastq_marker is not None:
            if text.startswith(fastq_marker):
                votes['fastq'] += 1
            elif text.startswith('>'):
                votes['fasta'] += 1
        if position >= max_lines:
            break
    # Rewind so the caller can parse the file from the start.
    reads_file.seek(0)
    if votes['fasta'] > votes['fastq']:
        return 'fasta'
    if votes['fastq'] > votes['fasta']:
        return 'fastq'
    return None
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
131 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
132 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def read_fastaq(reads_file):
    """Return an iterator of (read name, sequence) pairs from a FASTA or FASTQ file.

    The format is chosen from the file name's extension ('.fa'/'.fasta' or
    '.fq'/'.fastq'); any other name falls back to inspecting the content with
    detect_format().

    Args:
        reads_file: An open file object with a `name` attribute.

    Returns:
        The generator produced by read_fasta() or read_fastq().

    Raises:
        ValueError: If the format cannot be determined. Previously this case
            fell off the end and returned None, which only surfaced later as
            an unhelpful "NoneType is not iterable" error in the caller.
    """
    filename = reads_file.name
    if filename.endswith(('.fa', '.fasta')):
        file_format = 'fasta'
    elif filename.endswith(('.fq', '.fastq')):
        file_format = 'fastq'
    else:
        # Unrecognized extension (e.g. a gzipped file): look at the content.
        file_format = detect_format(reads_file)
    if file_format == 'fasta':
        return read_fasta(reads_file)
    if file_format == 'fastq':
        return read_fastq(reads_file)
    raise ValueError('Could not determine whether "{}" is in FASTA or FASTQ format.'
                     .format(filename))
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
145 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
146 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def read_fasta(reads_file):
    """Generate (name, sequence) tuples from a FASTA file.

    NOTE: Every record must occupy exactly two lines (header, then sequence);
    multi-line sequences are not supported.
    Odd-numbered lines must begin with '>' (checked with an assertion); the
    name is the rest of that line. Trailing '\\r'/'\\n' characters are removed
    from both fields. A header with no following sequence line is dropped.
    """
    for position, raw in enumerate(reads_file, 1):
        text = raw.rstrip('\r\n')
        is_header = position % 2 == 1
        if not is_header:
            # Even line: the sequence for the header seen just before it.
            yield read_name, text
            continue
        assert text.startswith('>'), text
        read_name = text[1:]
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
160 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
161 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def read_fastq(reads_file):
    """Generate (name, sequence) tuples from a FASTQ file.

    NOTE: Records must be exactly four lines each; multi-line sequences are
    not supported.
    The first line of each record must begin with '@' (checked with an
    assertion); the name is the rest of that line. The second line is the
    sequence. The '+' and quality lines are read but ignored. Trailing
    '\\r'/'\\n' characters are removed from both fields.
    """
    for position, raw in enumerate(reads_file, 1):
        slot = position % 4
        if slot == 1:
            assert raw.startswith('@'), raw
            header = raw[1:]
            read_name = header.rstrip('\r\n')
        elif slot == 2:
            yield read_name, raw.rstrip('\r\n')
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
174 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
175 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def map_names_to_barcodes(reads_file, limit=None):
    """Map barcode names to their sequences.

    Reads (name, sequence) pairs from `reads_file` via read_fastaq() and
    returns a dict keyed by the read name converted to int.

    Args:
        reads_file: Open FASTA/FASTQ file object. It is closed before this
            function returns, whether it succeeds or raises.
        limit: If not None, stop after this many reads.

    Returns:
        dict mapping int read name -> sequence string.

    Raises:
        ValueError: If a read name is not an integer. The offending name is
            logged at CRITICAL level before the exception is re-raised.
    """
    names_to_barcodes = {}
    read_num = 0
    try:
        for read_name, read_seq in read_fastaq(reads_file):
            read_num += 1
            if limit is not None and read_num > limit:
                break
            try:
                name = int(read_name)
            except ValueError:
                # Log the raw read name: `name` is unbound (or stale from the
                # previous read) when the int() conversion fails.
                logging.critical('non-int read name "{}"'.format(read_name))
                raise
            names_to_barcodes[name] = read_seq
    finally:
        reads_file.close()
    return names_to_barcodes
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
192 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
193 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def parse_alignment(sam_file, pos_thres, mapq_thres, dist_thres):
    """Parse the SAM file and yield the alignments that pass the filters.

    Header lines (starting with "@") and unmapped reads are skipped. An alignment
    is yielded only when all of these hold:
      - abs(POS - 1) is at most `pos_thres`,
      - MAPQ is at least `mapq_thres`,
      - the NM tag (edit distance) is at most `dist_thres`.
    A reference name of the form "<name>:rev" is reported with the ":rev" suffix
    stripped and the third element of the yielded tuple set to True.

    Yields (qname, rname, reversed), where qname and rname are ints and reversed
    is a bool.
    Raises ValueError on a non-integer read/reference name (other than the
    unmapped marker "*") or a non-integer NM value, and AssertionError when a
    line that passed the other filters has no NM tag.
    `sam_file` is closed when the generator finishes, fails, or is closed early.
    """
    try:
        for line_num, line in enumerate(sam_file, 1):
            if line.startswith('@'):
                logging.debug('Header line ({})'.format(line_num))
                continue
            fields = line.rstrip('\r\n').split('\t')
            # fields[0] is the query (read) name, fields[2] the reference name.
            logging.debug('read {} -> ref {} (read seq {}):'
                          .format(fields[0], fields[2], fields[9]))
            qname_str = fields[0]
            rname_str = fields[2]
            rname_fields = rname_str.split(':')
            if len(rname_fields) == 2 and rname_fields[1] == 'rev':
                is_reversed = True
                rname_str = rname_fields[0]
            else:
                is_reversed = False
            try:
                qname = int(qname_str)
                rname = int(rname_str)
            except ValueError:
                if fields[2] == '*':
                    logging.debug('\tRead unmapped (reference == "*")')
                    continue
                # Report the raw strings: the int versions may not exist yet.
                logging.error('Non-integer read name(s) on line {}: "{}", "{}".'
                              .format(line_num, qname_str, rname_str))
                raise
            # Apply alignment quality filters.
            try:
                flags = int(fields[1])
                pos = int(fields[3])
                mapq = int(fields[4])
            except ValueError:
                logging.warning('\tNon-integer flag ({}), pos ({}), or mapq ({})'
                                .format(fields[1], fields[3], fields[4]))
                continue
            if flags & 4:
                logging.debug('\tRead unmapped (flag & 4 == True)')
                continue
            if abs(pos - 1) > pos_thres:
                logging.debug('\tAlignment failed pos filter: abs({} - 1) > {}'
                              .format(pos, pos_thres))
                continue
            if mapq < mapq_thres:
                logging.debug('\tAlignment failed mapq filter: {} < {}'
                              .format(mapq, mapq_thres))
                continue
            nm = None
            for tag in fields[11:]:
                if tag.startswith('NM:i:'):
                    try:
                        nm = int(tag[5:])
                    except ValueError:
                        logging.error('Invalid NM tag "{}" on line {}.'.format(tag, line_num))
                        raise
                    break
            if nm is None:
                # Explicit raise (rather than `assert`) so the check survives `python -O`.
                raise AssertionError(line_num)
            if nm > dist_thres:
                logging.debug('\tAlignment failed NM distance filter: {} > {}'
                              .format(nm, dist_thres))
                continue
            yield qname, rname, is_reversed
    finally:
        # Runs on exhaustion, on error, and when the consumer stops early.
        sam_file.close()
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
257 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
258 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def read_alignments(sam_file, names_to_barcodes, pos_thres, mapq_thres, dist_thres, limit=None):
    """Build a graph of barcodes linked by alignments that pass the filters.

    Each alignment yielded by parse_alignment() whose query and reference differ
    becomes an edge between the two barcode sequences, which are looked up by
    read name in `names_to_barcodes`. At most `limit` passing alignments are
    consumed when `limit` is not None.

    Returns (graph, reversed_barcodes):
      graph             - a networkx.Graph whose nodes are barcode sequences.
      reversed_barcodes - the set of all barcodes involved in an alignment where
                          the target is reversed, whether the barcode was the
                          query or the reference in that alignment.
    Raises KeyError if an aligned name is missing from `names_to_barcodes`.
    `sam_file` is always closed before returning or raising.
    """
    graph = networkx.Graph()
    reversed_barcodes = set()
    alignments = parse_alignment(sam_file, pos_thres, mapq_thres, dist_thres)
    try:
        for align_num, (qname, rname, is_reversed) in enumerate(alignments, 1):
            if limit is not None and align_num > limit:
                break
            # Skip self-alignments.
            if rname == qname:
                continue
            rseq = names_to_barcodes[rname]
            qseq = names_to_barcodes[qname]
            # Is this an alignment to a reversed barcode?
            if is_reversed:
                reversed_barcodes.add(rseq)
                reversed_barcodes.add(qseq)
            # add_edge() creates any missing endpoint nodes itself.
            graph.add_edge(rseq, qseq)
    finally:
        # The parser only closes the file when run to exhaustion, so release it
        # here too in case we stopped early (limit reached or an error).
        alignments.close()
        sam_file.close()
    return graph, reversed_barcodes
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
286 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
287 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def get_family_counts(families_file, limit=None):
    """For each family (barcode), count how many read pairs exist for each strand (order).

    `families_file` is an iterable of tab-delimited lines whose first column is
    the barcode and whose second column is the order ("ab" or "ba"). At most
    `limit` lines are read when `limit` is not None.

    Returns a dict mapping each barcode to {'ab': int, 'ba': int, 'all': int},
    where 'all' is the sum of the two strand counts. An empty input (or a limit
    of 0) gives an empty dict. Lines for the same barcode are accumulated even
    when they are not adjacent in the file.
    Raises KeyError on an order other than "ab"/"ba" and IndexError on a line
    with fewer than two columns.
    `families_file` is always closed before returning or raising.
    """
    family_counts = {}
    try:
        for line_num, line in enumerate(families_file, 1):
            if limit is not None and line_num > limit:
                break
            fields = line.rstrip('\r\n').split('\t')
            barcode = fields[0]
            order = fields[1]
            counts = family_counts.setdefault(barcode, {'ab': 0, 'ba': 0})
            # Only 'ab' and 'ba' exist here, so any other order raises KeyError.
            counts[order] += 1
    finally:
        families_file.close()
    # Totals are added last so 'all' can never be mistaken for a valid order above.
    for counts in family_counts.values():
        counts['all'] = counts['ab'] + counts['ba']
    return family_counts
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
312 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
313 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def make_correction_table(meta_graph, family_counts, choose_by='reads'):
    """Make a table mapping original barcode sequences to correct barcodes.

    Every connected component of `meta_graph` is treated as one true barcode plus
    its erroneous variants. The "correct" barcode of a component is the one with
    the highest score, where the score depends on `choose_by`:
      'reads'        - the barcode's total read pair count, family_counts[bar]['all'].
      'connectivity' - the barcode's degree (number of edges) in the graph.
    Ties are broken by taking the lexicographically smallest barcode, so the
    result is reproducible from run to run.

    Returns a dict mapping each non-correct barcode to its component's correct
    barcode. Correct barcodes themselves are not keys in the dict.
    Raises ValueError on an unrecognized `choose_by`, and KeyError if
    `choose_by` is 'reads' and a barcode is missing from `family_counts`.
    """
    if choose_by == 'reads':
        def score(bar):
            return family_counts[bar]['all']
    elif choose_by == 'connectivity':
        # A node's neighbors are all in its own component, so its degree in the
        # whole graph equals its degree within the component.
        degrees = meta_graph.degree()
        def score(bar):
            return degrees[bar]
    else:
        raise ValueError('Invalid choose_by value "{}".'.format(choose_by))
    corrections = {}
    # connected_components() yields one set of nodes per component. It replaces
    # connected_component_subgraphs(), which newer networkx releases removed.
    for component in networkx.connected_components(meta_graph):
        # Highest score first; equal scores fall back to barcode order.
        barcodes = sorted(component, key=lambda bar: (-score(bar), bar))
        correct = barcodes[0]
        for barcode in barcodes[1:]:
            logging.debug('Correcting {} ->\n {}\n'.format(barcode, correct))
            corrections[barcode] = correct
    return corrections
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
333 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
334 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def print_corrected_output(families_file, corrections, reversed_barcodes, prepend=False, limit=None,
                           output=True):
    """Print the families file with each barcode replaced by its corrected version.

    families_file     - iterable of tab-delimited lines; column 1 is the barcode,
                        column 2 the order ("ab" or "ba"). Lines with the same
                        barcode are expected to be adjacent.
    corrections       - dict mapping raw barcodes to correct barcodes. Barcodes
                        absent from it are printed unchanged.
    reversed_barcodes - set of barcodes seen in a reversed alignment. Only when
                        the raw or correct barcode is in this set is
                        is_alignment_reversed() consulted; if it confirms the
                        reversal, the order column is flipped.
    prepend           - if True, insert the correct barcode as a new first column
                        instead of overwriting the original barcode.
    limit             - if not None, stop after this many input lines.
    output            - if False, compute and log the statistics but print nothing.

    Per-family summaries are logged at the VERBOSE level and an overall summary
    at INFO. Returns None. `families_file` is always closed.
    """
    corrected = {'reads': 0, 'barcodes': 0, 'reversed': 0}

    def finish_family(barcode, reads, corrections_in_family):
        """Add one completed family to the totals and log its summary line."""
        family_info = '{}\t{}\t{}'.format(barcode, reads[0], reads[1])
        if corrections_in_family:
            corrected['reads'] += corrections_in_family
            corrected['barcodes'] += 1
            family_info += '\tCORRECTED!'
        else:
            family_info += '\tuncorrected'
        logging.log(VERBOSE, family_info)

    barcode_last = None
    reads = [0, 0]
    corrections_in_this_family = 0
    try:
        for line_num, line in enumerate(families_file, 1):
            if limit is not None and line_num > limit:
                break
            fields = line.rstrip('\r\n').split('\t')
            raw_barcode = fields[0]
            order = fields[1]
            if raw_barcode != barcode_last:
                # We just started a new family. Summarize the previous one, unless
                # this is the first line and there is no previous family.
                if barcode_last is not None:
                    finish_family(barcode_last, reads, corrections_in_this_family)
                reads = [0, 0]
                corrections_in_this_family = 0
                barcode_last = raw_barcode
            if order == 'ab':
                reads[0] += 1
            elif order == 'ba':
                reads[1] += 1
            if raw_barcode in corrections:
                correct_barcode = corrections[raw_barcode]
                corrections_in_this_family += 1
                # Check if the order of the barcode reverses in the correct version.
                # First, we check in reversed_barcodes whether either barcode was
                # involved in a reversed alignment, to save time
                # (is_alignment_reversed() does a full smith-waterman alignment).
                if ((raw_barcode in reversed_barcodes or correct_barcode in reversed_barcodes) and
                        is_alignment_reversed(raw_barcode, correct_barcode)):
                    # If so, then switch the order field.
                    corrected['reversed'] += 1
                    if order == 'ab':
                        fields[1] = 'ba'
                    else:
                        fields[1] = 'ab'
            else:
                correct_barcode = raw_barcode
            if prepend:
                fields.insert(0, correct_barcode)
            else:
                fields[0] = correct_barcode
            if output:
                print(*fields, sep='\t')
    finally:
        families_file.close()
    # The final family has no following line to trigger its summary.
    if barcode_last is not None:
        finish_family(barcode_last, reads, corrections_in_this_family)
    logging.info('Corrected {barcodes} barcodes on {reads} read pairs, with {reversed} reversed.'
                 .format(**corrected))
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
396 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
397 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def is_alignment_reversed(barcode1, barcode2):
    """Tell whether two barcodes are reversed with respect to each other.

    "Reversed" here means the alpha and beta halves are swapped. barcode1 is
    aligned with smith-waterman against barcode2 twice: once as given, and once
    with barcode2's two halves exchanged. Returns True only when the
    swapped-halves alignment scores strictly higher, False otherwise.
    """
    midpoint = len(barcode2)//2
    barcode2_swapped = barcode2[midpoint:] + barcode2[:midpoint]
    forward_score = swalign.smith_waterman(barcode1, barcode2).score
    swapped_score = swalign.smith_waterman(barcode1, barcode2_swapped).score
    return swapped_score > forward_score
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
412 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
413 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def count_structures(meta_graph, family_counts):
    """Group the connected components of meta_graph by shape and tally each shape.
    Two components have the same shape when networkx.is_isomorphic() says so.
    Returns a list with one dict per distinct shape, in order of first appearance:
      'graph':   the first component seen with this shape
      'size':    the number of nodes in that component
      'count':   how many components have this shape
      'central': how many of those components is_centralized() returned True for"""
    structures = []
    for component in networkx.connected_component_subgraphs(meta_graph):
        for known in structures:
            if networkx.is_isomorphic(component, known['graph']):
                known['count'] += 1
                known['central'] += int(is_centralized(component, family_counts))
                break
        else:
            # No existing archetype matched, so this component starts a new structure.
            structures.append({
                'graph': component,
                'size': len(component),
                'count': 1,
                'central': int(is_centralized(component, family_counts)),
            })
    return structures
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
431 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
432 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def is_centralized(graph, family_counts):
    """Decide whether the read pairs in this graph are concentrated on a single barcode.
    family_counts maps each barcode (node) to a dict of counts with the keys 'all', 'ab', and 'ba'.
    For graphs that don't have exactly 2 nodes, the node with the highest degree is allowed any
    number of read pairs, but every other node must have an 'all' count of at most 1.
    For 2-node graphs, at least one of the barcodes must have an 'all' count of exactly 1, and the
    other must have at least 1.
    Returns True if the graph is centralized, False otherwise."""
    if len(graph) != 2:
        degrees = graph.degree()
        by_degree = sorted(graph.nodes(), key=lambda node: degrees[node], reverse=True)
        # Skip the first (highest-degree) barcode: it's the only one allowed more than one read pair.
        for barcode in by_degree[1:]:
            counts = family_counts[barcode]
            try:
                if counts['all'] > 1:
                    return False
            except TypeError:
                logging.critical('barcode: {}, counts: {}'.format(barcode, counts))
                raise
        return True
    # Exactly 2 nodes: both have the same degree, so ordering by degree can't pick out the hub.
    # Compare the read pair totals of the two barcodes directly instead.
    barcode1, barcode2 = graph.nodes()
    counts1 = family_counts[barcode1]
    counts2 = family_counts[barcode2]
    total1 = counts1['all']
    total2 = counts2['all']
    logging.debug('{}: {:3d} ({}/{})\n{}: {:3d} ({}/{})\n'
                  .format(barcode1, total1, counts1['ab'], counts1['ba'],
                          barcode2, total2, counts2['ab'], counts2['ba']))
    return (total1 >= 1 and total2 == 1) or (total1 == 1 and total2 >= 1)
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
469 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
470 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def print_structures(structures, human=True):
    """Print one line per structure to stdout, smallest structures first.
    structures is a list of dicts with the keys 'graph', 'size', 'count', and 'central' (the kind of
    list count_structures() returns). They are printed in ascending order of size and, within one
    size, descending order of count.
    Each line holds the size, a letter label from num_to_letters() that distinguishes structures of
    the same size (the first structure of each size is given index 0), the count, the central
    count, and the node degrees of the graph in descending order.
    If human is True, the columns are padded with spaces and the degrees are space-separated.
    Otherwise the fields are tab-delimited and the degrees are comma-separated."""
    # A key function gives the same ordering as a cmp function would (ascending size, then
    # descending count), but unlike sorted()'s cmp argument it works on both Python 2 and Python 3.
    ordered = sorted(structures, key=lambda structure: (structure['size'], -structure['count']))
    width = None
    last_size = None
    i = 0
    for structure in ordered:
        size = structure['size']
        graph = structure['graph']
        # i numbers the structures within one size, restarting whenever the size changes.
        if size == last_size:
            i += 1
        else:
            i = 0
        if width is None:
            # The column width is taken from the count of the first structure printed.
            width = str(len(str(structure['count'])))
        letters = num_to_letters(i)
        # dict() accepts both the plain dict returned by networkx 1.x and the degree view
        # returned by networkx 2.x, which has no .values() method.
        degrees = sorted(dict(graph.degree()).values(), reverse=True)
        if human:
            degrees_str = ' '.join(map(str, degrees))
            format_str = '{:2d}{:<3s} {count:<'+width+'d} {central:<'+width+'d} {}'
            print(format_str.format(size, letters+':', degrees_str, **structure))
        else:
            degrees_str = ','.join(map(str, degrees))
            print(size, letters, structure['count'], structure['central'], degrees_str, sep='\t')
        last_size = size
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
502 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
503 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def num_to_letters(i):
    """Convert a positive integer into spreadsheet-column-style capital letters.
    E.g. 1 -> A, 10 -> J, 26 -> Z, 27 -> AA, 100 -> CV.
    Zero and negative numbers give an empty string."""
    digits = []
    remaining = i
    while remaining > 0:
        # There is no zero digit in this numbering, so shift down by one before dividing.
        remaining, offset = divmod(remaining - 1, 26)
        digits.append(chr(65 + offset))
    return ''.join(reversed(digits))
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
514 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
515 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def visualize(graphs, viz_path, args_viz_format):
    """Draw all the graphs in one picture, then show it or write it to a file.
    graphs:          The graphs to draw. They are merged into one networkx.Graph with add_graph().
    viz_path:        Where to write the output, or None.
    args_viz_format: The format to use ('graphviz' or 'png') when viz_path doesn't settle it.
    A viz_path ending in .dot selects the 'graphviz' format and one ending in .png selects 'png',
    regardless of args_viz_format.
    With 'graphviz', a .dot file is written next to viz_path and the "dot" command is run to render
    it to a .png. This requires a viz_path.
    With 'png', the matplotlib figure is saved to viz_path, or shown on screen if viz_path is None.
    Any other format draws the graph but produces no output."""
    # Importing matplotlib.pyplot (not just matplotlib) guarantees the pyplot submodule used at
    # the bottom is actually loaded, instead of relying on networkx having imported it.
    import matplotlib.pyplot
    from networkx.drawing.nx_agraph import graphviz_layout
    meta_graph = networkx.Graph()
    for graph in graphs:
        add_graph(meta_graph, graph)
    pos = graphviz_layout(meta_graph)
    networkx.draw(meta_graph, pos)
    # Start from the requested format so viz_format is always defined, then let a recognized file
    # extension override it.
    viz_format = args_viz_format
    if viz_path:
        ext = os.path.splitext(viz_path)[1]
        if ext == '.dot':
            viz_format = 'graphviz'
        elif ext == '.png':
            viz_format = 'png'
    if viz_format == 'graphviz':
        from networkx.drawing.nx_pydot import write_dot
        assert viz_path is not None, 'Must provide a filename to --visualize if using --viz-format "graphviz".'
        # splitext() returns a (root, extension) tuple. Only the root is the base path.
        base_path = os.path.splitext(viz_path)[0]
        write_dot(meta_graph, base_path+'.dot')
        exit_status = run_command('dot', '-T', 'png', '-o', base_path+'.png', base_path+'.dot')
        if exit_status == 0:
            logging.info('Wrote image of graph to '+base_path+'.png')
        else:
            # run_command() returns None when the command couldn't be started at all.
            logging.warning('Failed to render '+base_path+'.dot to '+base_path+'.png with "dot" '
                            '(exit status: {}).'.format(exit_status))
    elif viz_format == 'png':
        if viz_path is None:
            matplotlib.pyplot.show()
        else:
            matplotlib.pyplot.savefig(viz_path)
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
544 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
545 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def add_graph(graph, subgraph):
    """Copy every node and every edge of subgraph into graph.
    graph is modified in place: first all the nodes are added with add_node(), then all the edges
    with add_edge(). Returns graph."""
    for member in subgraph.nodes():
        graph.add_node(member)
    for endpoints in subgraph.edges():
        graph.add_edge(*endpoints)
    return graph
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
553 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
554 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def open_as_text_or_gzip(path):
    """Open the file at path for reading, going through gzip if detect_gzip() says it's gzipped.
    Returns the open file object: one from gzip.open() for gzip files, or one from the builtin
    open() in universal newlines mode for everything else."""
    if not detect_gzip(path):
        return open(path, 'rU')
    return gzip.open(path, 'r')
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
562 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
563 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def detect_gzip(path):
    """Return True if the file looks like a gzip file: ends with .gz or contains non-ASCII bytes.
    The extension is checked first: .gz means True, and .txt, .tsv, or .csv means False, in both
    cases without opening the file. For any other extension, the first 100 bytes of the file are
    read and the result is True if any of them has its high bit set (a gzip file always does,
    since the second byte of its header is 0x8b).
    Always returns a bool. Errors from opening or reading the file are not caught."""
    ext = os.path.splitext(path)[1]
    if ext == '.gz':
        return True
    elif ext in ('.txt', '.tsv', '.csv'):
        return False
    # Read raw bytes: decoding compressed data as text can fail before any byte gets inspected.
    with open(path, 'rb') as fh:
        # A bytearray yields integers on both Python 2 and Python 3.
        header = bytearray(fh.read(100))
    return any(byte > 127 for byte in header)
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
575 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
576 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def detect_non_ascii(bytes, max_test=100):
    """Return True if any of the first "max_test" bytes are non-ASCII (the high bit set to 1).
    Return False otherwise.
    "bytes" may be a text string, a byte string, or any other iterable of single characters or
    integers. Items that are already integers (as when iterating a Python 3 bytes object or a
    bytearray) are used directly; anything else is converted with ord().
    Exactly max_test items are examined at most; later ones are ignored."""
    for i, char in enumerate(bytes):
        # Stop before looking at item number max_test, so no more than max_test are tested.
        if i >= max_test:
            break
        code = char if isinstance(char, int) else ord(char)
        # Anything above 127 is outside ASCII. Comparing (rather than masking with 128) also
        # catches characters like U+0100, whose bit 7 happens to be 0.
        if code > 127:
            return True
    return False
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
587 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
588 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def run_command(*command):
    """Run an external command and report how it exited.
    The arguments are the command and its arguments, one string each. They are handed to
    subprocess.call() as a sequence, so no shell is involved.
    Returns the exit status of the command, or None if it could not be started at all (an OSError,
    e.g. because the executable doesn't exist)."""
    # subprocess.call() reports failure through its return value and never raises
    # CalledProcessError, so OSError is the only exception that needs handling here.
    try:
        return subprocess.call(command)
    except OSError:
        return None
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
597 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
598 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
def tone_down_logger():
    """Rename the standard logging levels so only their first letter is capitalized.
    E.g. "WARNING" -> "Warning", so log lines shout a little less.
    Applies to CRITICAL, ERROR, WARNING, INFO, and DEBUG, and changes the names module-wide via
    logging.addLevelName()."""
    levels = (logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG)
    for level in levels:
        logging.addLevelName(level, logging.getLevelName(level).capitalize())
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
605 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
606 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
# Script entry point: when run directly (not imported), call main() with the command-line
# arguments and use its return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))