annotate duplex.py @ 4:af383638de66 draft

planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
author nick
date Mon, 23 Nov 2015 18:44:23 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
1 #!/usr/bin/env python
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
2 from __future__ import division
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
3 import os
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
4 import sys
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
5 import time
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
6 import logging
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
7 import tempfile
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
8 import argparse
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
9 import subprocess
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
10 import collections
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
11 import distutils.spawn
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
12 import consensus
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
13 import swalign
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
14
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
15 SANGER_START = 33
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
16 SOLEXA_START = 64
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
17 REQUIRED_COMMANDS = ['mafft']
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
18 OPT_DEFAULTS = {'min_reads':3, 'processes':1, 'qual':20, 'qual_format':'sanger'}
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
19 USAGE = "%(prog)s [options]"
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
20 DESCRIPTION = """Build consensus sequences from read aligned families. Prints duplex consensus
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
21 sequences in FASTA to stdout. The sequence ids are BARCODE.MATE, e.g. "CTCAGATAACATACCTTATATGCA.1",
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
22 where "BARCODE" is the input barcode, and "MATE" is "1" or "2" as an arbitrary designation of the
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
23 two reads in the pair. The id is followed by the count of the number of reads in the two families
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
24 (one from each strand) that make up the duplex, in the format READS1/READS2. If the duplex is
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
25 actually a single-strand consensus because the matching strand is missing, only one number is
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
26 listed."""
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
27
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
28
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
29 def main(argv):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
30
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
31 parser = argparse.ArgumentParser(description=DESCRIPTION)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
32 parser.set_defaults(**OPT_DEFAULTS)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
33
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
34 parser.add_argument('infile', metavar='read-families.tsv', nargs='?',
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
35 help='The output of align_families.py. 6 columns: 1. (canonical) barcode. 2. order ("ab" or '
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
36 '"ba"). 3. mate ("1" or "2"). 4. read name. 5. aligned sequence. 6. aligned quality '
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
37 'scores.')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
38 parser.add_argument('-r', '--min-reads', type=int,
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
39 help='The minimum number of reads (from each strand) required to form a single-strand '
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
40 'consensus. Strands with fewer reads will be skipped. Default: %(default)s.')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
41 parser.add_argument('-q', '--qual', type=int,
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
42 help='Base quality threshold. Bases below this quality will not be counted. '
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
43 'Default: %(default)s.')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
44 parser.add_argument('-F', '--qual-format', choices=('sanger', 'solexa'),
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
45 help='FASTQ quality score format. Sanger scores are assumed to begin at \'{}\' ({}). Default: '
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
46 '%(default)s.'.format(SANGER_START, chr(SANGER_START)))
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
47 parser.add_argument('--incl-sscs', action='store_true',
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
48 help='When outputting duplex consensus sequences, include reads without a full duplex (missing '
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
49 'one strand). The result will just be the single-strand consensus of the remaining read.')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
50 parser.add_argument('-s', '--sscs-file',
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
51 help='Save single-strand consensus sequences in this file (FASTA format). Currently does not '
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
52 'work when in parallel mode.')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
53 parser.add_argument('-l', '--log', metavar='LOG_FILE', dest='stats_file',
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
54 help='Print statistics on the run to this file. Use "-" to print to stderr.')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
55 parser.add_argument('-p', '--processes', type=int,
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
56 help='Number of processes to use. If > 1, launches this many worker subprocesses. Note: if '
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
57 'this option is used, no output will be generated until the end of the entire run, so no '
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
58 'streaming is possible. Default: %(default)s.')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
59 parser.add_argument('-S', '--slurm', action='store_true',
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
60 help='If --processes > 1, prepend sub-commands with "srun -C new".')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
61
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
62 args = parser.parse_args(argv[1:])
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
63
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
64 assert args.processes > 0, '-p must be greater than zero'
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
65 # Make dict of process_family() parameters that don't change between families.
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
66 static = {}
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
67 static['processes'] = args.processes
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
68 static['incl_sscs'] = args.incl_sscs
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
69 static['min_reads'] = args.min_reads
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
70 if args.sscs_file:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
71 static['sscs_fh'] = open(args.sscs_file, 'w')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
72 if args.qual_format == 'sanger':
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
73 static['qual_thres'] = chr(args.qual + SANGER_START)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
74 elif args.qual_format == 'solexa':
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
75 static['qual_thres'] = chr(args.qual + SOLEXA_START)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
76 else:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
77 fail('Error: unrecognized --qual-format.')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
78
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
79 # Check for required commands.
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
80 missing_commands = []
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
81 if args.slurm:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
82 REQUIRED_COMMANDS.append('srun')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
83 for command in REQUIRED_COMMANDS:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
84 if not distutils.spawn.find_executable(command):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
85 missing_commands.append(command)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
86 if missing_commands:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
87 fail('Error: Missing commands: "'+'", "'.join(missing_commands)+'".')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
88
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
89 if args.infile:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
90 infile = open(args.infile)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
91 else:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
92 infile = sys.stdin
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
93
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
94 if args.stats_file:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
95 if args.stats_file == '-':
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
96 logging.basicConfig(stream=sys.stderr, level=logging.INFO, format='%(message)s')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
97 else:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
98 logging.basicConfig(filename=args.stats_file, filemode='w', level=logging.INFO,
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
99 format='%(message)s')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
100 else:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
101 logging.disable(logging.CRITICAL)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
102
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
103 # Open all the worker processes, if we're using more than one.
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
104 workers = None
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
105 if args.processes > 1:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
106 workers = open_workers(args.processes, args)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
107
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
108 stats = {'time':0, 'reads':0, 'runs':0, 'families':0}
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
109 all_reads = 0
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
110 duplex = collections.OrderedDict()
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
111 family = []
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
112 barcode = None
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
113 order = None
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
114 mate = None
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
115 for line in infile:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
116 fields = line.rstrip('\r\n').split('\t')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
117 if len(fields) != 6:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
118 continue
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
119 (this_barcode, this_order, this_mate, name, seq, qual) = fields
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
120 this_mate = int(this_mate)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
121 # If the barcode or order has changed, we're in a new single-stranded family.
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
122 # Process the reads we've previously gathered as one family and start a new family.
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
123 if this_barcode != barcode or this_order != order or this_mate != mate:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
124 duplex[(order, mate)] = family
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
125 # We're at the end of the duplex pair if the barcode changes or if the order changes without
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
126 # the mate changing, or vice versa (the second read in each duplex comes when the barcode
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
127 # stays the same while both the order and mate switch). Process the duplex and start
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
128 # a new one. If the barcode is the same, we're in the same duplex, but we've switched strands.
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
129 if this_barcode != barcode or not (this_order != order and this_mate != mate):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
130 # sys.stderr.write('New duplex: {}, {}, {}\n'.format(this_barcode, this_order, this_mate))
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
131 process_duplex(duplex, barcode, workers=workers, stats=stats, **static)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
132 duplex = collections.OrderedDict()
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
133 # else:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
134 # sys.stderr.write('Same duplex: {}, {}, {}\n'.format(this_barcode, this_order, this_mate))
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
135 barcode = this_barcode
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
136 order = this_order
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
137 mate = this_mate
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
138 family = []
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
139 read = {'name': name, 'seq':seq, 'qual':qual}
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
140 family.append(read)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
141 all_reads += 1
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
142 # Process the last family.
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
143 duplex[(order, mate)] = family
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
144 process_duplex(duplex, barcode, workers=workers, stats=stats, **static)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
145
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
146 if args.processes > 1:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
147 close_workers(workers)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
148 compile_results(workers)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
149 delete_tempfiles(workers)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
150
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
151 if args.sscs_file:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
152 static['sscs_fh'].close()
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
153 if infile is not sys.stdin:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
154 infile.close()
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
155
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
156 if not args.stats_file:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
157 return
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
158
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
159 # Final stats on the run.
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
160 logging.info('Processed {} reads and {} duplexes.'
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
161 .format(all_reads, stats['runs']))
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
162 per_read = stats['time'] / stats['reads']
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
163 per_run = stats['time'] / stats['runs']
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
164 logging.info('{:0.3f}s per read, {:0.3f}s per run.'.format(per_read, per_run))
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
165
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
166
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
167 def open_workers(num_workers, args):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
168 """Open the required number of worker processes."""
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
169 script_path = os.path.realpath(sys.argv[0])
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
170 workers = []
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
171 for i in range(num_workers):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
172 if args.slurm:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
173 command = ['srun', '-C', 'new', 'python', script_path]
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
174 else:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
175 command = ['python', script_path]
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
176 arguments = gather_args(sys.argv, args.infile)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
177 command.extend(arguments)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
178 stats_subfile = None
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
179 if args.stats_file:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
180 if args.stats_file == '-':
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
181 stats_subfile = '-'
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
182 else:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
183 stats_subfile = "{}.{}.log".format(args.stats_file, i)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
184 command.extend(['-s', stats_subfile])
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
185 outfile = tempfile.NamedTemporaryFile('w', delete=False, prefix='sscs.out.part.')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
186 process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=outfile)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
187 worker = {'proc':process, 'outfile':outfile, 'stats':stats_subfile}
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
188 workers.append(worker)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
189 return workers
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
190
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
191
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
192 def gather_args(args, infile, excluded_flags={'-S', '--slurm'},
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
193 excluded_args={'-p', '--processes', '-l', '--log', '-s', '--sscs-file'}):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
194 """Take the full list of command-line arguments and return only the ones which
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
195 should be passed to worker processes.
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
196 Excludes the 0th argument (the command name), the input filename ("infile"), all
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
197 arguments in "excluded_flags", and all arguments in "excluded_args" plus the
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
198 arguments which follow."""
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
199 out_args = []
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
200 skip = True
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
201 for arg in args:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
202 if skip:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
203 skip = False
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
204 continue
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
205 if arg in excluded_flags:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
206 continue
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
207 if arg in excluded_args:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
208 skip = True
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
209 continue
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
210 if arg == infile:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
211 continue
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
212 out_args.append(arg)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
213 return out_args
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
214
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
215
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
216 def delegate(worker, duplex, barcode):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
217 """Send a family to a worker process."""
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
218 for (order, mate), family in duplex.items():
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
219 for read in family:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
220 line = '{}\t{}\t{}\t{name}\t{seq}\t{qual}\n'.format(barcode, order, mate, **read)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
221 if family:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
222 worker['proc'].stdin.write(line)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
223
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
224
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
225 def close_workers(workers):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
226 for worker in workers:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
227 worker['outfile'].close()
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
228 worker['proc'].stdin.close()
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
229
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
230
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
231 def compile_results(workers):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
232 for worker in workers:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
233 worker['proc'].wait()
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
234 with open(worker['outfile'].name, 'r') as outfile:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
235 for line in outfile:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
236 sys.stdout.write(line)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
237
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
238
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
239 def delete_tempfiles(workers):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
240 for worker in workers:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
241 os.remove(worker['outfile'].name)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
242 if worker['stats']:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
243 os.remove(worker['stats'])
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
244
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
245
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
246 def process_duplex(duplex, barcode, workers=None, stats=None, incl_sscs=False, sscs_fh=None,
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
247 processes=1, min_reads=1, qual_thres=' '):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
248 stats['families'] += 1
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
249 # Are we the controller process or a worker?
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
250 if processes > 1:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
251 i = stats['families'] % len(workers)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
252 worker = workers[i]
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
253 delegate(worker, duplex, barcode)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
254 return
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
255 # We're a worker. Actually process the family.
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
256 start = time.time()
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
257 consensi = []
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
258 reads_per_strand = []
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
259 duplex_mate = None
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
260 for (order, mate), family in duplex.items():
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
261 reads = len(family)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
262 if reads < min_reads:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
263 continue
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
264 # The mate number for the duplex consensus. It's arbitrary, but all that matters is that the
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
265 # two mates have different numbers. This system ensures that:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
266 # Mate 1 is from the consensus of ab/1 and ba/2 families, while mate 2 is from ba/1 and ab/2.
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
267 if (order == 'ab' and mate == 1) or (order == 'ba' and mate == 2):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
268 duplex_mate = 1
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
269 else:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
270 duplex_mate = 2
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
271 seqs = [read['seq'] for read in family]
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
272 quals = [read['qual'] for read in family]
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
273 consensi.append(consensus.get_consensus(seqs, quals, qual_thres=qual_thres))
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
274 reads_per_strand.append(reads)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
275 assert len(consensi) <= 2
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
276 if sscs_fh:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
277 for cons, (order, mate), reads in zip(consensi, duplex.keys(), reads_per_strand):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
278 sscs_fh.write('>{bar}.{order}.{mate} {reads}\n'.format(bar=barcode, order=order, mate=mate,
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
279 reads=reads))
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
280 sscs_fh.write(cons+'\n')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
281 if len(consensi) == 1 and incl_sscs:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
282 print_duplex(consensi[0], barcode, duplex_mate, reads_per_strand)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
283 elif len(consensi) == 2:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
284 align = swalign.smith_waterman(*consensi)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
285 #TODO: log error & return if len(align.target) != len(align.query)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
286 cons = consensus.build_consensus_duplex_simple(align.target, align.query)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
287 print_duplex(cons, barcode, duplex_mate, reads_per_strand)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
288 elapsed = time.time() - start
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
289 logging.info('{} sec for {} reads.'.format(elapsed, sum(reads_per_strand)))
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
290 if stats and len(consensi) > 0:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
291 stats['time'] += elapsed
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
292 stats['reads'] += sum(reads_per_strand)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
293 stats['runs'] += 1
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
294
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
295
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
296 def print_duplex(cons, barcode, mate, reads_per_strand, outfile=sys.stdout):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
297 header = '>{bar}.{mate} {reads}'.format(bar=barcode, mate=mate,
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
298 reads='/'.join(map(str, reads_per_strand)))
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
299 outfile.write(header+'\n')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
300 outfile.write(cons+'\n')
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
301
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
302
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
303 def read_fasta(fasta, is_file=True):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
304 """Quick and dirty FASTA parser. Return the sequences and their names.
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
305 Returns a list of sequences. Each is a dict of 'name' and 'seq'.
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
306 Warning: Reads the entire contents of the file into memory at once."""
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
307 sequences = []
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
308 seq_lines = []
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
309 seq_name = None
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
310 if is_file:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
311 with open(fasta) as fasta_file:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
312 fasta_lines = fasta_file.readlines()
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
313 else:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
314 fasta_lines = fasta.splitlines()
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
315 for line in fasta_lines:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
316 if line.startswith('>'):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
317 if seq_lines:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
318 sequences.append({'name':seq_name, 'seq':''.join(seq_lines)})
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
319 seq_lines = []
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
320 seq_name = line.rstrip('\r\n')[1:]
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
321 continue
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
322 seq_lines.append(line.strip())
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
323 if seq_lines:
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
324 sequences.append({'name':seq_name, 'seq':''.join(seq_lines)})
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
325 return sequences
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
326
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
327
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
328 def fail(message):
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
329 sys.stderr.write(message+"\n")
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
330 sys.exit(1)
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
331
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
332 if __name__ == '__main__':
af383638de66 planemo upload commit 022984f323d3da44f70b3bf79c684cfd8dda3f61-dirty
nick
parents:
diff changeset
333 sys.exit(main(sys.argv))