annotate vmap.py @ 44:a8b31d446fec draft default tip

updated vhom regions with multiple hit file input and adapted visualization
author mzeidler
date Mon, 04 Nov 2013 11:57:55 -0500
parents be5d3889a1b9
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1 #!/usr/bin/env python
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
2 #from __future__ import print_function
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
3
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
4 import cProfile
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
5
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
6 import sys
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
7 import re
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
8
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
9 import tempfile
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
10 import subprocess
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
11 import shutil
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
12 import os
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
13 import os.path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
14 import logging
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
15 import bz2
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
16 import zlib
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
17
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
18 import math
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
19 import string
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
20
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
21 from collections import defaultdict, Counter
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
22
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
23 from subprocess import PIPE
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
24
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
25 NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum())
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
26 NON_ID = NON_ID.replace('_', '').replace('-', '')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
27
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
28 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
29 from Bio.SeqRecord import SeqRecord
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
30 from Bio import SeqIO
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
31 from Bio.Seq import Seq
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
32 except ImportError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
33 message = 'This script requires the BioPython python package\n'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
34 sys.stderr.write(message)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
35 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
36
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
37 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
38 from plumbum import cli
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
39 except ImportError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
40 message = 'This script requires the plumbum python package\n'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
41 sys.stderr.write(message)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
42 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
43
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
44
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
45 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
46 import HTSeq
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
47 except ImportError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
48 message = 'This script requires the HTSeq python package\n'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
49 sys.stderr.write(message)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
50 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
51
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
52 KHMER_AVAILABLE = True
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
53 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
54 import khmer
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
55 except ImportError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
56 KHMER_AVAILABLE = False
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
57
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
58 #from io import BufferedRandom
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
59
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
60 logging.basicConfig(level=logging.INFO, format='%(message)s')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
61
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
62
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
63 def profile_this(fn):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
64 def profiled_fn(*args, **kwargs):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
65 fpath = fn.__name__ + ".profile"
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
66 prof = cProfile.Profile()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
67 ret = prof.runcall(fn, *args, **kwargs)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
68 prof.dump_stats(fpath)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
69 return ret
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
70 return profiled_fn
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
71
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
72
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
73 class CLI(cli.Application):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
74 """RNA-Seq and DNA-Seq short read analysis by mapping to known reference sequences"""
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
75 PROGNAME = "vmap"
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
76 VERSION = "1.0.0"
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
77 DESCRIPTION = """Virana vmap is an interface to the NCBI and ensembl reference databases that can
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
78 generate reference indexes for the short read mappers STAR (RNA-Seq) and
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
79 BWA-MEM (DNA-Seq). Short reads can be mapped to arbitrary combinations of
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
80 reference databases and the results can be summarized by taxonomic family
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
81 as well as stored as SAM file, unsorted BAM file, or as a HIT file that
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
82 models multimapping reads between specific reference databases."""
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
83 USAGE = """The program has four modes that can be accessed by `vmap rnaindex`, `vmap dnaindex`, `vmap rnamap`, and `vmap dnamap.`"""
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
84
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
85 def main(self, *args):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
86
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
87 if args:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
88 print("Unknown command %r" % (args[0]))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
89 print self.USAGE
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
90 return 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
91
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
92 if not self.nested_command:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
93 print("No command given")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
94 print self.USAGE
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
95 return 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
96
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
97
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
98 @CLI.subcommand("rnaindex")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
99 class RNAIndex(cli.Application):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
100 """ Creates a STAR index from a FASTA genome reference """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
101
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
102 reference_files = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
103 ['-r', '--reference_file'], str, list=True, mandatory=True,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
104 help="Sets the reference genome(s) FASTA file." +
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
105 " Multiple occurrences of this parameter are allowed.")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
106 index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
107 help="Sets the index output directory." +
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
108 " Directory will be generated if not existing." +
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
109 " Directory will be filled with several index files.")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
110 threads = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
111 ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
112 help="Sets the number of threads to use",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
113 default=1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
114 max_ram = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
115 ['-m'], cli.Range(1, 400000000000), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
116 help="Sets the maximum amount of memory (RAM) to use (in bytes)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
117 default=400000000000)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
118 path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
119 help="Path to STAR executable",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
120 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
121 sparse = cli.Flag(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
122 ["-s", "--sparse"], help="If given, a sparse index that requires less " +
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
123 " RAM in the mapping phase will be constructed")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
124
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
125 debug = cli.Flag(["-d", "--debug"], help="Enable debug output")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
126
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
127 def main(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
128
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
129 if self.debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
130 logging.getLogger().setLevel(logging.DEBUG)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
131
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
132 # Obtain star executable
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
133 star = [self.path and self.path or 'STAR']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
134
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
135 # Check if genome directory is existing
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
136 for reference_file in self.reference_files:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
137 if not os.path.exists(reference_file):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
138 sys.stdout.write(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
139 'Reference file %s nor existing, exiting' % reference_file)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
140 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
141
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
142 # Check if output directory is existing
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
143 if not os.path.exists(self.index_dir):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
144 logging.debug(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
145 'Making output directory for index at %s' % self.index_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
146 os.makedirs(self.index_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
147
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
148 # # Make named pipe to extract genomes
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
149 # pipe_path = os.path.abspath(os.path.join(self.genome_dir, 'pipe.fa'))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
150 # if os.path.exists(pipe_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
151 # os.unlink(pipe_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
152 # os.mkfifo(pipe_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
153
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
154 # Make star command line
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
155 cline = star + ['--runMode', 'genomeGenerate',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
156 '--genomeDir', self.index_dir,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
157 '--limitGenomeGenerateRAM', str(self.max_ram),
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
158 '--runThreadN', str(self.threads),
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
159 '--genomeFastaFiles'] + self.reference_files
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
160
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
161 # Add parameters for sparse (memory-saving) index generation
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
162 if self.sparse:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
163 cline += ['--genomeSAsparseD', '2',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
164 '--genomeChrBinNbits', '12',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
165 '--genomeSAindexNbases', '13']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
166
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
167 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
168 cline += ['--genomeSAsparseD', '1',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
169 '--genomeChrBinNbits', '18',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
170 '--genomeSAindexNbases', '15']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
171
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
172 if self.debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
173 print ' '.join(cline)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
174
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
175 # Run STAR reference generation process
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
176 star_process = subprocess.Popen(' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
177
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
178 # Block until streams are closed by the process
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
179 stdout, stderr = star_process.communicate()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
180
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
181 if stderr:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
182 sys.stderr.write(stderr)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
183
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
184 if self.debug and stdout:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
185 print stdout
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
186
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
187
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
188 @CLI.subcommand("dnaindex")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
189 class DNAIndex(cli.Application):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
190 """ Creates a BWA index from a FASTA reference file """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
191
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
192 reference_file = cli.SwitchAttr(['-r', '--reference_file'], str, mandatory=True,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
193 help="Sets the input reference FASTA file.")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
194 index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
195 help="Sets the index output directory." +
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
196 " Directory will be generated if not existing." +
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
197 " Directory will be filled with several index files.")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
198 path = cli.SwitchAttr(['-p', '--path'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
199 help="Path to BWA executable",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
200 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
201 debug = cli.Flag(["-d", "--debug"], help="Enable debug output")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
202
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
203 if debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
204 logging.getLogger().setLevel(logging.DEBUG)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
205
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
206 def main(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
207
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
208 # Obtain star executable
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
209 bwa = [self.path and self.path or 'bwa']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
210
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
211 # Check if genome directory is existing
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
212 if not os.path.exists(self.reference_file):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
213 sys.stdout.write('Genome file %s nor existing, exiting' % self.reference_file)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
214 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
215
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
216 # Check if output directory is existing
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
217 if not os.path.exists(self.index_dir):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
218 logging.debug('Making output directory %s' % self.index_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
219 os.makedirs(self.index_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
220
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
221 # Make star command line
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
222 cline = bwa + ['index', '-a', 'bwtsw', '-p', os.path.join(self.index_dir, 'index'), self.reference_file]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
223
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
224 if self.debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
225 print ' '.join(cline)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
226
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
227 # Run BWA index generation process
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
228 bwa_process = subprocess.Popen(' '.join(cline), shell=True, stdout=PIPE, stderr=PIPE)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
229 stdout, stderr = bwa_process.communicate()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
230
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
231 if stderr:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
232 sys.stderr.write(stderr)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
233
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
234 if self.debug and stdout:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
235 print stdout
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
236
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
237
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
238
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
239
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
240
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
241 class SAMHits:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
242 """ Converts SAM output of mappers into bzipped HIT files. """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
243
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
244 def __init__(self, output_file, sample_id, refseq_filter=None, min_mapping_score=None,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
245 min_alignment_score=None, max_mismatches=None,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
246 max_relative_mismatches=None, min_continiously_matching=None,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
247 filter_complexity=False):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
248
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
249 self.output_file = bz2.BZ2File(output_file, 'wb', buffering=100 * 1024 * 1024)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
250 self.sample_id = sample_id.translate(None, NON_ID)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
251 self.refseq_filter = refseq_filter
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
252 self.max_mismatches = max_mismatches
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
253 self.max_relative_mismatches = max_relative_mismatches
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
254 self.current_group = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
255 self.min_mapping_score = min_mapping_score
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
256 self.min_alignment_score = min_alignment_score
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
257 self.min_continiously_matching = min_continiously_matching
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
258 self.filter_complexity = filter_complexity
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
259
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
260 self.re_matches = re.compile(r'(\d+)M')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
261 self.re_dels = re.compile(r'(\d+)D')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
262
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
263 def count(self, parsed_line):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
264
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
265 if parsed_line is None:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
266 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
267
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
268 read_key, read_name, flag, ref_name, ref_position, mapping_score,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
269 cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
270 is_end1, is_end2, number_mismatches, alignment_score,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
271 number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
272 is_paired, number_matches, read_end_pos, max_match = parsed_line
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
273
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
274 if not is_mapped:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
275 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
276
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
277 if self.filter_complexity:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
278
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
279 avg_compression = float(len(zlib.compress(seq)))/len(seq)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
280
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
281 if avg_compression < 0.5:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
282 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
283
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
284 # length = len(seq)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
285 # counts = [seq.count(nuc) for nuc in 'ACGT']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
286 # min_count = length * 0.10
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
287 # max_count = length * 0.50
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
288 # for count in counts:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
289 # if count < min_count or count > max_count:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
290 # return None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
291
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
292 # counter = Counter()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
293 # for i in range(length - 2):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
294 # counter[seq[i: i + 3]] += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
295 # maximal = length - 4
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
296
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
297 # highest = sum([v for k, v in counter.most_common(2)])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
298 # if highest > (maximal / 3.0):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
299 # return None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
300
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
301 # self.passed.append(avg_compression)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
302
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
303 pair_id = ''
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
304 if is_end1:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
305 pair_id = '/1'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
306
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
307 elif is_end2:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
308 pair_id = '/2'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
309
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
310 read_name = self.sample_id + ';' + read_name + pair_id
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
311
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
312 # Initialize new current group
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
313 if len(self.current_group) == 0:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
314 self.current_group = [read_name, seq, []]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
315
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
316 # Write old current group to file
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
317 if read_name != self.current_group[0]:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
318 self._write_group()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
319 self.current_group = [read_name, seq, []]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
320
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
321 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
322 refseq_group, family, organism, identifier = ref_name.split(';')[:4]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
323 except ValueError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
324 sys.stderr.write('Read mapped to malformed reference sequence %s, skipping\n' % ref_name)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
325 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
326
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
327 if self.min_continiously_matching:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
328
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
329 if self.min_continiously_matching > max_match:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
330 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
331
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
332 if self.max_mismatches\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
333 and int(number_mismatches) > self.max_mismatches:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
334 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
335
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
336 if self.max_relative_mismatches\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
337 and int(number_mismatches) / float(len(seq))\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
338 > self.max_relative_mismatches:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
339 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
340
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
341 if self.min_mapping_score\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
342 and self.min_mapping_score > mapping_score:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
343 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
344
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
345 if self.min_alignment_score\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
346 and self.min_alignment_score > alignment_score:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
347 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
348
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
349 start = int(ref_position) + 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
350
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
351 self.current_group[2].append([refseq_group, family, organism, identifier, str(start), str(read_end_pos)])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
352
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
353 def _write_group(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
354 passed = True
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
355
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
356 if self.refseq_filter:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
357 passed = False
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
358 for refseq_group, family, organism, identifier, start, end in self.current_group[2]:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
359 if passed:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
360 break
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
361 for f in self.refseq_filter:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
362 if refseq_group == f:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
363 passed = True
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
364 break
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
365 if passed:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
366 description = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
367 for identifier in self.current_group[2]:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
368 description.append(';'.join(identifier))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
369 description = '|'.join(description)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
370
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
371 record = SeqRecord(Seq(self.current_group[1]))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
372 record.id = 'Read;' + self.current_group[0]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
373 record.description = description
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
374
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
375 SeqIO.write([record], self.output_file, "fasta")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
376
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
377 def write(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
378
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
379 self._write_group()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
380 self.output_file.close()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
381
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
382 class SAMParser:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
383
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
384 def parse(self, line):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
385
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
386 if line[0] == '@':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
387 return None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
388
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
389 alignment = HTSeq._HTSeq.SAM_Alignment.from_SAM_line(line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
390 read_name = alignment.read.name
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
391 seq = alignment.read.seq
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
392 qual = alignment.read.qual
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
393 flag = alignment.flag
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
394 cigar = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
395
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
396 is_paired = (flag & 1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
397 is_mapped = not (flag & 4)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
398 is_mate_mapped = alignment.mate_aligned is not None #not (flag & 8)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
399 is_reverse = (flag & 16)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
400 is_end1 = (flag & 64)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
401 is_end2 = (flag & 128)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
402 is_primary = not (flag & 256)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
403
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
404 read_key = (read_name, is_end1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
405
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
406 ref_name = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
407 ref_position = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
408 mapping_score = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
409 mate_ref_name = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
410 mate_ref_position = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
411 insert_size = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
412 alignment_score = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
413 read_end_pos = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
414
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
415 if is_mate_mapped and alignment.mate_start:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
416 mate_ref_name = alignment.mate_start.chrom
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
417 mate_ref_position = alignment.mate_start.start
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
418
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
419 number_hits = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
420 alignment_score = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
421 number_mismatches = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
422
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
423 number_matches = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
424 max_match = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
425
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
426 if is_mapped:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
427
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
428 ref_name = alignment.iv.chrom
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
429 ref_position = alignment.iv.start
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
430 read_end_pos = alignment.iv.end
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
431 alignment_score = alignment.aQual
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
432 cigar = alignment.cigar
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
433
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
434 if is_mate_mapped:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
435 insert_size = alignment.inferred_insert_size
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
436
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
437 for c in cigar:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
438 if c.type == 'M':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
439 number_matches += c.size
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
440 max_match = max(max_match, c.size)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
441
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
442 for tag, value in alignment.optional_fields:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
443 if tag == 'NM':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
444 number_hits = value
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
445 elif tag == 'AS':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
446 alignment_score = value
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
447 elif tag == 'NH':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
448 number_mismatches = value
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
449
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
450 return read_key, read_name, flag, ref_name, ref_position, mapping_score,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
451 cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
452 is_end1, is_end2, number_mismatches, alignment_score,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
453 number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
454 is_paired, number_matches, read_end_pos, max_match
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
455
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
456
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
457 class SAMQuality:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
458
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
459 def __init__(self, file_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
460
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
461 self.file_path = file_path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
462
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
463 self.stored = defaultdict(Counter)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
464 self.all_references = defaultdict(int)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
465 self.primary_references = defaultdict(int)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
466 self.complement = string.maketrans('ATCGN', 'TAGCN')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
467
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
468 if KHMER_AVAILABLE:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
469 self.ktable = khmer.new_ktable(10)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
470
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
471 def _get_complement(self, sequence):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
472
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
473 return sequence.translate(self.complement)[::-1]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
474
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
475 def _get_summary(self, counter):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
476 """"Returns five numbers (sum, extrema, mean, and std)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
477 for a max_frequency counter """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
478
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
479 maximum = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
480 minimum = sys.maxint
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
481 thesum = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
482 allcount = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
483 mode = [0, None]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
484
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
485 items = 0.0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
486 mean = 0.0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
487 m2 = 0.0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
488 variance = 0.0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
489
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
490 for item in counter:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
491
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
492 count = counter[item]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
493 if count > mode[0]:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
494 mode = [count, item]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
495
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
496 allcount += count
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
497 maximum = max(maximum, item)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
498 minimum = min(minimum, item)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
499 thesum += (count * item)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
500
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
501 x = 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
502 while x <= count:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
503 items += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
504 delta = item - mean
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
505 mean = mean + delta / items
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
506 m2 = m2 + delta * (item - mean)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
507 variance = m2 / items
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
508 x += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
509
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
510 std = math.sqrt(variance)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
511
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
512 return allcount, thesum, minimum, maximum, mode[1], mean, std
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
513
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
514
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
515 def _to_unit(self, item, is_percentage=False):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
516 """ Convert a numeric to a string with metric units """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
517
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
518 if is_percentage:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
519 return ('%-.3f' % (item * 100)) + '%'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
520 converted = None
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
521 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
522 item = float(item)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
523 if item > 10**12:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
524 converted = str(round(item / 10**9,3))+'P'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
525 elif item > 10**9:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
526 converted = str(round(item / 10**9,3))+'G'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
527 elif item > 10**6:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
528 converted = str(round(item / 10**6,3))+'M'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
529 elif item > 10**3:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
530 converted = str(round(item / 10**3,3))+'K'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
531 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
532 converted = str(round(item,3))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
533 except:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
534 converted = str(item)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
535
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
536 return converted
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
537
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
538 def _str_metrics(self, data):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
539
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
540 str_metrics = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
541
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
542 for (item, metric) in sorted(data.keys()):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
543 counter = data[(item, metric)]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
544 if not hasattr(counter.iterkeys().next(), 'real'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
545 for element, count in counter.most_common():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
546 str_metrics.append(self._str_metric(item, metric, element, count, no_units=True))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
547 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
548 summary = self._get_summary(counter)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
549 str_metrics.append(self._str_metric(item, metric, *summary))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
550
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
551 return str_metrics
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
552
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
553 def _str_metric(self, item, metric, count, thesum='', minimum='',\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
554 maximum='', mode='', mean='', std='', no_units=False):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
555
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
556 counters = [count, thesum, minimum, maximum, mode, mean, std]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
557 counters = map(str, counters)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
558
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
559 if no_units:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
560 items = [item, metric] + counters
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
561 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
562 units = map(self._to_unit, counters)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
563 items = [item, metric] + units
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
564
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
565 return '%-15s\t%-60s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\n' \
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
566 % tuple(items)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
567
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
568
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
569 def _count_read(self, metric, data, sample):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
570
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
571 item = 'read'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
572
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
573 (insert_size, alignment_score, mapping_score, length,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
574 q20_length, avg_phred_quality, number_hits, is_reverse) = data
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
575
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
576 self.stored[(item, metric + ' mappings')][number_hits] += sample
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
577 self.stored[(item, metric + ' insert')][insert_size] += sample
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
578
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
579
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
580 def _count_segment(self, metric, data, sample):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
581
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
582 item = 'segment'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
583
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
584 (insert_size, alignment_score, mapping_score, length,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
585 q20_length, avg_phred_quality, number_hits, is_reverse) = data
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
586
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
587 self.stored[(item, metric + ' algq')][alignment_score] += sample
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
588 self.stored[(item, metric + ' mapq')][mapping_score] += sample
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
589 self.stored[(item, metric + ' length')][length] += sample
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
590 self.stored[(item, metric + ' q20length')][q20_length] += sample
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
591 self.stored[(item, metric + ' meanbasequal')][avg_phred_quality] += sample
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
592 self.stored[(item, metric + ' reverse')][is_reverse] += sample
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
593
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
594 def count(self, parsed_line):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
595
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
596 if parsed_line is None:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
597 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
598
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
599 #print_metric('Item' , 'Metric', 'Count', 'Sum', 'Min', 'Max', 'Mode', 'Mean', 'STD')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
600
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
601 read_key, read_name, flag, ref_name, ref_position, mapping_score,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
602 cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
603 is_end1, is_end2, number_mismatches, alignment_score,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
604 number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
605 is_paired, number_matches, read_end_pos, max_match = parsed_line
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
606
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
607 phred_quality = [q - 33 for q in qual]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
608 avg_phred_quality = sum(phred_quality) / float(len(phred_quality))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
609 length = len(seq)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
610 mate_reference_id = mate_ref_name
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
611 reference_id = ref_name
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
612 reference = reference_id is not None and reference_id != '*'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
613 insert_size = insert_size and abs(insert_size) or insert_size
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
614 is_segment1 = not is_paired or (is_paired and is_end1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
615 is_reverse = is_reverse
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
616 is_unique = is_primary and number_hits == 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
617 is_translocation = is_paired and is_mapped and is_mate_mapped\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
618 and (mate_reference_id != '=' and reference_id != mate_reference_id)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
619 is_part_of_doublemap = is_paired and is_mapped and is_mate_mapped
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
620 is_part_of_halfmap = is_paired and (is_mapped != is_mate_mapped)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
621 is_part_of_nomap = is_paired and not is_mapped and not is_mate_mapped
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
622
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
623
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
624 # Count length until first low quality base call
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
625 q20_length = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
626 for q in phred_quality:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
627 if q < 20:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
628 break
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
629 q20_length += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
630
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
631 # Count kmers
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
632 if KHMER_AVAILABLE:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
633 if not is_reverse:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
634 self.ktable.consume(seq)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
635 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
636 self.ktable.consume(self._get_complement(seq))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
637
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
638 if reference:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
639
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
640 self.all_references[reference_id] += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
641
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
642 if is_primary:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
643 self.primary_references[reference_id] += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
644
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
645
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
646 data = (insert_size, alignment_score, mapping_score, length,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
647 q20_length, avg_phred_quality, number_hits, is_reverse)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
648
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
649 sample = 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
650
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
651 self._count_segment('sequenced', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
652 if is_mapped:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
653 self._count_segment('sequenced mapped multi', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
654 if is_primary:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
655 self._count_segment('sequenced mapped primary', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
656 if number_hits and is_unique:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
657 self._count_segment('sequenced mapped primary unique', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
658
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
659 if is_segment1:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
660 self._count_read('sequenced mapped multi', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
661 if is_primary:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
662 self._count_read('sequenced mapped primary', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
663
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
664 if is_paired:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
665 self._count_segment('sequenced paired', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
666
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
667 if is_part_of_doublemap:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
668 self._count_segment('sequenced paired doublemap', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
669
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
670 if is_primary:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
671 self._count_segment('sequenced paired doublemap primary', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
672
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
673 if is_segment1:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
674 self._count_read('sequenced paired doublemap multi', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
675
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
676 if is_primary:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
677 self._count_read('sequenced paired doublemap primary', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
678
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
679 if number_hits and is_unique:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
680 self._count_read('sequenced paired doublemap primary unique', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
681
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
682 if is_translocation:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
683 self._count_read('sequenced paired doublemap primary unique translocation', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
684
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
685 elif is_part_of_halfmap:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
686
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
687 self._count_segment('sequenced paired halfmap', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
688
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
689 # The mapped segment
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
690 if is_mapped:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
691
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
692 self._count_segment('sequenced paired halfmap mapped', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
693
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
694 if is_primary:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
695
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
696 self._count_read('sequenced paired halfmap mapped primary', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
697
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
698 if number_hits and is_unique:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
699 self._count_read('sequenced paired halfmap mapped primary unique', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
700
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
701 elif not is_primary:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
702
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
703 self._count_read('sequenced unpaired mapped multi', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
704
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
705 # The unmapped segment
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
706 elif not is_mapped:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
707 self._count_segment('sequenced paired halfmap unmapped', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
708
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
709 elif is_part_of_nomap:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
710
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
711 self._count_segment('sequenced paired nomap', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
712
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
713 if is_segment1:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
714
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
715 self._count_read('sequenced paired nomap', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
716
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
717 elif not is_paired:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
718
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
719 self._count_segment('sequenced unpaired', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
720
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
721 if is_mapped:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
722
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
723 self._count_segment('sequenced unpaired mapped', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
724
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
725 if is_primary:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
726
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
727 self._count_read('sequenced unpaired mapped primary', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
728
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
729 if number_hits and is_unique:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
730
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
731 self._count_read('sequenced paired unpaired mapped primary unique', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
732
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
733 elif not is_primary:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
734
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
735 self._count_read('sequenced unpaired mapped multi', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
736
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
737
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
738 elif not is_mapped:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
739
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
740 self._count_segment('sequenced unpaired unmapped', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
741
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
742 if is_segment1:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
743 self._count_read('sequenced unpaired unmapped', data, sample)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
744
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
745 def write(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
746
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
747 with open(self.file_path, 'w') as output_file:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
748
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
749 all_references = sorted([(count, reference) for reference, count\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
750 in self.all_references.iteritems()], reverse=True)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
751
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
752 for j, (count, reference) in enumerate(all_references[:30]):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
753 self.stored[('segment', 'multireference_' + str(j+1))][reference] = count
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
754
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
755 primary_references = sorted([(count, reference) for reference, count\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
756 in self.primary_references.iteritems()], reverse=True)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
757
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
758 for j, (count, reference) in enumerate(primary_references[:30]):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
759 self.stored[('segment', 'primaryreference_' + str(j+1))][reference] = count
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
760
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
761 # Extract top-ranking kmers
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
762 if KHMER_AVAILABLE:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
763 kmer_frequencies = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
764 for i in range(0, self.ktable.n_entries()):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
765 n = self.ktable.get(i)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
766 if n > 0:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
767 kmer_frequencies.append((n, self.ktable.reverse_hash(i)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
768 kmer_frequencies = sorted(kmer_frequencies, reverse=True)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
769 for j, (frequency, kmer) in enumerate(kmer_frequencies[:10]):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
770 self.stored[('segment', 'kmer_' + str(j+1))][kmer] = frequency
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
771
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
772 output_file.writelines(self._str_metrics(self.stored))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
773
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
774
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
775 class SAMTaxonomy:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
776 """ Provides taxonomic summary information from a SAM file stream. """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
777
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
778 def __init__(self, file_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
779
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
780 self.file_path = file_path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
781
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
782 self.count_primaries = Counter()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
783 self.detailed_information = {}
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
784
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
785 self._last_read = (None, None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
786 self._last_read_human_prim = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
787 self._last_read_human_sec = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
788 self._last_organisms = set()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
789
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
790 def count(self, parsed_line):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
791
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
792 if parsed_line is None:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
793 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
794
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
795 read_key, read_name, flag, ref_name, ref_position, mapping_score,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
796 cigar, mate_ref_name, mate_ref_position, insert_size, seq, qual,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
797 is_end1, is_end2, number_mismatches, alignment_score,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
798 number_hits, is_reverse, is_primary, is_mapped, is_mate_mapped,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
799 is_paired, number_matches, read_end_pos, max_match = parsed_line
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
800
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
801 if is_mapped:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
802
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
803 refseq_group, family, organism, gi = ref_name.split(';')[:4]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
804
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
805 if is_primary:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
806 self.count_primaries[organism] += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
807
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
808 if organism not in self.detailed_information:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
809 # refseq_group. family, gis, avg_mapping_score,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
810 # avg_seq_length, avg_number_hits, avg_alignment_score, avg_nr_mismatches
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
811 initial = [refseq_group,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
812 family,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
813 set([gi]),
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
814 [int(mapping_score), 1],
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
815 [len(seq), 1],
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
816 [0, 0],
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
817 [alignment_score, 1],
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
818 [number_mismatches, 1],
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
819 0,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
820 0]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
821
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
822 self.detailed_information[organism] = initial
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
823
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
824 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
825 entry = self.detailed_information[organism]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
826 entry[2].add(gi)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
827 entry[3][0] += int(mapping_score)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
828 entry[3][1] += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
829 entry[4][0] += len(seq)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
830 entry[4][1] += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
831 entry[6][0] += alignment_score
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
832 entry[6][1] += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
833 entry[7][0] += number_mismatches
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
834 entry[7][1] += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
835
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
836 if is_primary:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
837 entry = self.detailed_information[organism]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
838 entry[5][0] += number_hits
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
839 entry[5][1] += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
840
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
841 if self._last_read == (None, None):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
842 self._last_read = read_key
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
843
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
844 if self._last_read != read_key:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
845
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
846 for last_organism in self._last_organisms:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
847
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
848 self.detailed_information[last_organism][8]\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
849 += self._last_read_human_prim
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
850
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
851 self.detailed_information[last_organism][9]\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
852 += self._last_read_human_sec
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
853
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
854 self._last_read = read_key
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
855 self._last_organisms = set()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
856 self._last_read_human_prim = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
857 self._last_read_human_sec = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
858
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
859 self._last_organisms.add(organism)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
860
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
861 if organism == 'Homo_sapiens':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
862 if is_primary:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
863 self._last_read_human_prim += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
864 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
865 self._last_read_human_sec += 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
866
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
867 def get_summary(self, top=100):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
868
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
869 lines = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
870
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
871 lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10s\t%10s\t%5s\t%5s\t%5s\t%10s\t%10s\n'\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
872 % ('Count', 'Group', 'Family', 'Organism', 'Targets', 'ReadLen', 'Hits', 'Map', 'Algn', 'Mism', 'HuP', 'HuS'))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
873
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
874 top_organisms = self.count_primaries.most_common(top)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
875
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
876 for organism, count in top_organisms:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
877
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
878 refseq_group, family, identifiers,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
879 avg_mapping_score, avg_seq_length, avg_number_hits,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
880 avg_alignment_score, avg_nr_mismatches, human_prim, human_sec\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
881 = self.detailed_information[organism]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
882
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
883 avg_len = int(avg_seq_length[0] / float(avg_seq_length[1]))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
884 if avg_number_hits[1] == 0:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
885 avg_hits = 0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
886 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
887 avg_hits = int(avg_number_hits[
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
888 0] / float(avg_number_hits[1]))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
889
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
890 avg_mapping_score = int(avg_mapping_score[
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
891 0] / float(avg_mapping_score[1]))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
892
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
893 avg_alignment_score = int(avg_alignment_score[
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
894 0] / float(avg_alignment_score[1]))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
895
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
896 avg_nr_mismatches = int(avg_nr_mismatches[
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
897 0] / float(avg_nr_mismatches[1]))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
898
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
899 nr_ids = len(identifiers)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
900
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
901 if count > 10**6:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
902 count = str(round(count / float(10**6), 3)) + 'M'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
903 if human_prim > 10**6:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
904 human_prim = str(round(human_prim / float(10**6), 3)) + 'M'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
905 if human_sec > 10**6:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
906 human_sec = str(round(human_sec / float(10**6), 3)) + 'M'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
907 if nr_ids > 10**6:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
908 nr_ids = str(round(nr_ids / float(10**6), 3)) + 'M'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
909
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
910 lines.append('%10s\t%20s\t%20s\t%-20s\t%10s\t%10i\t%10i\t%5i\t%5i\t%5i\t%10s\t%10s\n'\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
911 % (str(count), refseq_group[:20], family[:20], organism[:20],\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
912 str(nr_ids), avg_len, avg_hits, avg_mapping_score,\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
913 avg_alignment_score, avg_nr_mismatches, str(human_prim),\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
914 str(human_sec)))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
915
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
916 return lines
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
917
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
918 def write(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
919
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
920 with open(self.file_path, 'w') as output_file:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
921 output_file.writelines(self.get_summary())
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
922
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
923 @CLI.subcommand("rnamap")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
924 class RNAmap(cli.Application):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
925 """ Map input reads against a STAR index """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
926
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
927 index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
928 help="Sets the index output directory")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
929
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
930 threads = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
931 ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
932 help="Sets the number of threads to use",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
933 default=1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
934
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
935 taxonomy = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
936 ['-x', '--taxonomy'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
937 help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
938 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
939
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
940 star_path = cli.SwitchAttr(['--star_path'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
941 help="Path to STAR executable",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
942 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
943
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
944 samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
945 help="Path to samtools executable",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
946 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
947
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
948 temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
949 help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
950 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
951
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
952 min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
953 help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
954 default=None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
955
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
956 min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
957 help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
958 default=None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
959
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
960 max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
961 help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
962 default=None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
963
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
964 max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
965 help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
966 default=None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
967
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
968 min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
969 help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
970 default=None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
971
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
972 filter_complexity = cli.Flag(['--filter_complexity'],
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
973 help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
974 default=False)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
975
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
976 bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
977 help="Path to unsorted, unindexed output BAM file",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
978 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
979
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
980 sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
981 help="Path to output SAM file",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
982 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
983
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
984 qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
985 help="Path to output quality file",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
986 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
987
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
988 hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
989 help="Path to bzip2-compressed tab-delimited output hit file",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
990 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
991
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
992 sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
993 help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
994 default='no_sample_id')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
995
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
996 unmapped1 = cli.SwitchAttr(['--unmapped_end_1'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
997 help="Output path to uncompressed fastq file containing unmapped reads, first ends only for paired ends.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
998 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
999
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1000 unmapped2 = cli.SwitchAttr(['--unmapped_end_2'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1001 help="Output path to uncompressed fastq file containing unmapped reads, second ends only for paired ends.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1002 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1003
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1004 splice_junctions = cli.SwitchAttr(['--splice_junctions'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1005 help="Input path to splice junction file (currently not implemented)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1006 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1007
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1008 chimeric_mappings = cli.SwitchAttr(['--chimeric_mappings'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1009 help="Ouput path to SAM file containing chimeric mappings",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1010 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1011
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1012 hit_filter = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1013 ['-f', '--virana_hit_filter'], str, list=True, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1014 help="Only generate hit groups that include at last one read mapping to a reference of this reference group.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1015 default=[])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1016
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1017 debug = cli.Flag(["-d", "--debug"], help="Enable debug information")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1018
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1019 reads = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1020 ['-r', '--reads'], str, list=True, mandatory=True,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1021 help="Sets the input reads. Add this parameter twice for paired end reads.")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1022
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1023 zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1024
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1025 sensitive = cli.Flag(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1026 ["--sensitive"], help="If given, mapping will process slower and more sensitive")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1027
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1028 def main(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1029
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1030 if self.debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1031 logging.getLogger().setLevel(logging.DEBUG)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1032
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1033 # Obtain star executable
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1034 star = [self.star_path and self.star_path or 'STAR']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1035 samtools = [self.samtools_path and self.samtools_path or 'samtools']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1036
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1037 # Check if genome directory is existing
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1038 if not os.path.exists(self.index_dir):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1039 sys.stdout.write('Index directory %s not existing, exiting' % self.genome_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1040 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1041
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1042 if self.temp_path:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1043 temp_path = tempfile.mkdtemp(dir=self.temp_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1044 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1045 temp_path = tempfile.mkdtemp()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1046
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1047 first_ends = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1048 second_ends = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1049 single_ends = []
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1050
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1051 if len(self.reads) == 2:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1052 first, second = self.reads
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1053 first_ends.append(first)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1054 second_ends.append(second)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1055
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1056 elif len(self.reads) == 1:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1057 single_ends.append(self.reads[0])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1058
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1059 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1060 sys.stdout.write('Invalid number of fastq files; provide either one (single end) or two (paired end)')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1061 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1062
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1063 if single_ends and not first_ends and not second_ends:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1064 reads = [','.join(single_ends)]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1065
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1066 elif first_ends and second_ends:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1067 reads = [','.join(first_ends), ','.join(second_ends)]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1068
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1069
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1070 star_cline = star + ['--runMode', 'alignReads',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1071 '--genomeDir', self.index_dir,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1072 '--runThreadN', str(self.threads),
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1073 '--readMatesLengthsIn', 'NotEqual',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1074 '--outFileNamePrefix', os.path.join(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1075 temp_path, 'out'),
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1076 '--outSAMmode', 'Full',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1077 '--outSAMstrandField', 'None',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1078 '--outSAMattributes', 'Standard',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1079 '--outSAMunmapped', 'Within',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1080 '--outStd', 'SAM',
9
be5d3889a1b9 Uploaded
mzeidler
parents: 0
diff changeset
1081 '--outFilterMultimapNmax', '1000']
0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1082
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1083 if self.unmapped1 or self.unmapped2:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1084 star_cline += ['--outReadsUnmapped', 'Fastx']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1085 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1086 star_cline += ['--outReadsUnmapped', 'None']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1087
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1088
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1089 if self.zipped:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1090 star_cline += ['--readFilesCommand', 'zcat']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1091
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1092 if self.sensitive:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1093 star_cline += ['--outFilterMultimapScoreRange', '10',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1094 '--outFilterMismatchNmax', '60',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1095 '--outFilterMismatchNoverLmax', '0.3',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1096 '--outFilterScoreMin', '0',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1097 '--outFilterScoreMinOverLread', '0.3',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1098 '--outFilterMatchNmin', '0',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1099 '--outFilterMatchNminOverLread', '0.66',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1100 '--seedSearchStartLmax', '12',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1101 '--winAnchorMultimapNmax', '50']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1102
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1103 star_cline += ['--readFilesIn'] + reads
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1104
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1105 if self.debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1106 print ' '.join(star_cline)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1107
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1108 # Try if we can make the relevant files
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1109 touch_files = [self.unmapped1, self.unmapped2, self.taxonomy, self.qual, self.hits, self.sam, self.bam]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1110 for file_path in touch_files:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1111 if file_path is None or file_path == '':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1112 continue
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1113 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1114 with file(file_path, 'a'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1115 os.utime(file_path, None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1116 except IOError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1117 sys.stderr.write('Could not write output file %s\n' % file_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1118 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1119
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1120 star_process = subprocess.Popen(' '.join(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1121 star_cline), shell=True, stdout=PIPE)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1122
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1123 parser = SAMParser()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1124
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1125 if self.taxonomy:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1126 taxonomy = SAMTaxonomy(self.taxonomy)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1127
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1128 if self.qual:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1129 quality = SAMQuality(self.qual)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1130
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1131 if self.hits:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1132 hits = SAMHits(self.hits, self.sample_id, self.hit_filter,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1133 self.min_mapping_score,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1134 self.min_alignment_score,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1135 self.max_mismatches,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1136 self.max_relative_mismatches,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1137 self.min_continiously_matching,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1138 self.filter_complexity)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1139
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1140 if self.sam:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1141 sam_file = open(self.sam, 'w')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1142
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1143 if self.bam:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1144 with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1145 samtools_cline = samtools + [
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1146 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1147 if self.debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1148 print ' '.join(samtools_cline)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1149 samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1150
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1151
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1152 do_sam = self.sam
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1153 do_bam = self.bam
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1154 do_taxonomy = self.taxonomy
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1155 do_qual = self.qual
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1156 do_hits = self.hits
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1157 do_parse = do_taxonomy or do_qual or do_hits
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1158
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1159 for i, line in enumerate(iter(star_process.stdout.readline, '')):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1160
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1161 if do_sam:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1162 sam_file.write(line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1163
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1164 if do_bam:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1165 samtools_process.stdin.write(line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1166
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1167 if line[0] == '@':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1168 continue
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1169
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1170 if do_parse:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1171 parsed_line = parser.parse(line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1172
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1173 if do_taxonomy:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1174 taxonomy.count(parsed_line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1175 if i > 0 and (i % 50000) == 0:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1176 print ''.join(taxonomy.get_summary(10))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1177
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1178 if do_qual:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1179 quality.count(parsed_line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1180
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1181 if do_hits:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1182 hits.count(parsed_line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1183
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1184 if do_bam:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1185 samtools_process.stdin.close()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1186
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1187 if do_sam:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1188 sam_file.close()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1189
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1190 if do_hits:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1191 hits.write()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1192
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1193 if do_taxonomy:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1194 print ''.join(taxonomy.get_summary(10))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1195 taxonomy.write()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1196
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1197 if do_qual:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1198 quality.write()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1199
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1200 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1201 if self.unmapped1:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1202 shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate1'),\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1203 self.unmapped1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1204 except IOError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1205 pass
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1206
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1207 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1208 if self.unmapped2:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1209 shutil.move(os.path.join(temp_path, 'out' + 'Unmapped.out.mate2'),\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1210 self.unmapped2)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1211 except IOError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1212 pass
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1213
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1214 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1215 if self.chimeric_mappings:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1216 shutil.move(os.path.join(temp_path, 'out' + 'Chimeric.out.sam'),\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1217 self.chimeric_mappings)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1218 except IOError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1219 pass
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1220
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1221 shutil.rmtree(temp_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1222
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1223 @CLI.subcommand("dnamap")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1224 class DNAmap(cli.Application):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1225 """ Map input reads against a BWA index """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1226
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1227 index_dir = cli.SwitchAttr(['-i', '--index_dir'], str, mandatory=True,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1228 help="Sets the index output directory")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1229
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1230 threads = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1231 ['-t', '--threads'], cli.Range(1, 512), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1232 help="Sets the number of threads to use",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1233 default=1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1234
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1235 taxonomy = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1236 ['-x', '--taxonomy'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1237 help="Output path for the taxonomy file; setting this option will also enable regular taxonomy output to stdout during mapping",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1238 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1239
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1240 samtools_path = cli.SwitchAttr(['--samtools_path'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1241 help="Path to samtools executable",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1242 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1243
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1244 temp_path = cli.SwitchAttr(['--temporary_path'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1245 help="Path to temporary directory in which to generate temp files. All temp files with be automatically deleted after execution is complete.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1246 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1247
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1248 min_mapping_score = cli.SwitchAttr(['--min_mapping_score'], cli.Range(1, 255), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1249 help="Mimimum mapping score for saved hits (only applied to -v/--virana_hits)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1250 default=None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1251
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1252 min_alignment_score = cli.SwitchAttr(['--min_alignment_score'], cli.Range(1, 255), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1253 help="Mimimum alignment score for saved hits (only applied to -v/--virana_hits)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1254 default=None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1255
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1256 max_mismatches = cli.SwitchAttr(['--max_mismatches'], cli.Range(0, 10000000), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1257 help="Maximum number of mismatches for saved hits (only applied to -v/--virana_hits)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1258 default=None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1259
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1260 max_relative_mismatches = cli.SwitchAttr(['--max_relative_mismatches'], float, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1261 help="Maximum number of mismatches relative to read length for saved hits (only applied to -v/--virana_hits)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1262 default=None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1263
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1264 min_continiously_matching = cli.SwitchAttr(['--min_continiously_matching'], cli.Range(0, 10000000), mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1265 help="Minimum number of continious matches for saved hits (only applied to -v/--virana_hits)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1266 default=None)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1267
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1268 filter_complexity = cli.Flag(['--filter_complexity'],
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1269 help="Discard low-complexity reads (only applied to -v/--virana_hits). Adds some extra processing load to the mapping and may discard important information. Applies to all output files, including quality files (!)",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1270 default=False)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1271
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1272 sample_id = cli.SwitchAttr(['--sample_id'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1273 help="Alphanumeric string ([0-9a-zA-Z_-]*) used to designate sample information within the hit file",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1274 default='no_sample_id')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1275
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1276 bam = cli.SwitchAttr(['-b', '--bam'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1277 help="Path to unsorted, unindexed output BAM file",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1278 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1279
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1280 sam = cli.SwitchAttr(['-s', '--sam'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1281 help="Path to output SAM file",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1282 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1283
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1284 qual = cli.SwitchAttr(['-q', '--qual'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1285 help="Path to output quality file",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1286 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1287
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1288 hits = cli.SwitchAttr(['-v', '--virana_hits'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1289 help="Path to bzip2-compressed tab-delimited output virana hit file",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1290 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1291
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1292 hit_filter = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1293 ['-f', '--virana_hit_filter'], str, list=True, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1294 help="Only generate hit groups that include at last one read mapping to a reference of this reference group.",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1295 default=[])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1296
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1297 debug = cli.Flag(["-d", "--debug"], help="Enable debug information")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1298
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1299 zipped = cli.Flag(["-z", "--zipped"], help="Input reads are zipped")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1300
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1301 sensitive = cli.Flag(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1302 ["--sensitive"], help="If given, mapping will process slower and more sensitive")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1303
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1304
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1305 bwa_path = cli.SwitchAttr(['--bwa_path'], str, mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1306 help="Path to BWA executable",
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1307 default='')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1308
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1309 debug = cli.Flag(["-d", "--debug"], help="Enable debug information")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1310
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1311 if debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1312 logging.getLogger().setLevel(logging.DEBUG)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1313
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1314 reads = cli.SwitchAttr(
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1315 ['-r', '--reads'], str, list=True, mandatory=True,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1316 help="Sets the input reads. Add this parameter twice for paired end reads.")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1317
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1318 def main(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1319
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1320 if self.debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1321 logging.getLogger().setLevel(logging.DEBUG)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1322
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1323 # Obtain star executable
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1324 bwa = [self.bwa_path and self.bwa_path or 'bwa']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1325 samtools = [self.samtools_path and self.samtools_path or 'samtools']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1326
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1327 # Check if genome directory is existing
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1328 if not os.path.exists(self.index_dir):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1329 sys.stdout.write('Index directory %s not existing, exiting'\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1330 % self.genome_dir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1331 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1332
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1333 if len(self.reads) not in (1, 2):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1334 message = 'Invalid number of FASTQ files; supply either one (single end) or two (paired end)\n'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1335 sys.stderr.write(message)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1336 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1337
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1338 bwa_cline = bwa + ['mem', '-t', str(self.threads), '-M', os.path.join(self.index_dir, 'index')]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1339
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1340 bwa_cline += self.reads
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1341
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1342 if self.debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1343 print ' '.join(bwa_cline)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1344
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1345 bwa_process = subprocess.Popen(' '.join(bwa_cline), shell=True, stdout=PIPE)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1346
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1347 parser = SAMParser()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1348
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1349 if self.taxonomy:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1350 taxonomy = SAMTaxonomy(self.taxonomy)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1351
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1352 if self.qual:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1353 quality = SAMQuality(self.qual)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1354
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1355 if self.hits:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1356 hits = SAMHits(self.hits, self.sample_id, self.hit_filter,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1357 self.min_mapping_score,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1358 self.min_alignment_score,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1359 self.max_mismatches,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1360 self.max_relative_mismatches,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1361 self.min_continiously_matching,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1362 self.filter_complexity)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1363
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1364 if self.sam:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1365 sam_file = open(self.sam, 'w', buffering=100 * 1024 * 1024)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1366
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1367 if self.bam:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1368 with open(self.bam, 'wb', buffering=100 * 1024 * 1024) as bam_file:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1369 samtools_cline = samtools + [
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1370 'view', '-b', '-1', '-S', '-@', '4', '/dev/stdin']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1371 if self.debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1372 print ' '.join(samtools_cline)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1373 samtools_process = subprocess.Popen(' '.join(samtools_cline), shell=True, stdout=bam_file, stdin=PIPE)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1374
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1375 do_sam = self.sam
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1376 do_bam = self.bam
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1377 do_taxonomy = self.taxonomy
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1378 do_qual = self.qual
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1379 do_hits = self.hits
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1380 do_parse = do_taxonomy or do_qual or do_hits
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1381
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1382 for i, line in enumerate(iter(bwa_process.stdout.readline, '')):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1383
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1384 if do_sam:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1385 sam_file.write(line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1386
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1387 if do_bam:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1388 samtools_process.stdin.write(line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1389
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1390 if do_parse:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1391 parsed_line = parser.parse(line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1392
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1393 if do_taxonomy:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1394 taxonomy.count(parsed_line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1395
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1396 if i > 0 and (i % 10000) == 0:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1397 print ''.join(taxonomy.get_summary(10))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1398
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1399 if do_qual:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1400 quality.count(parsed_line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1401
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1402 if do_hits:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1403 hits.count(parsed_line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1404
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1405 if do_bam:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1406 samtools_process.stdin.close()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1407
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1408 if do_sam:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1409 sam_file.close()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1410
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1411 if do_hits:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1412 hits.write()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1413
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1414 if do_taxonomy:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1415 print ''.join(taxonomy.get_summary(10))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1416 taxonomy.write()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1417
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1418 if do_qual:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1419 quality.write()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1420
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1421
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1422 if __name__ == "__main__":
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1423 CLI.run()