comparison pathoscope2.py @ 1:d153ec2e2fec draft

Uploaded
author jasper
date Sun, 08 Jan 2017 19:39:46 -0500
parents
children
comparison
equal deleted inserted replaced
0:3fc482539c36 1:d153ec2e2fec
1 #!/usr/bin/env python
2 # Initial author: Solaiappan Manimaran
3 # Wrapper file for the following modules:
4 # patholib: generates host/target genome libraries from ncbi nt database for given taxon IDs
5 # pathomap: aligns reads to host/target database independent of read type using Bowtie2
6 # pathoid: reassigns ambiguous reads to the correct genome using statistical models
7 # pathoreport: Writes sam files to xml format
8
9 #usage information: pathoscope.py -h
10
11 # Pathoscope 2.0 - Predicts strains of genomes in unassembled Nextgen seq data
12 # Copyright (C) 2013 Johnson Lab - Boston University
13 #
14 # This program is free software: you can redistribute it and/or modify
15 # it under the terms of the GNU General Public License as published by
16 # the Free Software Foundation, either version 3 of the License, or
17 # any later version.
18 #
19 # This program is distributed in the hope that it will be useful,
20 # but WITHOUT ANY WARRANTY; without even the implied warranty of
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 # GNU General Public License for more details.
23 #
24 # You should have received a copy of the GNU General Public License
25 # along with this program. If not, see <http://www.gnu.org/licenses/>.
26
27 import os, sys
28 pathoscopedir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
29 sys.path.insert(0,pathoscopedir)
30
31 import argparse
32 import pathoscope
33 from pathoscope.patholib import pathoLib
34 from pathoscope.pathomap import PathoMapA
35 from pathoscope.pathoid import PathoID
36 from pathoscope.pathoreport import PathoReportA
37 from time import time
38
39 # ===========================================================
40 # main ()
41 parser = argparse.ArgumentParser(description="Pathoscope")
42
43 # create the top-level parser
44 parser.add_argument('--version', action='version', version='%(prog)s 2.0.6')
45 parser.add_argument('--verbose', action='store_const', dest='verbose',
46 required=False, const=True, default=False, help='Prints verbose text while running')
47 subparsers = parser.add_subparsers(dest='subcommand', help='Select one of the following sub-commands')
48
49 # create the parser for the "LIB" command
50 parser_a = subparsers.add_parser('LIB', help='Pathoscope taxon level reference genome Library creation Module')
51 parser_a.add_argument('-genomeFile', action='store', dest='lib_reference', required=True,
52 help='Specify reference genome(Download ftp://ftp.ncbi.nih.gov/blast/db/FASTA/nt.gz)')
53 parser_a.add_argument('-taxonIds', action='store', dest='lib_taxon_ids', required=False, default='X',
54 help='Specify taxon ids of your interest with comma separated '
55 '(if you have multiple taxon ids). If you do not specify this option, '
56 'it will work on all entries in the reference file. For taxonomy id lookup, '
57 'refer to http://www.ncbi.nlm.nih.gov/taxonomy')
58 parser_a.add_argument('-excludeTaxonIds', action='store', dest='lib_exclude_taxon_ids',
59 required=False, default='X',
60 help='Specify taxon ids to exclude with comma separated '
61 '(if you have multiple taxon ids to exclude).')
62 parser_a.add_argument('--noDesc', action='store_const', dest='lib_nodesc', required=False, const=True,
63 default=False, help='Do not keep an additional description in original fasta seq header.'
64 'Depending on NGS aligner, a long sequence header may slow down its mapping process.')
65 parser_a.add_argument('--subTax', action='store_const', dest='lib_subtax', required=False,
66 const=True, default=False, help='To include all sub taxonomies under the query taxonomy id.'
67 ' e.g., if you set -t 4751 --subtax, it will cover all sub taxonomies under taxon id 4751 '
68 '(fungi).')
69 parser_a.add_argument('--online', action='store_const', dest='lib_online_search', required=False,
70 const=True, default=False, help='To enable online searching in case you cannot find a '
71 'correct taxonomy id for a given gi. When there are many entries in nt whose gi is invalid, '
72 'this option may slow down the overall process.')
73 parser_a.add_argument('-dbhost', action='store', dest='lib_dbhost', required=False,
74 default='localhost', help='specify hostname running mysql if you want to use mysql '
75 'instead of hash method in mapping gi to taxonomy id')
76 parser_a.add_argument('-dbport', action='store', dest='lib_dbport', required=False,
77 default=3306, type=int, help='provide mysql server port if different from default (3306)')
78 parser_a.add_argument('-dbuser', action='store', dest='lib_dbuser', required=False, default='X',
79 help='user name to access mysql')
80 parser_a.add_argument('-dbpasswd', action='store', dest='lib_dbpasswd', required=False, default='X',
81 help='provide password associate with user')
82 parser_a.add_argument('-db', action='store', dest='lib_db', required=False, default='pathodb',
83 help='mysql pathoscope database name (default: pathodb)')
84 parser_a.add_argument('-outDir', action='store', default='.', dest='lib_outdir',
85 help='Output Directory (Default=. (current directory))')
86 parser_a.add_argument('-outPrefix', action='store', dest='lib_outprefix', required=True,
87 help='specify an output prefix to name your target database')
88
89 # create the parser for the "MAP" command
90 parser_b = subparsers.add_parser('MAP', help='Pathoscope MAP Module')
91 parser_b.add_argument('-U', default='', action='store', dest='map_inputread', required=False,
92 help='Input Read Fastq File (Unpaired/Single-end)')
93 parser_b.add_argument('-1', default='', action='store', dest='map_inputread1', required=False,
94 help='Input Read Fastq File (Pair 1)')
95 parser_b.add_argument('-2', default='', action='store', dest='map_inputread2', required=False,
96 help='Input Read Fastq File (Pair 2)')
97 parser_b.add_argument('-targetRefFiles', default='', action='store',
98 dest='map_targetref', required=False,
99 help='Target Reference Genome Fasta Files Full Path (Comma Separated)')
100 parser_b.add_argument('-filterRefFiles', default='', action='store',
101 dest='map_filterref', required=False,
102 help='Filter Reference Genome Fasta Files Full Path (Comma Separated)')
103 parser_b.add_argument('-targetAlignParams', action='store',
104 dest='map_targetalignparams', default=None, required=False,
105 help='Target Mapping Bowtie2 Parameters (Default: Pathoscope chosen best parameters)')
106 parser_b.add_argument('-filterAlignParams', action='store',
107 dest='map_filteralignparams', default=None, required=False,
108 help='Filter Mapping Bowtie2 Parameters (Default: Use the same Target Mapping Bowtie2 parameters)')
109 parser_b.add_argument('-outDir', action='store', default='.',
110 dest='map_outdir', required=False,
111 help='Output Directory (Default=. (current directory))')
112 parser_b.add_argument('-outAlign', action='store', default='outalign.sam',
113 dest='map_outalign', required=False,
114 help='Output Alignment File Name (Default=outalign.sam)')
115 parser_b.add_argument('-indexDir', action='store', default='.',
116 dest='map_indexdir', required=False,
117 help='Index Directory (Default=. (current directory))')
118 parser_b.add_argument('-targetIndexPrefixes', default='', action='store',
119 dest='map_targetindex', required=False,
120 help='Target Index Prefixes (Comma Separated)')
121 parser_b.add_argument('-filterIndexPrefixes', default='', action='store',
122 dest='map_filterindex', required=False,
123 help='Filter Index Prefixes (Comma Separated)')
124 parser_b.add_argument('-targetAlignFiles', default='', action='store',
125 dest='map_targetalign', required=False,
126 help='Target Alignment Files Full Path (Comma Separated)')
127 parser_b.add_argument('-filterAlignFiles', default='', action='store',
128 dest='map_filteralign', required=False,
129 help='Filter Alignment Files Full Path (Comma Separated)')
130 parser_b.add_argument('-btHome', default=None, action='store',
131 dest='map_bthome', required=False,
132 help='Full Path to Bowtie2 binary directory (Default: Uses bowtie2 in system path)')
133 parser_b.add_argument('-numThreads', action='store', dest='map_numthreads', required=False,
134 default=8, type=int, help='Number of threads to use by aligner (bowtie2) if different from default (8)')
135 parser_b.add_argument('-expTag', action='store', default='pathomap', dest='map_exp_tag',
136 help='Experiment Tag added to files generated for identification (Default: pathomap)')
137 parser_b.add_argument('-outputFile', action='store', default='patho', dest='output',
138 help='OutputFile (Default: pathomap)')
139
140 # create the parser for the "ID" command
141 parser_c = subparsers.add_parser('ID', help='Pathoscope ID Module')
142 parser_c.add_argument('--outMatrix', action='store_true', default=False, dest='id_out_matrix',
143 help='Output alignment matrix')
144 parser_c.add_argument('--noUpdatedAlignFile', action='store_true', default=False,
145 dest='id_noalign', help='Do not generate an updated alignment file')
146 parser_c.add_argument('--noDisplayCutoff', action='store_true', default=False,
147 dest='id_nocutoff', help='Do not cutoff display of genomes, even if it is insignificant')
148 parser_c.add_argument('-scoreCutoff', action='store', default=0.01, type=float,
149 dest='id_score_cutoff', help='Score Cutoff')
150 parser_c.add_argument('-emEpsilon', action='store', default=1e-7, type=float,
151 dest='id_emEpsilon', help='EM Algorithm Epsilon cutoff')
152 parser_c.add_argument('-maxIter', action='store', default=50, type=int,
153 dest='id_maxIter', help='EM Algorithm maximum iterations')
154 parser_c.add_argument('-piPrior', action='store', default=0, type=int, dest='id_piPrior',
155 help='EM Algorithm Pi Prior equivalent to adding n unique reads (Default: n=0)')
156 parser_c.add_argument('-thetaPrior', action='store', default=0, type=int, dest='id_thetaPrior',
157 help='EM Algorithm Theta Prior equivalent to adding n non-unique reads (Default: n=0)')
158 parser_c.add_argument('-expTag', action='store', default='pathoid', dest='id_exp_tag',
159 help='Experiment tag added to output file for easy identification (Default: pathoid)')
160 parser_c.add_argument('-outDir', action='store', default='.', dest='id_outdir',
161 help='Output Directory (Default=. (current directory))')
162 parser_c.add_argument('-fileType', action='store', default='sam', dest='id_ali_format',
163 help='Alignment Format: sam/bl8/gnu-sam (Default: sam)')
164 parser_c.add_argument('-alignFile', action='store', dest='id_ali_file', required=True,
165 help='Alignment file path')
166 parser_c.add_argument('-tableFile', action='store', dest='id_tab_file', required=False,
167 help='Output Table file')
168 parser_c.add_argument('-reAlignFile', action='store', dest='id_re_file', required=False,
169 help='Realignment file')
170
171 # create the parser for the "REPORT" command
172 parser_d = subparsers.add_parser('REP', help='Pathoscope Report Module')
173 parser_d.add_argument('-samtoolsHome', default=None, action='store',
174 dest='rep_samtoolshome', required=False,
175 help='Full Path to samtools binary directory (Default: Uses samtools in system path)')
176 parser_d.add_argument('-dbhost', action='store', dest='rep_dbhost', required=False,
177 default='localhost', help='specify hostname running mysql if you want to use mysql '
178 'instead of hash method in mapping gi to taxonomy id')
179 parser_d.add_argument('-dbport', action='store', dest='rep_dbport', required=False,
180 default=3306, type=int, help='provide mysql server port if different from default (3306)')
181 parser_d.add_argument('-dbuser', action='store', dest='rep_dbuser', required=False, default='X',
182 help='user name to access mysql')
183 parser_d.add_argument('-dbpasswd', action='store', dest='rep_dbpasswd', required=False, default='X',
184 help='provide password associate with user')
185 parser_d.add_argument('-db', action='store', dest='rep_db', required=False, default='pathodb',
186 help='mysql pathoscope database name (default: pathodb)')
187 parser_d.add_argument('-outDir', action='store', default='.', dest='rep_outdir',
188 help='Output Directory')
189 parser_d.add_argument('--contig', action='store_true', default=False, dest='rep_contig_flag',
190 help='Generate Contig Information (Needs samtools package installed)')
191 parser_d.add_argument('-samfile', action='store', dest='rep_ali_file', required=True,
192 help='SAM Alignment file path')
193 parser_d.add_argument('-xmlFile', action='store', dest='xml_file', required=True,
194 help='SAM Alignment file path')
195 parser_d.add_argument('-tableFile', action='store', dest='tsv_file', required=True,
196 help='SAM Alignment file path')
197 parser_d.add_argument('--noDisplayCutoff', action='store_true', default=False,
198 dest='rep_nocutoff', help='Do not cutoff display of genomes, even if it is insignificant')
199
200 parser_e = subparsers.add_parser('QC', help="PathoQC: Quality control program for high throughput sequencing reads")
201 parser_e.add_argument('-1', action='store', dest='qc_read1', required=True, default='', help='1st pair of read in PER or SER')
202 parser_e.add_argument('-2', action='store', dest='qc_read2', required=False, default='', help='2nd pair of read in PER')
203 parser_e.add_argument('-r', action='store', dest='qc_readL', required=False, default=0, type = int, help='let us know a mean read length (0:ignore. [1]:I want to know the range of read length after trimming, i:user_specified_mean_read_length)')
204 parser_e.add_argument('-t', action='store', dest='qc_phred_offset', required=False, default=33, type = int, help='specify a phred offset used in encoding base quality(0:not sure?, [33]:phred+33, 64:phred+64)')
205 parser_e.add_argument('-o', action='store', dest='qc_outdir', required=False, default='', help='specify output directory in full path')
206 parser_e.add_argument('-s', action='store', dest='qc_sequencer', required=False, default='Illumina', help='tell us which sequencer generates the reads ([Illumina], PacBio, Roche454, IonTorrent)')
207 parser_e.add_argument('-m', action='store', dest='qc_len_cutoff', required=False, type=int, default=35, help='specify the min read length cutoff[35]')
208 parser_e.add_argument('-a', action='store', dest='qc_adaptor', required=False, default='N', help='specify an adaptor (Y:have pathoQC detect it, [N]:ignore, ACGT:adaptor)')
209 parser_e.add_argument('-a2', action='store', dest='qc_adaptor2', required=False, default='N', help='specify a second adaptor if it is different from the first one (Y:have pathoQC detect it, [N]:ignore, ACGT:adaptor)')
210 #parser_e.add_argument('-k', action='store', dest='qc_primer', required=False, default='N', help='specify a primer')
211 parser_e.add_argument('-q', action='store', dest='qc_qual_cutoff', required=False, type=int, default=0, help='specify a cutoff of base quality value to trim at the end of reads([0]:ignore, 1:let pathoQC take care of it, i:user_specified_cutoff_value)')
212 parser_e.add_argument('-R', action='store', dest='qc_lower454', required=False, type=int, default=0, help='set to 1 if you want to mask all lower case bases that may correspond to sequence tag or adaptor in Roche454 or IonTorrent')
213 parser_e.add_argument('-e', action='store', dest='qc_coff_entropy', required=False, type=int, default=30, help='specify a cutoff of entropy of low sequence complexity to filter out[0..100],default:30, set 0 to disable')
214 parser_e.add_argument('-d', action='store', dest='qc_derep', required=False, type=int, default=1, help='filtering duplicates: [1] (exact duplicate), 2 (5\' duplicate), 3 (3\' duplicate), 4 (reverse complement exact duplicate), 5 (reverse complement 5\'/3\' duplicate)');
215 parser_e.add_argument('-g', action='store', dest='qc_add_valid_single', required=False, type=int, default=0, help='Set to 1 if you want to add a valid single read into a reduced valid PER set. Note that this option works only with PER')
216 parser_e.add_argument('--no_prinseq', action='store_const', dest='qc_on_prinseq', required=False, const=False, default=True, help='to force to skip prinseq module')
217
218 parser_e.add_argument('-p', action='store', dest='qc_num_threads', required=False, type=int, default=1, help='specify a total number of cpus to use[1]')
219 parser_e.add_argument('--debug', action='store_const', dest='qc_debugF', required=False, const=True, default=False, help='working on debug mode')
220 parser_e.add_argument('--version', action='version', version='PathoQC 1.0')
221
222 def main():
223 # parse some argument lists
224 inputArgs = parser.parse_args()
225
226 #### PathoID modules ####
227
228 start = time();
229
230 if (inputArgs.subcommand=='LIB'):
231 ################################################$
232 #append taxon id in the front of sequence header
233 ################################################$
234 NAs = 'X'
235 if inputArgs.lib_dbuser!=NAs and inputArgs.lib_dbpasswd==NAs:
236 print 'if you want to use mysql, make sure that you install pathoDB and '
237 'also specify the corresponding mysql password correctly '
238 '(Ask your mysql admin to access the database).'
239 MysqlConf=(inputArgs.lib_dbhost,inputArgs.lib_dbport,inputArgs.lib_dbuser,inputArgs.lib_dbpasswd,inputArgs.lib_db)
240 taxon_ids=pathoLib.parse_input_app_build_nt_tgt(inputArgs.lib_taxon_ids)
241 exclude_taxon_ids=pathoLib.parse_input_app_build_nt_tgt(inputArgs.lib_exclude_taxon_ids)
242 (ncbiNt_ti,ncbiNt_invalid) = pathoLib.append_ti_into_fasta_app(inputArgs.lib_reference,
243 taxon_ids, exclude_taxon_ids, inputArgs.lib_subtax,MysqlConf,
244 not(inputArgs.lib_nodesc), inputArgs.lib_online_search, inputArgs.lib_outprefix,
245 inputArgs.lib_outdir)
246
247 if (inputArgs.subcommand=='MAP'):
248 pathoMapOptions = PathoMapA.PathoMapOptions()
249 pathoMapOptions.verbose = inputArgs.verbose
250 pathoMapOptions.outDir = inputArgs.map_outdir
251 pathoMapOptions.indexDir = inputArgs.map_indexdir
252 pathoMapOptions.outAlignFile = inputArgs.map_outalign
253 pathoMapOptions.inReadFile = inputArgs.map_inputread
254 pathoMapOptions.inReadFilePair1 = inputArgs.map_inputread1
255 pathoMapOptions.inReadFilePair2 = inputArgs.map_inputread2
256 pathoMapOptions.targetAlignParameters = inputArgs.map_targetalignparams
257 pathoMapOptions.filterAlignParameters = inputArgs.map_filteralignparams
258 if (len(inputArgs.map_targetref)>0):
259 pathoMapOptions.targetRefFiles = inputArgs.map_targetref.split(",")
260 if (len(inputArgs.map_filterref)>0):
261 pathoMapOptions.filterRefFiles = inputArgs.map_filterref.split(",")
262 if (len(inputArgs.map_targetindex)>0):
263 pathoMapOptions.targetIndexPrefixes = inputArgs.map_targetindex.split(",")
264 if (len(inputArgs.map_filterindex)>0):
265 pathoMapOptions.filterIndexPrefixes = inputArgs.map_filterindex.split(",")
266 if (len(inputArgs.map_targetalign)>0):
267 pathoMapOptions.targetAlignFiles = [inputArgs.map_targetalign]
268 if (len(inputArgs.map_filteralign)>0):
269 pathoMapOptions.filterAlignFiles = [inputArgs.map_filteralign]
270 pathoMapOptions.btHome = inputArgs.map_bthome
271 pathoMapOptions.numThreads = inputArgs.map_numthreads
272 pathoMapOptions.exp_tag = inputArgs.map_exp_tag + "-"
273 PathoMapA.processPathoMap(pathoMapOptions)
274 f1 = open('outalign.sam', 'r')
275 # for line in f1:
276 # print line
277 f2 = open(inputArgs.output, 'wt')
278 for line in f1:
279 f2.write(line)
280 f1.close()
281 f2.close()
282 # f = open(output, 'wt')
283 # f1 = open(outAlignFile, 'r')
284 # for line in f1:
285 # print line
286 # f.write(line)
287 # f.close()
288 # f1.close()
289
290 if (inputArgs.subcommand=='ID'):
291 pathoIdOptions = PathoID.PathoIdOptions(inputArgs.id_ali_file)
292 pathoIdOptions.ali_format = inputArgs.id_ali_format
293 pathoIdOptions.verbose = inputArgs.verbose
294 pathoIdOptions.out_matrix_flag = inputArgs.id_out_matrix
295 pathoIdOptions.score_cutoff = inputArgs.id_score_cutoff
296 pathoIdOptions.exp_tag = inputArgs.id_exp_tag
297 pathoIdOptions.outdir = inputArgs.id_outdir
298 pathoIdOptions.emEpsilon = inputArgs.id_emEpsilon
299 pathoIdOptions.maxIter = inputArgs.id_maxIter
300 pathoIdOptions.piPrior = inputArgs.id_piPrior
301 pathoIdOptions.thetaPrior = inputArgs.id_thetaPrior
302 pathoIdOptions.noalign = inputArgs.id_noalign
303 pathoIdOptions.noCutOff = inputArgs.id_nocutoff
304 values = PathoID.pathoscope_reassign(pathoIdOptions)
305 finalReport = values[0]
306 reAlignfile = values[-1]
307 tableFile = inputArgs.id_tab_file
308 realignFile = inputArgs.id_re_file
309
310 f = open(tableFile, 'wt')
311 f1 = open(finalReport, 'r')
312 for line in f1:
313 f.write(line)
314 f.close()
315 f1.close()
316 f2 = open(realignFile, 'wt')
317 f3 = open(reAlignfile, 'r')
318 for line in f3:
319 f2.write(line)
320 f2.close()
321 f3.close()
322
323 if (inputArgs.subcommand=='REP'):
324 pathoReportOptions = PathoReportA.PathoReportOptions(inputArgs.rep_ali_file)
325 pathoReportOptions.verbose = inputArgs.verbose
326 pathoReportOptions.contigFlag = inputArgs.rep_contig_flag
327 pathoReportOptions.outDir = inputArgs.rep_outdir
328 pathoReportOptions.samtoolsHome = inputArgs.rep_samtoolshome
329 pathoReportOptions.noCutOff = inputArgs.rep_nocutoff
330 mysqlConf=(inputArgs.rep_dbhost,inputArgs.rep_dbport,inputArgs.rep_dbuser,
331 inputArgs.rep_dbpasswd,inputArgs.rep_db)
332 pathoReportOptions.mysqlConf = mysqlConf
333 outTsv, outXml = PathoReportA.processPathoReport(pathoReportOptions)
334 tsvfile = inputArgs.tsv_file
335 xmlfile = inputArgs.xml_file
336
337 f = open(tsvfile, 'wt')
338 f1 = open(outTsv, 'r')
339 for line in f1:
340 f.write(line)
341 f.close()
342 f1.close()
343 f2 = open(xmlfile, 'wt')
344 f3 = open(outXml, 'r')
345 for line in f3:
346 f2.write(line)
347 f2.close()
348 f3.close()
349
350 if (inputArgs.subcommand=='QC'):
351 qcargs = sys.argv[2:]
352 pathoqcdir = pathoscopedir + os.path.sep + 'pathoscope' + os.path.sep + 'pathoqc'
353 pathoqcfile = pathoqcdir + os.path.sep + 'pathoqc.py'
354 if os.path.exists(pathoqcfile):
355 cmd = sys.executable
356 cmd += " " + pathoqcfile + " "
357 cmd += " ".join(qcargs)
358 print(cmd)
359 os.system(cmd)
360 else:
361 print("PathoQC (" + pathoqcfile + ") not found. Please download pathoqc_vXXX.tar.gz and "
362 "install it ("+pathoqcdir+") from http://sourceforge.net/projects/pathoscope/")
363
364 elapsed = time() - start;
365 if inputArgs.verbose:
366 print "Total Elapsed Time: %d" % (elapsed)
367
368
369 if __name__ == "__main__":
370 main()