comparison kraken.py @ 14:fd27c97c8366 draft default tip

Uploaded
author cschu
date Mon, 18 May 2015 15:55:47 -0400
parents 0916697409ea
children
comparison
equal deleted inserted replaced
13:dc02dbf5e1a9 14:fd27c97c8366
1 #!/usr/bin/env python
2
3 import os
4 import sys
5 import argparse
6 import subprocess
7 import struct
8 import shutil
9 import tempfile
10
11 # from collections import Counter
12
13 from checkFileFormat import verifyFileFormat
14 DEFAULT_KRAKEN_DB = '/tsl/data/krakendb/ktest/db1'
15
16 def main(argv):
17
18 descr = ''
19 parser = argparse.ArgumentParser(description=descr)
20 parser.add_argument('--dbtype', default='builtinDB', help='Is database built-in or supplied externally?')
21 parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db')
22 parser.add_argument('--in1', help='Single-end reads or left paired-end reads')
23 parser.add_argument('--in2', help='Right paired-end reads')
24 parser.add_argument('--quick', action='store_true', help='Use quick mode?')
25 parser.add_argument('--min-hits', default=1, help='Minimum number of hits required.')
26 parser.add_argument('--input-format', help='Input sequences stored in fa or fq file(s).', default='fq')
27 parser.add_argument('kraken_summary_tsv', type=str, help='The output file.')
28 # parser.add_argument('classified_seqs_fq', type=str, help='The output file.')
29 # parser.add_argument('unclassified_seqs_fq', type=str, help='The output file.')
30 args = parser.parse_args()
31
32 # kraken --preload --db /tsl/scratch/macleand/ktest/db --threads 12 --quick --classified-out classified_seqs.fq --unclassified-out unclassified.fq --fastq-input --min-hits 1 --output classification.txt left_reads.fq right_reads.fq
33
34 kraken_params = ['--preload', '--threads', '8',
35 '--unclassified-out', args.unclassified_seqs_fq,
36 '--output', args.kraken_summary_tsv]
37 # '--classified-out', args.classified_seqs_fq,
38 kraken_input = []
39
40 if 'db' not in args or not os.path.exists(args.db):
41 sys.stderr.write('Error: database is missing.\n')
42 sys.exit(1)
43 kraken_params.extend(['--db', args.db])
44 # check whether input file(s) exist(s)
45 if not ('in1' in args and os.path.exists(args.in1)):
46 sys.stderr.write('Error: fwd/single input file (%s) is missing.\n' % args.in1)
47 sys.exit(1)
48 if not verifyFileFormat(args.in1, args.input_format):
49 fc = open(args.in1).read(1)
50 sys.stderr.write('Error: fwd/single input file has the wrong format assigned (%s). Starts with \'%s\'\n' % (args.input_format, fc))
51 sys.exit(1)
52 kraken_input.append(args.in1)
53 if 'in2' in args:
54 if args.in2 is not None and not os.path.exists(args.in2):
55 sys.stderr.write('Error: right input file (%s) is missing.\n' % args.in2)
56 sys.exit(1)
57 elif args.in2 is not None and not verifyFileFormat(args.in2, args.input_format):
58 sys.stderr.write('Error: rev input file has the wrong format assigned.\n')
59 sys.exit(1)
60 elif args.in2 is not None:
61 kraken_params.append('--paired')
62 kraken_input.append(args.in2)
63 else:
64 pass
65 else:
66 pass
67
68 if 'quick' in args:
69 kraken_params.append('--quick')
70 if 'min_hits' in args:
71 kraken_params.extend(['--min-hits', str(args.min_hits)])
72
73 if args.input_format == 'fq':
74 kraken_params.append('--fastq-input')
75 else:
76 kraken_params.append('--fasta-input')
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91 # check whether file is gzipped
92 header = ''
93
94 cmd = 'source kraken-0.10.5; '
95 cmd += ' '.join(['kraken'] + kraken_params + kraken_input) + '\n'
96 # out = open(argv[-1], 'wb').write(str(cmd) + '\n')#.write('KRAKEN OUTPUT, HAS COUNTER\n')
97 # out = open(argv[-1], 'wb').write(str(args) + '\n')#.write('KRAKEN OUTPUT, HAS COUNTER\n')
98
99 # proc = subprocess.Popen(args=cmd, shell=True, stderr=sys.stderr)
100 # returncode = proc.wait()
101
102
103 tmp_dir = tempfile.mkdtemp()
104 try:
105 tmp = tempfile.NamedTemporaryFile(dir=tmp_dir).name
106 tmp_stderr = open(tmp, 'wb')
107 proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno())
108 returncode = proc.wait()
109 tmp_stderr.close()
110 # get stderr, allowing for case where it's very large
111 tmp_stderr = open(tmp, 'rb')
112 stderr = ''
113 buffsize = 1048576
114 try:
115 while True:
116 stderr += tmp_stderr.read(buffsize)
117 if not stderr or len(stderr) % buffsize != 0:
118 break
119 except OverflowError:
120 pass
121 tmp_stderr.close()
122 if returncode != 0:
123 raise Exception, stderr
124 except Exception, e:
125 # clean up temp dirs
126 if os.path.exists(tmp_dir):
127 shutil.rmtree(tmp_dir)
128 sys.stderr.write('Error running kraken: %s\n' % str(e))
129 sys.exit(1)
130
131 if os.path.exists(tmp_dir):
132 shutil.rmtree(tmp_dir)
133 """
134 open(args.kraken_summary_tsv, 'wb').write('\t'.join(list('ACGT')) + '\n')
135 open(args.classified_seqs_fq, 'wb').write(cmd + '\n')
136 open(args.unclassified_seqs_fq, 'wb').write('blruah\n') # check whether the database exists, if not exit with error
137 """
138 """
139 for arg in args:
140 out.write(str(arg) + '\n')
141 out.close()
142 """
143 pass
144
145
146
147
148 # main(sys.argv[1:])
149
150 if __name__ == '__main__': main(sys.argv[1:])