0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import os
|
|
4 import sys
|
|
5 import argparse
|
|
6 import subprocess
|
|
7 import struct
|
|
8 import shutil
|
|
9 import tempfile
|
|
10
|
|
11 # from collections import Counter
|
|
12
|
|
13 from checkFileFormat import verifyFileFormat
|
|
14 DEFAULT_KRAKEN_DB = '/tsl/data/krakendb/ktest/db1'
|
|
15
|
|
16 def main(argv):
|
|
17
|
|
18 descr = ''
|
|
19 parser = argparse.ArgumentParser(description=descr)
|
|
20 parser.add_argument('--dbtype', default='builtinDB', help='Is database built-in or supplied externally?')
|
|
21 parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db')
|
|
22 parser.add_argument('--in1', help='Single-end reads or left paired-end reads')
|
|
23 parser.add_argument('--in2', help='Right paired-end reads')
|
|
24 parser.add_argument('--quick', action='store_true', help='Use quick mode?')
|
|
25 parser.add_argument('--min-hits', default=1, help='Minimum number of hits required.')
|
|
26 parser.add_argument('--input-format', help='Input sequences stored in fa or fq file(s).', default='fq')
|
|
27 parser.add_argument('kraken_summary_tsv', type=str, help='The output file.')
|
|
28 # parser.add_argument('classified_seqs_fq', type=str, help='The output file.')
|
|
29 # parser.add_argument('unclassified_seqs_fq', type=str, help='The output file.')
|
|
30 args = parser.parse_args()
|
|
31
|
|
32 # kraken --preload --db /tsl/scratch/macleand/ktest/db --threads 12 --quick --classified-out classified_seqs.fq --unclassified-out unclassified.fq --fastq-input --min-hits 1 --output classification.txt left_reads.fq right_reads.fq
|
|
33
|
|
34 kraken_params = ['--preload', '--threads', '8',
|
|
35 '--unclassified-out', args.unclassified_seqs_fq,
|
|
36 '--output', args.kraken_summary_tsv]
|
|
37 # '--classified-out', args.classified_seqs_fq,
|
|
38 kraken_input = []
|
|
39
|
|
40 if 'db' not in args or not os.path.exists(args.db):
|
|
41 sys.stderr.write('Error: database is missing.\n')
|
|
42 sys.exit(1)
|
|
43 kraken_params.extend(['--db', args.db])
|
|
44 # check whether input file(s) exist(s)
|
|
45 if not ('in1' in args and os.path.exists(args.in1)):
|
|
46 sys.stderr.write('Error: fwd/single input file (%s) is missing.\n' % args.in1)
|
|
47 sys.exit(1)
|
|
48 if not verifyFileFormat(args.in1, args.input_format):
|
|
49 fc = open(args.in1).read(1)
|
|
50 sys.stderr.write('Error: fwd/single input file has the wrong format assigned (%s). Starts with \'%s\'\n' % (args.input_format, fc))
|
|
51 sys.exit(1)
|
|
52 kraken_input.append(args.in1)
|
|
53 if 'in2' in args:
|
|
54 if args.in2 is not None and not os.path.exists(args.in2):
|
|
55 sys.stderr.write('Error: right input file (%s) is missing.\n' % args.in2)
|
|
56 sys.exit(1)
|
|
57 elif args.in2 is not None and not verifyFileFormat(args.in2, args.input_format):
|
|
58 sys.stderr.write('Error: rev input file has the wrong format assigned.\n')
|
|
59 sys.exit(1)
|
|
60 elif args.in2 is not None:
|
|
61 kraken_params.append('--paired')
|
|
62 kraken_input.append(args.in2)
|
|
63 else:
|
|
64 pass
|
|
65 else:
|
|
66 pass
|
|
67
|
|
68 if 'quick' in args:
|
|
69 kraken_params.append('--quick')
|
|
70 if 'min_hits' in args:
|
|
71 kraken_params.extend(['--min-hits', str(args.min_hits)])
|
|
72
|
|
73 if args.input_format == 'fq':
|
|
74 kraken_params.append('--fastq-input')
|
|
75 else:
|
|
76 kraken_params.append('--fasta-input')
|
|
77
|
|
78
|
|
79
|
|
80
|
|
81
|
|
82
|
|
83
|
|
84
|
|
85
|
|
86
|
|
87
|
|
88
|
|
89
|
|
90
|
|
91 # check whether file is gzipped
|
|
92 header = ''
|
|
93
|
|
94 cmd = 'source kraken-0.10.5; '
|
|
95 cmd += ' '.join(['kraken'] + kraken_params + kraken_input) + '\n'
|
|
96 # out = open(argv[-1], 'wb').write(str(cmd) + '\n')#.write('KRAKEN OUTPUT, HAS COUNTER\n')
|
|
97 # out = open(argv[-1], 'wb').write(str(args) + '\n')#.write('KRAKEN OUTPUT, HAS COUNTER\n')
|
|
98
|
|
99 # proc = subprocess.Popen(args=cmd, shell=True, stderr=sys.stderr)
|
|
100 # returncode = proc.wait()
|
|
101
|
|
102
|
|
103 tmp_dir = tempfile.mkdtemp()
|
|
104 try:
|
|
105 tmp = tempfile.NamedTemporaryFile(dir=tmp_dir).name
|
|
106 tmp_stderr = open(tmp, 'wb')
|
|
107 proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno())
|
|
108 returncode = proc.wait()
|
|
109 tmp_stderr.close()
|
|
110 # get stderr, allowing for case where it's very large
|
|
111 tmp_stderr = open(tmp, 'rb')
|
|
112 stderr = ''
|
|
113 buffsize = 1048576
|
|
114 try:
|
|
115 while True:
|
|
116 stderr += tmp_stderr.read(buffsize)
|
|
117 if not stderr or len(stderr) % buffsize != 0:
|
|
118 break
|
|
119 except OverflowError:
|
|
120 pass
|
|
121 tmp_stderr.close()
|
|
122 if returncode != 0:
|
|
123 raise Exception, stderr
|
|
124 except Exception, e:
|
|
125 # clean up temp dirs
|
|
126 if os.path.exists(tmp_dir):
|
|
127 shutil.rmtree(tmp_dir)
|
|
128 sys.stderr.write('Error running kraken: %s\n' % str(e))
|
|
129 sys.exit(1)
|
|
130
|
|
131 if os.path.exists(tmp_dir):
|
|
132 shutil.rmtree(tmp_dir)
|
|
133 """
|
|
134 open(args.kraken_summary_tsv, 'wb').write('\t'.join(list('ACGT')) + '\n')
|
|
135 open(args.classified_seqs_fq, 'wb').write(cmd + '\n')
|
|
136 open(args.unclassified_seqs_fq, 'wb').write('blruah\n') # check whether the database exists, if not exit with error
|
|
137 """
|
|
138 """
|
|
139 for arg in args:
|
|
140 out.write(str(arg) + '\n')
|
|
141 out.close()
|
|
142 """
|
|
143 pass
|
|
144
|
|
145
|
|
146
|
|
147
|
|
148 # main(sys.argv[1:])
|
|
149
|
|
150 if __name__ == '__main__': main(sys.argv[1:])
|