Mercurial > repos > cschu > kraken_tools
comparison kraken.py @ 14:fd27c97c8366 draft default tip
Uploaded
author | cschu |
---|---|
date | Mon, 18 May 2015 15:55:47 -0400 |
parents | 0916697409ea |
children |
comparison
equal
deleted
inserted
replaced
13:dc02dbf5e1a9 | 14:fd27c97c8366 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import os | |
4 import sys | |
5 import argparse | |
6 import subprocess | |
7 import struct | |
8 import shutil | |
9 import tempfile | |
10 | |
11 # from collections import Counter | |
12 | |
13 from checkFileFormat import verifyFileFormat | |
14 DEFAULT_KRAKEN_DB = '/tsl/data/krakendb/ktest/db1' | |
15 | |
16 def main(argv): | |
17 | |
18 descr = '' | |
19 parser = argparse.ArgumentParser(description=descr) | |
20 parser.add_argument('--dbtype', default='builtinDB', help='Is database built-in or supplied externally?') | |
21 parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db') | |
22 parser.add_argument('--in1', help='Single-end reads or left paired-end reads') | |
23 parser.add_argument('--in2', help='Right paired-end reads') | |
24 parser.add_argument('--quick', action='store_true', help='Use quick mode?') | |
25 parser.add_argument('--min-hits', default=1, help='Minimum number of hits required.') | |
26 parser.add_argument('--input-format', help='Input sequences stored in fa or fq file(s).', default='fq') | |
27 parser.add_argument('kraken_summary_tsv', type=str, help='The output file.') | |
28 # parser.add_argument('classified_seqs_fq', type=str, help='The output file.') | |
29 # parser.add_argument('unclassified_seqs_fq', type=str, help='The output file.') | |
30 args = parser.parse_args() | |
31 | |
32 # kraken --preload --db /tsl/scratch/macleand/ktest/db --threads 12 --quick --classified-out classified_seqs.fq --unclassified-out unclassified.fq --fastq-input --min-hits 1 --output classification.txt left_reads.fq right_reads.fq | |
33 | |
34 kraken_params = ['--preload', '--threads', '8', | |
35 '--unclassified-out', args.unclassified_seqs_fq, | |
36 '--output', args.kraken_summary_tsv] | |
37 # '--classified-out', args.classified_seqs_fq, | |
38 kraken_input = [] | |
39 | |
40 if 'db' not in args or not os.path.exists(args.db): | |
41 sys.stderr.write('Error: database is missing.\n') | |
42 sys.exit(1) | |
43 kraken_params.extend(['--db', args.db]) | |
44 # check whether input file(s) exist(s) | |
45 if not ('in1' in args and os.path.exists(args.in1)): | |
46 sys.stderr.write('Error: fwd/single input file (%s) is missing.\n' % args.in1) | |
47 sys.exit(1) | |
48 if not verifyFileFormat(args.in1, args.input_format): | |
49 fc = open(args.in1).read(1) | |
50 sys.stderr.write('Error: fwd/single input file has the wrong format assigned (%s). Starts with \'%s\'\n' % (args.input_format, fc)) | |
51 sys.exit(1) | |
52 kraken_input.append(args.in1) | |
53 if 'in2' in args: | |
54 if args.in2 is not None and not os.path.exists(args.in2): | |
55 sys.stderr.write('Error: right input file (%s) is missing.\n' % args.in2) | |
56 sys.exit(1) | |
57 elif args.in2 is not None and not verifyFileFormat(args.in2, args.input_format): | |
58 sys.stderr.write('Error: rev input file has the wrong format assigned.\n') | |
59 sys.exit(1) | |
60 elif args.in2 is not None: | |
61 kraken_params.append('--paired') | |
62 kraken_input.append(args.in2) | |
63 else: | |
64 pass | |
65 else: | |
66 pass | |
67 | |
68 if 'quick' in args: | |
69 kraken_params.append('--quick') | |
70 if 'min_hits' in args: | |
71 kraken_params.extend(['--min-hits', str(args.min_hits)]) | |
72 | |
73 if args.input_format == 'fq': | |
74 kraken_params.append('--fastq-input') | |
75 else: | |
76 kraken_params.append('--fasta-input') | |
77 | |
78 | |
79 | |
80 | |
81 | |
82 | |
83 | |
84 | |
85 | |
86 | |
87 | |
88 | |
89 | |
90 | |
91 # check whether file is gzipped | |
92 header = '' | |
93 | |
94 cmd = 'source kraken-0.10.5; ' | |
95 cmd += ' '.join(['kraken'] + kraken_params + kraken_input) + '\n' | |
96 # out = open(argv[-1], 'wb').write(str(cmd) + '\n')#.write('KRAKEN OUTPUT, HAS COUNTER\n') | |
97 # out = open(argv[-1], 'wb').write(str(args) + '\n')#.write('KRAKEN OUTPUT, HAS COUNTER\n') | |
98 | |
99 # proc = subprocess.Popen(args=cmd, shell=True, stderr=sys.stderr) | |
100 # returncode = proc.wait() | |
101 | |
102 | |
103 tmp_dir = tempfile.mkdtemp() | |
104 try: | |
105 tmp = tempfile.NamedTemporaryFile(dir=tmp_dir).name | |
106 tmp_stderr = open(tmp, 'wb') | |
107 proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno()) | |
108 returncode = proc.wait() | |
109 tmp_stderr.close() | |
110 # get stderr, allowing for case where it's very large | |
111 tmp_stderr = open(tmp, 'rb') | |
112 stderr = '' | |
113 buffsize = 1048576 | |
114 try: | |
115 while True: | |
116 stderr += tmp_stderr.read(buffsize) | |
117 if not stderr or len(stderr) % buffsize != 0: | |
118 break | |
119 except OverflowError: | |
120 pass | |
121 tmp_stderr.close() | |
122 if returncode != 0: | |
123 raise Exception, stderr | |
124 except Exception, e: | |
125 # clean up temp dirs | |
126 if os.path.exists(tmp_dir): | |
127 shutil.rmtree(tmp_dir) | |
128 sys.stderr.write('Error running kraken: %s\n' % str(e)) | |
129 sys.exit(1) | |
130 | |
131 if os.path.exists(tmp_dir): | |
132 shutil.rmtree(tmp_dir) | |
133 """ | |
134 open(args.kraken_summary_tsv, 'wb').write('\t'.join(list('ACGT')) + '\n') | |
135 open(args.classified_seqs_fq, 'wb').write(cmd + '\n') | |
136 open(args.unclassified_seqs_fq, 'wb').write('blruah\n') # check whether the database exists, if not exit with error | |
137 """ | |
138 """ | |
139 for arg in args: | |
140 out.write(str(arg) + '\n') | |
141 out.close() | |
142 """ | |
143 pass | |
144 | |
145 | |
146 | |
147 | |
148 # main(sys.argv[1:]) | |
149 | |
150 if __name__ == '__main__': main(sys.argv[1:]) |