comparison data_manager/kraken2_build_database.py @ 0:c13785ca6192 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_kraken2_database/ commit 39e87c095e426fc3f147d55de0434cd54ae0354a
author nate
date Wed, 06 Nov 2024 20:00:39 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c13785ca6192
1 #!/usr/bin/env python
2
3 import argparse
4 import datetime
5 import errno
6 import json
7 import os
8 import re
9 import shutil
10 import subprocess
11 import sys
12 import tarfile
13 from enum import Enum
14
15 try:
16 # Python3
17 from urllib.request import urlopen
18 from urllib.error import URLError
19 except ImportError:
20 from urllib2 import urlopen
21 from urllib2 import URLError
22
23
24 DATA_TABLE_NAME = "kraken2_databases"
25
26
class KrakenDatabaseTypes(Enum):
    """Top-level kinds of Kraken2 database this data manager can produce."""

    standard_local_build = 'standard_local_build'
    standard_prebuilt = 'standard_prebuilt'
    minikraken = 'minikraken'
    special_prebuilt = 'special_prebuilt'
    special = 'special'
    custom = 'custom'

    def __str__(self) -> str:
        # argparse renders choices via str(); show the raw value, not
        # the default "KrakenDatabaseTypes.xxx" representation.
        return self.value
37
38
class SpecialDatabaseTypes(Enum):
    """16S rRNA source databases supported by ``kraken2-build --special``."""

    rdp = 'rdp'
    greengenes = 'greengenes'
    silva = 'silva'

    def __str__(self) -> str:
        # Keep argparse choices readable (plain value, not Enum repr).
        return self.value
46
47
class Minikraken2Versions(Enum):
    """Published MiniKraken2 8GB index versions available for download."""

    v1 = 'v1'
    v2 = 'v2'

    def __str__(self) -> str:
        # Keep argparse choices readable (plain value, not Enum repr).
        return self.value
54
55
class StandardPrebuiltSizes(Enum):
    """Prebuilt index keys published on the genome-idx S3 bucket.

    The value is used verbatim to build the k2_<key>_<date>.tar.gz
    download URL.
    """

    viral = "viral"
    minusb = "minusb"
    standard = "standard"
    standard_08gb = "standard_08gb"
    standard_16gb = "standard_16gb"
    pluspf = "pluspf"
    pluspf_08gb = "pluspf_08gb"
    pluspf_16gb = "pluspf_16gb"
    pluspfp = "pluspfp"
    pluspfp_08gb = "pluspfp_08gb"
    pluspfp_16gb = "pluspfp_16gb"
    eupathdb48 = "eupathdb48"

    def __str__(self) -> str:
        # Keep argparse choices readable (plain value, not Enum repr).
        return self.value
72
73
def kraken2_build_standard(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build the standard Kraken2 database locally via ``kraken2-build --standard``.

    kraken2_args: dict with "kmer_len", "minimizer_len", "minimizer_spaces",
        "load_factor", "threads" and "clean" keys.
    target_directory: directory in which the database directory is created.
    data_table_name: Galaxy data table to register the new database in.

    Returns a Galaxy data-manager dict describing the new data table entry.
    Raises subprocess.CalledProcessError if kraken2-build fails.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    # Unique value doubles as the on-disk directory name.
    database_value = "_".join([
        now,
        "standard",
        "kmer-len", str(kraken2_args["kmer_len"]),
        "minimizer-len", str(kraken2_args["minimizer_len"]),
        "minimizer-spaces", str(kraken2_args["minimizer_spaces"]),
        "load-factor", str(kraken2_args["load_factor"]),
    ])

    # BUGFIX: load-factor was previously appended after the closing
    # parenthesis and without "=", producing a malformed display name;
    # now formatted consistently with the special/custom builders.
    database_name = " ".join([
        "Standard (Local Build)",
        "(Created:",
        now + ",",
        "kmer-len=" + str(kraken2_args["kmer_len"]) + ",",
        "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",",
        "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ",",
        "load-factor=" + str(kraken2_args["load_factor"]) + ")",
    ])

    database_path = database_value

    args = [
        '--threads', str(kraken2_args["threads"]),
        '--standard',
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
        '--load-factor', str(kraken2_args["load_factor"]),
        '--db', database_path
    ]

    subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    if kraken2_args["clean"]:
        # Remove intermediate build files (library/taxonomy) to save space.
        args = [
            '--threads', str(kraken2_args["threads"]),
            '--clean',
            '--db', database_path
        ]

        subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    return {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }
132
133
def kraken2_build_standard_prebuilt(prebuilt_db, prebuilt_date, target_directory, data_table_name=DATA_TABLE_NAME):
    """Download and unpack a prebuilt Kraken2 index from the genome-idx S3 bucket.

    prebuilt_db: short index key (see StandardPrebuiltSizes), used to form
        the download URL and the display name.
    prebuilt_date: build date as YYYY-MM-DD; dashes are stripped for the URL.
    target_directory: directory the database directory is created under.
    data_table_name: Galaxy data table to register the database in.

    Exits the process with status 1 if the download URL cannot be opened.
    Returns a Galaxy data-manager dict describing the new data table entry.
    """

    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    # Human-readable label for each prebuilt index key.
    # NOTE(review): 'eupathdb48' is labelled "EuPathDB-46" — looks like the
    # upstream naming, but worth confirming against the genome-idx listing.
    prebuild_name = {
        'viral': "Viral",
        'minusb': "MinusB (archaea, viral, plasmid, human, UniVec_Core)",
        'standard': "Standard-Full (archaea, bacteria, viral, plasmid, human,UniVec_Core)",
        'standard_08gb': "Standard-8 (Standard with DB capped at 8 GB)",
        'standard_16gb': "Standard-16 (Standard with DB capped at 16 GB)",
        'pluspf': "PlusPF (Standard plus protozoa and fungi)",
        'pluspf_08gb': "PlusPF-8 (PlusPF with DB capped at 8 GB)",
        'pluspf_16gb': "PlusPF-16 (PlusPF with DB capped at 16 GB)",
        'pluspfp': "PlusPFP (Standard plus protozoa, fungi and plant)",
        'pluspfp_08gb': "PlusPFP-8 (PlusPFP with DB capped at 8 GB)",
        'pluspfp_16gb': "PlusPFP-16 (PlusPFP with DB capped at 16 GB)",
        'eupathdb48': "EuPathDB-46",
    }

    database_value = "_".join([
        now,
        "standard_prebuilt",
        prebuilt_db,
        prebuilt_date
    ])

    database_name = " ".join([
        "Prebuilt Refseq indexes: ",
        prebuild_name[prebuilt_db],
        "(Version: ",
        prebuilt_date,
        "- Downloaded:",
        now + ")"
    ])

    database_path = database_value

    # we may need to let the user choose the date when new DBs are posted.
    date_url_str = prebuilt_date.replace('-', '')
    # download the pre-built database
    download_url = 'https://genome-idx.s3.amazonaws.com/kraken/k2_%s_%s.tar.gz' % (prebuilt_db, date_url_str)
    try:
        src = urlopen(download_url)
    except URLError as e:
        print('url: ' + download_url, file=sys.stderr)
        print(e, file=sys.stderr)
        # BUGFIX: use sys.exit(); the bare exit() builtin comes from the
        # site module and is not guaranteed to be available.
        sys.exit(1)

    archive = 'tmp_data.tar.gz'
    try:
        with open(archive, 'wb') as dst:
            shutil.copyfileobj(src, dst)
        # Unpack into the target directory, flattening each regular file to
        # its basename so the database files sit directly in database_path.
        with tarfile.open(archive, 'r:gz') as fh:
            for member in fh.getmembers():
                if member.isreg():
                    member.name = os.path.basename(member.name)
                    fh.extract(member, os.path.join(target_directory, database_path))
    finally:
        # BUGFIX: the downloaded archive was previously left behind in the
        # working directory; always remove it.
        if os.path.exists(archive):
            os.remove(archive)

    return {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }
204
205
def kraken2_build_minikraken(minikraken2_version, target_directory, data_table_name=DATA_TABLE_NAME):
    """Download and unpack a MiniKraken2 8GB database.

    minikraken2_version: "v1" or "v2" (see Minikraken2Versions), used to
        form the download URL.
    target_directory: directory the database directory is created under.
    data_table_name: Galaxy data table to register the database in.

    Exits the process with status 1 if the download URL cannot be opened.
    Returns a Galaxy data-manager dict describing the new data table entry.
    """

    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    # Unique value doubles as the on-disk directory name.
    database_value = "_".join([
        now,
        "minikraken2",
        minikraken2_version,
        "8GB",
    ])

    database_name = " ".join([
        "Minikraken2",
        minikraken2_version,
        "(Created:",
        now + ")"
    ])

    database_path = database_value

    # download the minikraken2 data
    download_url = 'https://genome-idx.s3.amazonaws.com/kraken/minikraken2_%s_8GB_201904.tgz' % minikraken2_version
    try:
        src = urlopen(download_url)
    except URLError as e:
        print('url: ' + download_url, file=sys.stderr)
        print(e, file=sys.stderr)
        # BUGFIX: use sys.exit(); the bare exit() builtin comes from the
        # site module and is not guaranteed to be available.
        sys.exit(1)

    archive = 'tmp_data.tar.gz'
    try:
        with open(archive, 'wb') as dst:
            shutil.copyfileobj(src, dst)
        # Unpack into the target directory, flattening each regular file to
        # its basename so the database files sit directly in database_path.
        with tarfile.open(archive, 'r:gz') as fh:
            for member in fh.getmembers():
                if member.isreg():
                    member.name = os.path.basename(member.name)
                    fh.extract(member, os.path.join(target_directory, database_path))
    finally:
        # BUGFIX: the downloaded archive was previously left behind in the
        # working directory; always remove it.
        if os.path.exists(archive):
            os.remove(archive)

    return {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }
257
258
def kraken2_build_special(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a special (16S) Kraken2 database via ``kraken2-build --special``.

    kraken2_args: dict with "special_database_type" (rdp/greengenes/silva),
        "kmer_len", "minimizer_len", "minimizer_spaces", "load_factor",
        "threads" and "clean" keys.
    target_directory: directory in which the database directory is created.
    data_table_name: Galaxy data table to register the new database in.

    Returns a Galaxy data-manager dict describing the new data table entry.
    Raises subprocess.CalledProcessError if kraken2-build fails.
    """

    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    # Display labels for the supported special database sources.
    special_database_names = {
        "rdp": "RDP",
        "greengenes": "Greengenes",
        "silva": "Silva",
    }

    # Unique value doubles as the on-disk directory name.
    database_value = "_".join([
        now,
        kraken2_args["special_database_type"],
        "kmer-len", str(kraken2_args["kmer_len"]),
        "minimizer-len", str(kraken2_args["minimizer_len"]),
        "minimizer-spaces", str(kraken2_args["minimizer_spaces"]),
        "load-factor", str(kraken2_args["load_factor"]),
    ])

    # BUGFIX: the name previously contained two closing parentheses (one
    # after minimizer-spaces AND one after load-factor); now a single,
    # correctly-placed ")" closes the parameter list.
    database_name = " ".join([
        special_database_names[kraken2_args["special_database_type"]],
        "(Created:",
        now + ",",
        "kmer-len=" + str(kraken2_args["kmer_len"]) + ",",
        "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",",
        "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ",",
        "load-factor=" + str(kraken2_args["load_factor"]) + ")",
    ])

    database_path = database_value

    args = [
        '--threads', str(kraken2_args["threads"]),
        '--special', kraken2_args["special_database_type"],
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
        '--load-factor', str(kraken2_args["load_factor"]),
        '--db', database_path
    ]

    subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    if kraken2_args["clean"]:
        # Remove intermediate build files (library/taxonomy) to save space.
        args = [
            '--threads', str(kraken2_args["threads"]),
            '--clean',
            '--db', database_path
        ]

        subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    return {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }
324
325
def kraken2_build_custom(kraken2_args, custom_database_name, custom_source_info, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a custom Kraken2 database from a user-supplied FASTA file.

    Runs kraken2-build four times: --download-taxonomy, --add-to-library,
    --build and (optionally) --clean, all inside target_directory.

    kraken2_args: dict with "custom_fasta", "skip_maps", "kmer_len",
        "minimizer_len", "minimizer_spaces", "load_factor", "threads"
        and "clean" keys.
    custom_database_name: user-facing name; sanitized for the directory name.
    custom_source_info: free-text provenance shown in the display name.

    Returns a Galaxy data-manager dict describing the new data table entry.
    Raises subprocess.CalledProcessError if any kraken2-build step fails.
    """
    timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    # Sanitize the user-provided name for use in a directory name.
    safe_name = re.sub(r'[^\w_.-]+', '_', custom_database_name).strip('_')

    # Unique value doubles as the on-disk directory name.
    database_value = "%s_%s_kmer-len_%s_minimizer-len_%s_minimizer-spaces_%s_load-factor_%s" % (
        timestamp,
        safe_name,
        kraken2_args["kmer_len"],
        kraken2_args["minimizer_len"],
        kraken2_args["minimizer_spaces"],
        kraken2_args["load_factor"],
    )

    database_name = "%s (%s, kmer-len=%s, minimizer-len=%s, minimizer-spaces=%s, load-factor=%s)" % (
        custom_database_name,
        custom_source_info,
        kraken2_args["kmer_len"],
        kraken2_args["minimizer_len"],
        kraken2_args["minimizer_spaces"],
        kraken2_args["load_factor"],
    )

    database_path = database_value

    def _run_build_step(step_args):
        # Every kraken2-build invocation shares the thread count and runs
        # inside the data manager's target directory.
        command = ['kraken2-build', '--threads', str(kraken2_args["threads"])]
        command.extend(step_args)
        subprocess.check_call(command, cwd=target_directory)

    # Step 1: fetch the NCBI taxonomy (optionally skipping accession maps).
    taxonomy_args = ['--download-taxonomy', '--db', database_path]
    if kraken2_args['skip_maps']:
        taxonomy_args.append('--skip-maps')
    _run_build_step(taxonomy_args)

    # Step 2: add the user's sequences to the library.
    _run_build_step(['--add-to-library', kraken2_args["custom_fasta"], '--db', database_path])

    # Step 3: build the database with the requested parameters.
    _run_build_step([
        '--build',
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
        '--load-factor', str(kraken2_args["load_factor"]),
        '--db', database_path,
    ])

    # Step 4 (optional): drop intermediate files to save disk space.
    if kraken2_args["clean"]:
        _run_build_step(['--clean', '--db', database_path])

    return {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }
402
403
def main():
    """Entry point: build/download the requested Kraken2 database.

    Reads the data manager JSON named on the command line to find the output
    directory, dispatches on --database-type, then overwrites the same JSON
    file with the resulting data table entry (consumed by Galaxy).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('data_manager_json')
    parser.add_argument('--kmer-len', dest='kmer_len', type=int, default=35, help='kmer length')
    parser.add_argument('--minimizer-len', dest='minimizer_len', type=int, default=31, help='minimizer length')
    # BUGFIX: minimizer-spaces and threads were parsed as strings (no type=);
    # coerce to int so bad values fail fast at argument-parsing time.
    parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', type=int, default=6, help='minimizer spaces')
    parser.add_argument('--load-factor', dest='load_factor', type=float, default=0.7, help='load factor')
    parser.add_argument('--threads', dest='threads', type=int, default=1, help='threads')
    parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build')
    parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken)')
    parser.add_argument('--prebuilt-db', dest='prebuilt_db', type=StandardPrebuiltSizes, choices=list(StandardPrebuiltSizes), help='Prebuilt database to download. Only applies to --database-type standard_prebuilt or special_prebuilt.')
    parser.add_argument('--prebuilt-date', dest='prebuilt_date', help='Database build date (YYYY-MM-DD). Only applies to --database-type standard_prebuilt.')
    parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)')
    parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)')
    parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)')
    parser.add_argument('--custom-source-info', dest='custom_source_info', help='Description of how this build has been sourced (only applies to --database-type custom)')
    parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='skip downloading accession-to-taxid maps (only applies to --database-type custom)')
    parser.add_argument('--clean', dest='clean', action='store_true', help='Clean up extra files')
    args = parser.parse_args()

    with open(args.data_manager_json) as fh:
        data_manager_input = json.load(fh)

    # Galaxy tells us where to put the database via extra_files_path.
    target_directory = data_manager_input['output_data'][0]['extra_files_path']

    # Idiomatic replacement for the mkdir + errno.EEXIST dance; also creates
    # any missing parent directories.
    os.makedirs(target_directory, exist_ok=True)

    database_type = str(args.database_type)

    if database_type == 'standard_local_build':
        kraken2_args = {
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "load_factor": args.load_factor,
            "threads": args.threads,
            "clean": args.clean,
        }
        data_manager_output = kraken2_build_standard(
            kraken2_args,
            target_directory,
        )
    elif database_type in ('standard_prebuilt', 'special_prebuilt'):
        data_manager_output = kraken2_build_standard_prebuilt(
            str(args.prebuilt_db),
            str(args.prebuilt_date),
            target_directory
        )
    elif database_type == 'minikraken':
        data_manager_output = kraken2_build_minikraken(
            str(args.minikraken2_version),
            target_directory
        )
    elif database_type == 'special':
        kraken2_args = {
            "special_database_type": str(args.special_database_type),
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "load_factor": args.load_factor,
            "threads": args.threads,
            "clean": args.clean,
        }
        data_manager_output = kraken2_build_special(
            kraken2_args,
            target_directory,
        )
    elif database_type == 'custom':
        kraken2_args = {
            "custom_fasta": args.custom_fasta,
            "skip_maps": args.skip_maps,
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "load_factor": args.load_factor,
            "threads": args.threads,
            "clean": args.clean,
        }
        data_manager_output = kraken2_build_custom(
            kraken2_args,
            args.custom_database_name,
            args.custom_source_info,
            target_directory,
        )
    else:
        # Unreachable via argparse choices, but kept as a safety net.
        sys.exit("Invalid database type")

    # Overwrite the input JSON with the data table entry for Galaxy.
    with open(args.data_manager_json, 'w') as fh:
        json.dump(data_manager_output, fh, sort_keys=True)


if __name__ == "__main__":
    main()