Mercurial > repos > nate > data_manager_build_kraken2_database
comparison data_manager/kraken2_build_database.py @ 0:c13785ca6192 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_kraken2_database/ commit 39e87c095e426fc3f147d55de0434cd54ae0354a
| author | nate |
|---|---|
| date | Wed, 06 Nov 2024 20:00:39 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c13785ca6192 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import argparse | |
| 4 import datetime | |
| 5 import errno | |
| 6 import json | |
| 7 import os | |
| 8 import re | |
| 9 import shutil | |
| 10 import subprocess | |
| 11 import sys | |
| 12 import tarfile | |
| 13 from enum import Enum | |
| 14 | |
| 15 try: | |
| 16 # Python3 | |
| 17 from urllib.request import urlopen | |
| 18 from urllib.error import URLError | |
| 19 except ImportError: | |
| 20 from urllib2 import urlopen | |
| 21 from urllib2 import URLError | |
| 22 | |
| 23 | |
| 24 DATA_TABLE_NAME = "kraken2_databases" | |
| 25 | |
| 26 | |
class KrakenDatabaseTypes(Enum):
    """Top-level choice of which kind of Kraken2 database to build/fetch.

    Member order matters: ``list(KrakenDatabaseTypes)`` feeds argparse
    ``choices`` in :func:`main`.
    """

    standard_local_build = 'standard_local_build'
    standard_prebuilt = 'standard_prebuilt'
    minikraken = 'minikraken'
    special_prebuilt = 'special_prebuilt'
    special = 'special'
    custom = 'custom'

    def __str__(self):
        # Render as the raw value so argparse help/choices show plain names.
        return str(self.value)
| 37 | |
| 38 | |
class SpecialDatabaseTypes(Enum):
    """Supported targets for ``kraken2-build --special`` (16S databases)."""

    rdp = 'rdp'
    greengenes = 'greengenes'
    silva = 'silva'

    def __str__(self):
        # Plain value for argparse display and downstream string formatting.
        return str(self.value)
| 46 | |
| 47 | |
class Minikraken2Versions(Enum):
    """MiniKraken2 release versions available for download."""

    v1 = 'v1'
    v2 = 'v2'

    def __str__(self):
        # Plain value for argparse display and URL construction.
        return str(self.value)
| 54 | |
| 55 | |
class StandardPrebuiltSizes(Enum):
    """Pre-built Kraken2 index flavours hosted at genome-idx.s3.amazonaws.com.

    Values map directly into the ``k2_<value>_<date>.tar.gz`` download URL.
    """

    viral = "viral"
    minusb = "minusb"
    standard = "standard"
    standard_08gb = "standard_08gb"
    standard_16gb = "standard_16gb"
    pluspf = "pluspf"
    pluspf_08gb = "pluspf_08gb"
    pluspf_16gb = "pluspf_16gb"
    pluspfp = "pluspfp"
    pluspfp_08gb = "pluspfp_08gb"
    pluspfp_16gb = "pluspfp_16gb"
    eupathdb48 = "eupathdb48"

    def __str__(self):
        # Plain value for argparse display and URL construction.
        return str(self.value)
| 72 | |
| 73 | |
def kraken2_build_standard(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build the standard Kraken2 database locally via ``kraken2-build --standard``.

    :param kraken2_args: dict with keys ``kmer_len``, ``minimizer_len``,
        ``minimizer_spaces``, ``load_factor``, ``threads``, ``clean``.
    :param target_directory: directory in which the database directory is created.
    :param data_table_name: Galaxy data table to register the entry under.
    :returns: data manager JSON fragment describing the new table entry.
    :raises subprocess.CalledProcessError: if ``kraken2-build`` exits non-zero.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    # The value doubles as the on-disk directory name, so keep it path-safe.
    database_value = "_".join([
        now,
        "standard",
        "kmer-len", str(kraken2_args["kmer_len"]),
        "minimizer-len", str(kraken2_args["minimizer_len"]),
        "minimizer-spaces", str(kraken2_args["minimizer_spaces"]),
        "load-factor", str(kraken2_args["load_factor"]),
    ])

    # Human-readable label shown in the Galaxy data table.
    # Fixed: load-factor was previously appended AFTER the closing
    # parenthesis as two stray tokens; format it like the other parameters.
    database_name = " ".join([
        "Standard (Local Build)",
        "(Created:",
        now + ",",
        "kmer-len=" + str(kraken2_args["kmer_len"]) + ",",
        "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",",
        "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ",",
        "load-factor=" + str(kraken2_args["load_factor"]) + ")",
    ])

    database_path = database_value

    args = [
        '--threads', str(kraken2_args["threads"]),
        '--standard',
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
        '--load-factor', str(kraken2_args["load_factor"]),
        '--db', database_path
    ]

    subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    if kraken2_args["clean"]:
        # Remove intermediate build files to reduce the installed footprint.
        args = [
            '--threads', str(kraken2_args["threads"]),
            '--clean',
            '--db', database_path
        ]

        subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
| 132 | |
| 133 | |
def kraken2_build_standard_prebuilt(prebuilt_db, prebuilt_date, target_directory, data_table_name=DATA_TABLE_NAME):
    """Download and unpack a pre-built Kraken2 index from genome-idx S3.

    :param prebuilt_db: index flavour (a ``StandardPrebuiltSizes`` value as str).
    :param prebuilt_date: build date as ``YYYY-MM-DD``; becomes part of the URL.
    :param target_directory: directory in which the database directory is created.
    :param data_table_name: Galaxy data table to register the entry under.
    :returns: data manager JSON fragment describing the new table entry.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    prebuild_name = {
        'viral': "Viral",
        'minusb': "MinusB (archaea, viral, plasmid, human, UniVec_Core)",
        # Fixed missing space after the comma in the display label.
        'standard': "Standard-Full (archaea, bacteria, viral, plasmid, human, UniVec_Core)",
        'standard_08gb': "Standard-8 (Standard with DB capped at 8 GB)",
        'standard_16gb': "Standard-16 (Standard with DB capped at 16 GB)",
        'pluspf': "PlusPF (Standard plus protozoa and fungi)",
        'pluspf_08gb': "PlusPF-8 (PlusPF with DB capped at 8 GB)",
        'pluspf_16gb': "PlusPF-16 (PlusPF with DB capped at 16 GB)",
        'pluspfp': "PlusPFP (Standard plus protozoa, fungi and plant)",
        'pluspfp_08gb': "PlusPFP-8 (PlusPFP with DB capped at 8 GB)",
        'pluspfp_16gb': "PlusPFP-16 (PlusPFP with DB capped at 16 GB)",
        # NOTE(review): upstream names the "eupathdb48" tarball "EuPathDB46"
        # in its collection description, so the 48/46 mismatch is deliberate.
        'eupathdb48': "EuPathDB-46",
    }

    database_value = "_".join([
        now,
        "standard_prebuilt",
        prebuilt_db,
        prebuilt_date
    ])

    database_name = " ".join([
        "Prebuilt Refseq indexes: ",
        prebuild_name[prebuilt_db],
        "(Version: ",
        prebuilt_date,
        "- Downloaded:",
        now + ")"
    ])

    database_path = database_value

    # we may need to let the user choose the date when new DBs are posted.
    date_url_str = prebuilt_date.replace('-', '')
    # download the pre-built database
    try:
        download_url = 'https://genome-idx.s3.amazonaws.com/kraken/k2_%s_%s.tar.gz' % (prebuilt_db, date_url_str)
        src = urlopen(download_url)
    except URLError as e:
        print('url: ' + download_url, file=sys.stderr)
        print(e, file=sys.stderr)
        sys.exit(1)

    try:
        with open('tmp_data.tar.gz', 'wb') as dst:
            shutil.copyfileobj(src, dst)
    finally:
        # Fixed: release the HTTP connection (response was never closed).
        src.close()

    # unpack the downloaded archive to the target directory, flattening
    # paths so every regular file lands directly in the database directory
    with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh:
        for member in fh.getmembers():
            if member.isreg():
                member.name = os.path.basename(member.name)
                fh.extract(member, os.path.join(target_directory, database_path))
    # Fixed: remove the multi-GB temp archive instead of leaving it behind.
    os.remove('tmp_data.tar.gz')

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
| 204 | |
| 205 | |
def kraken2_build_minikraken(minikraken2_version, target_directory, data_table_name=DATA_TABLE_NAME):
    """Download and unpack the 8 GB MiniKraken2 database (2019-04 build).

    :param minikraken2_version: 'v1' or 'v2' (a ``Minikraken2Versions`` value as str).
    :param target_directory: directory in which the database directory is created.
    :param data_table_name: Galaxy data table to register the entry under.
    :returns: data manager JSON fragment describing the new table entry.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    database_value = "_".join([
        now,
        "minikraken2",
        minikraken2_version,
        "8GB",
    ])

    database_name = " ".join([
        "Minikraken2",
        minikraken2_version,
        "(Created:",
        now + ")"
    ])

    database_path = database_value

    # download the minikraken2 data
    try:
        download_url = 'https://genome-idx.s3.amazonaws.com/kraken/minikraken2_%s_8GB_201904.tgz' % minikraken2_version
        src = urlopen(download_url)
    except URLError as e:
        print('url: ' + download_url, file=sys.stderr)
        print(e, file=sys.stderr)
        sys.exit(1)

    try:
        with open('tmp_data.tar.gz', 'wb') as dst:
            shutil.copyfileobj(src, dst)
    finally:
        # Fixed: release the HTTP connection (response was never closed).
        src.close()

    # unpack the downloaded archive to the target directory, flattening
    # paths so every regular file lands directly in the database directory
    with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh:
        for member in fh.getmembers():
            if member.isreg():
                member.name = os.path.basename(member.name)
                fh.extract(member, os.path.join(target_directory, database_path))
    # Fixed: remove the multi-GB temp archive instead of leaving it behind.
    os.remove('tmp_data.tar.gz')

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
| 257 | |
| 258 | |
def kraken2_build_special(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a special (16S) Kraken2 database via ``kraken2-build --special``.

    :param kraken2_args: dict with keys ``special_database_type``, ``kmer_len``,
        ``minimizer_len``, ``minimizer_spaces``, ``load_factor``, ``threads``,
        ``clean``.
    :param target_directory: directory in which the database directory is created.
    :param data_table_name: Galaxy data table to register the entry under.
    :returns: data manager JSON fragment describing the new table entry.
    :raises subprocess.CalledProcessError: if ``kraken2-build`` exits non-zero.
    """
    now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    special_database_names = {
        "rdp": "RDP",
        "greengenes": "Greengenes",
        "silva": "Silva",
    }

    # The value doubles as the on-disk directory name, so keep it path-safe.
    database_value = "_".join([
        now,
        kraken2_args["special_database_type"],
        "kmer-len", str(kraken2_args["kmer_len"]),
        "minimizer-len", str(kraken2_args["minimizer_len"]),
        "minimizer-spaces", str(kraken2_args["minimizer_spaces"]),
        "load-factor", str(kraken2_args["load_factor"]),
    ])

    # Human-readable label shown in the Galaxy data table.
    # Fixed unbalanced parentheses: minimizer-spaces closed the paren AND
    # load-factor closed it again; now only the final parameter closes it.
    database_name = " ".join([
        special_database_names[kraken2_args["special_database_type"]],
        "(Created:",
        now + ",",
        "kmer-len=" + str(kraken2_args["kmer_len"]) + ",",
        "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",",
        "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ",",
        "load-factor=" + str(kraken2_args["load_factor"]) + ")",
    ])

    database_path = database_value

    args = [
        '--threads', str(kraken2_args["threads"]),
        '--special', kraken2_args["special_database_type"],
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
        '--load-factor', str(kraken2_args["load_factor"]),
        '--db', database_path
    ]

    subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    if kraken2_args["clean"]:
        # Remove intermediate build files to reduce the installed footprint.
        args = [
            '--threads', str(kraken2_args["threads"]),
            '--clean',
            '--db', database_path
        ]

        subprocess.check_call(['kraken2-build'] + args, cwd=target_directory)

    data_table_entry = {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }

    return data_table_entry
| 324 | |
| 325 | |
def kraken2_build_custom(kraken2_args, custom_database_name, custom_source_info, target_directory, data_table_name=DATA_TABLE_NAME):
    """Build a custom Kraken2 database from a user-supplied FASTA file.

    Runs the standard kraken2-build pipeline: download taxonomy, add the
    FASTA to the library, build, and optionally clean.

    :param kraken2_args: dict with keys ``custom_fasta``, ``skip_maps``,
        ``kmer_len``, ``minimizer_len``, ``minimizer_spaces``,
        ``load_factor``, ``threads``, ``clean``.
    :param custom_database_name: user-chosen display name for the database.
    :param custom_source_info: free-text description of the data source.
    :param target_directory: directory in which the database directory is created.
    :param data_table_name: Galaxy data table to register the entry under.
    :returns: data manager JSON fragment describing the new table entry.
    :raises subprocess.CalledProcessError: if any kraken2-build step fails.
    """
    timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ")

    # Sanitize the user-supplied name: the value doubles as a directory name.
    safe_name = re.sub(r'[^\w_.-]+', '_', custom_database_name).strip('_')
    database_value = "_".join([
        timestamp,
        safe_name,
        "kmer-len", str(kraken2_args["kmer_len"]),
        "minimizer-len", str(kraken2_args["minimizer_len"]),
        "minimizer-spaces", str(kraken2_args["minimizer_spaces"]),
        "load-factor", str(kraken2_args["load_factor"]),
    ])

    # Human-readable label shown in the Galaxy data table.
    database_name = " ".join([
        custom_database_name,
        "(" + custom_source_info + ",",
        "kmer-len=" + str(kraken2_args["kmer_len"]) + ",",
        "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",",
        "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ",",
        "load-factor=" + str(kraken2_args["load_factor"]) + ")",
    ])

    database_path = database_value
    threads_opt = ['--threads', str(kraken2_args["threads"])]

    # Stage 1: fetch the NCBI taxonomy (optionally without accession maps).
    taxonomy_stage = threads_opt + [
        '--download-taxonomy',
        '--db', database_path,
    ]
    if kraken2_args['skip_maps']:
        taxonomy_stage.append('--skip-maps')

    # Stage 2: register the user's FASTA in the database library.
    library_stage = threads_opt + [
        '--add-to-library', kraken2_args["custom_fasta"],
        '--db', database_path,
    ]

    # Stage 3: build the actual index with the requested parameters.
    build_stage = threads_opt + [
        '--build',
        '--kmer-len', str(kraken2_args["kmer_len"]),
        '--minimizer-len', str(kraken2_args["minimizer_len"]),
        '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]),
        '--load-factor', str(kraken2_args["load_factor"]),
        '--db', database_path,
    ]

    stages = [taxonomy_stage, library_stage, build_stage]
    if kraken2_args["clean"]:
        # Optional stage 4: drop intermediates to shrink the installed size.
        stages.append(threads_opt + ['--clean', '--db', database_path])

    for stage_args in stages:
        subprocess.check_call(['kraken2-build'] + stage_args, cwd=target_directory)

    return {
        'data_tables': {
            data_table_name: [
                {
                    "value": database_value,
                    "name": database_name,
                    "path": database_path,
                }
            ]
        }
    }
| 402 | |
| 403 | |
def main():
    """CLI entry point for the Galaxy Kraken2 data manager.

    Parses the command line, reads the Galaxy data manager input JSON
    (first positional argument) to find the output directory, dispatches
    to the appropriate builder, and overwrites the same JSON file with
    the resulting data table entry.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('data_manager_json')
    parser.add_argument('--kmer-len', dest='kmer_len', type=int, default=35, help='kmer length')
    parser.add_argument('--minimizer-len', dest='minimizer_len', type=int, default=31, help='minimizer length')
    # type=int added for consistency with --kmer-len (previously parsed as
    # str from the CLI while defaulting to int; values only pass through
    # str() downstream, so this is backward-compatible).
    parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', type=int, default=6, help='minimizer spaces')
    parser.add_argument('--load-factor', dest='load_factor', type=float, default=0.7, help='load factor')
    # type=int added: thread count is numeric (same rationale as above).
    parser.add_argument('--threads', dest='threads', type=int, default=1, help='threads')
    parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build')
    parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken)')
    parser.add_argument('--prebuilt-db', dest='prebuilt_db', type=StandardPrebuiltSizes, choices=list(StandardPrebuiltSizes), help='Prebuilt database to download. Only applies to --database-type standard_prebuilt or special_prebuilt.')
    parser.add_argument('--prebuilt-date', dest='prebuilt_date', help='Database build date (YYYY-MM-DD). Only applies to --database-type standard_prebuilt.')
    parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)')
    parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)')
    parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)')
    parser.add_argument('--custom-source-info', dest='custom_source_info', help='Description of how this build has been sourced (only applies to --database-type custom)')
    parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='')
    parser.add_argument('--clean', dest='clean', action='store_true', help='Clean up extra files')
    args = parser.parse_args()

    with open(args.data_manager_json) as fh:
        data_manager_input = json.load(fh)

    # Galaxy supplies the directory the database must be installed into.
    target_directory = data_manager_input['output_data'][0]['extra_files_path']

    # Idempotent creation; replaces the manual errno.EEXIST/isdir dance.
    # Still raises if the path exists but is not a directory.
    os.makedirs(target_directory, exist_ok=True)

    data_manager_output = {}

    if str(args.database_type) == 'standard_local_build':
        kraken2_args = {
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "load_factor": args.load_factor,
            "threads": args.threads,
            "clean": args.clean,
        }
        data_manager_output = kraken2_build_standard(
            kraken2_args,
            target_directory,
        )
    elif str(args.database_type) in ('standard_prebuilt', 'special_prebuilt'):
        data_manager_output = kraken2_build_standard_prebuilt(
            str(args.prebuilt_db),
            str(args.prebuilt_date),
            target_directory
        )
    elif str(args.database_type) == 'minikraken':
        data_manager_output = kraken2_build_minikraken(
            str(args.minikraken2_version),
            target_directory
        )
    elif str(args.database_type) == 'special':
        kraken2_args = {
            "special_database_type": str(args.special_database_type),
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "load_factor": args.load_factor,
            "threads": args.threads,
            "clean": args.clean,
        }
        data_manager_output = kraken2_build_special(
            kraken2_args,
            target_directory,
        )
    elif str(args.database_type) == 'custom':
        kraken2_args = {
            "custom_fasta": args.custom_fasta,
            "skip_maps": args.skip_maps,
            "kmer_len": args.kmer_len,
            "minimizer_len": args.minimizer_len,
            "minimizer_spaces": args.minimizer_spaces,
            "load_factor": args.load_factor,
            "threads": args.threads,
            "clean": args.clean,
        }
        data_manager_output = kraken2_build_custom(
            kraken2_args,
            args.custom_database_name,
            args.custom_source_info,
            target_directory,
        )
    else:
        # argparse choices should make this unreachable; kept as a guard.
        sys.exit("Invalid database type")

    # Write the data table entry back for Galaxy to pick up.
    with open(args.data_manager_json, 'w') as fh:
        json.dump(data_manager_output, fh, sort_keys=True)
| 499 | |
| 500 | |
# Script entry point: run the data manager CLI when executed directly.
if __name__ == "__main__":
    main()
