Mercurial > repos > greg > ideas
comparison ideas.R @ 174:b0ca3591242e draft
Uploaded
| author | greg |
|---|---|
| date | Thu, 25 Jan 2018 11:53:15 -0500 |
| parents | 59ed3d424524 |
| children | 2b6b5e2769f5 |
comparison
equal
deleted
inserted
replaced
| 173:843bfa2dff2c | 174:b0ca3591242e |
|---|---|
| 2 | 2 |
| 3 suppressPackageStartupMessages(library("data.table")) | 3 suppressPackageStartupMessages(library("data.table")) |
| 4 suppressPackageStartupMessages(library("optparse")) | 4 suppressPackageStartupMessages(library("optparse")) |
| 5 | 5 |
| 6 option_list <- list( | 6 option_list <- list( |
| 7 make_option(c("--burnin_num"), action="store", dest="burnin_num", type="integer", help="Number of burnin steps"), | 7 make_option(c("--burnin_num"), action="store", dest="burnin_num", type="integer", help="Number of burnin steps"), |
| 8 make_option(c("--bychr"), action="store_true", dest="bychr", default=FALSE, help="Output chromosomes in separate files"), | 8 make_option(c("--bychr"), action="store_true", dest="bychr", default=FALSE, help="Output chromosomes in separate files"), |
| 9 make_option(c("--chrom_bed_input"), action="store", dest="chrom_bed_input", default=NULL, help="Chromosome windows positions file"), | 9 make_option(c("--chrom_bed_input"), action="store", dest="chrom_bed_input", default=NULL, help="Chromosome windows positions file"), |
| 10 make_option(c("--chromosome_windows"), action="store", dest="chromosome_windows", default=NULL, help="Windows positions by chroms config file"), | 10 make_option(c("--chromosome_windows"), action="store", dest="chromosome_windows", default=NULL, help="Windows positions by chroms config file"), |
| 11 make_option(c("--hp"), action="store_true", dest="hp", default=FALSE, help="Discourage state transition across chromosomes"), | 11 make_option(c("--hp"), action="store_true", dest="hp", default=FALSE, help="Discourage state transition across chromosomes"), |
| 12 make_option(c("--initial_states"), action="store", dest="initial_states", type="integer", default=NULL, help="Initial number of states"), | 12 make_option(c("--initial_states"), action="store", dest="initial_states", type="integer", default=NULL, help="Initial number of states"), |
| 13 make_option(c("--input"), action="store", dest="input", help="IdeasPre input dataset"), | 13 make_option(c("--input"), action="store", dest="input", help="IdeasPre input dataset"), |
| 14 make_option(c("--input_files_path"), action="store", dest="input_files_path", help="IdeasPre input dataset extra files path"), | 14 make_option(c("--input_files_path"), action="store", dest="input_files_path", help="IdeasPre input dataset extra files path"), |
| 15 make_option(c("--ideas_input_config"), action="store", dest="ideas_input_config", help="IDEAS_input_config file"), | 15 make_option(c("--ideas_input_config"), action="store", dest="ideas_input_config", help="IDEAS_input_config file"), |
| 16 make_option(c("--log2"), action="store", dest="log2", type="double", default=NULL, help="log2 transformation"), | 16 make_option(c("--log2"), action="store", dest="log2", type="double", default=NULL, help="log2 transformation"), |
| 17 make_option(c("--maxerr"), action="store", dest="maxerr", type="double", default=NULL, help="Maximum standard deviation for the emission Gaussian distribution"), | 17 make_option(c("--maxerr"), action="store", dest="maxerr", type="double", default=NULL, help="Maximum standard deviation for the emission Gaussian distribution"), |
| 18 make_option(c("--max_cell_type_clusters"), action="store", dest="max_cell_type_clusters", type="integer", default=NULL, help="Maximum number of cell type clusters allowed"), | 18 make_option(c("--max_cell_type_clusters"), action="store", dest="max_cell_type_clusters", type="integer", default=NULL, help="Maximum number of cell type clusters allowed"), |
| 19 make_option(c("--max_position_classes"), action="store", dest="max_position_classes", type="integer", default=NULL, help="Maximum number of position classes to be inferred"), | 19 make_option(c("--max_position_classes"), action="store", dest="max_position_classes", type="integer", default=NULL, help="Maximum number of position classes to be inferred"), |
| 20 make_option(c("--max_states"), action="store", dest="max_states", type="double", default=NULL, help="Maximum number of states to be inferred"), | 20 make_option(c("--max_states"), action="store", dest="max_states", type="double", default=NULL, help="Maximum number of states to be inferred"), |
| 21 make_option(c("--mcmc_num"), action="store", dest="mcmc_num", type="integer", help="Number of maximization steps"), | 21 make_option(c("--mcmc_num"), action="store", dest="mcmc_num", type="integer", help="Number of maximization steps"), |
| 22 make_option(c("--minerr"), action="store", dest="minerr", type="double", default=NULL, help="Minimum standard deviation for the emission Gaussian distribution"), | 22 make_option(c("--minerr"), action="store", dest="minerr", type="double", default=NULL, help="Minimum standard deviation for the emission Gaussian distribution"), |
| 23 make_option(c("--output_log"), action="store", dest="output_log", default=NULL, help="Output log file path"), | 23 make_option(c("--output_dir"), action="store", dest="output_dir", help="Output directory, used only if job ends in error and process log needs saving"), |
| 24 make_option(c("--prior_concentration"), action="store", dest="prior_concentration", type="double", default=NULL, help="Prior concentration"), | 24 make_option(c("--output_log"), action="store", dest="output_log", default=NULL, help="Output log file path"), |
| 25 make_option(c("--project_name"), action="store", dest="project_name", help="Outputs will have this base name"), | 25 make_option(c("--prior_concentration"), action="store", dest="prior_concentration", type="double", default=NULL, help="Prior concentration"), |
| 26 make_option(c("--rseed"), action="store", dest="rseed", type="integer", help="Seed for IDEAS model initialization"), | 26 make_option(c("--project_name"), action="store", dest="project_name", help="Outputs will have this base name"), |
| 27 make_option(c("--save_ideas_log"), action="store", dest="save_ideas_log", default=NULL, help="Flag to save IDEAS process log"), | 27 make_option(c("--rseed"), action="store", dest="rseed", type="integer", help="Seed for IDEAS model initialization"), |
| 28 make_option(c("--standardize_datasets"), action="store_true", dest="standardize_datasets", default=FALSE, help="Standardize all datasets"), | 28 make_option(c("--save_ideas_log"), action="store", dest="save_ideas_log", default=NULL, help="Flag to save IDEAS process log"), |
| 29 make_option(c("--thread"), action="store", dest="thread", type="integer", help="Process threads"), | 29 make_option(c("--standardize_datasets"), action="store_true", dest="standardize_datasets", default=FALSE, help="Standardize all datasets"), |
| 30 make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"), | 30 make_option(c("--thread"), action="store", dest="thread", type="integer", help="Process threads"), |
| 31 make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations") | 31 make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"), |
| 32 make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations") | |
| 32 ) | 33 ) |
| 33 | 34 |
| 34 parser <- OptionParser(usage="%prog [options] file", option_list=option_list) | 35 parser <- OptionParser(usage="%prog [options] file", option_list=option_list) |
| 35 args <- parse_args(parser, positional_arguments=TRUE) | 36 args <- parse_args(parser, positional_arguments=TRUE) |
| 36 opt <- args$options | 37 opt <- args$options |
| 37 | 38 |
| 38 add_output_redirect <- function(cmd, save_ideas_log, output_log, default_log_name) { | 39 add_output_redirect <- function(cmd, output_log) { |
| 39 if (is.null(save_ideas_log)) { | 40 new_cmd = c(cmd, "&>>", output_log); |
| 40 new_cmd = c(cmd, "&>>", default_log_name); | |
| 41 }else { | |
| 42 new_cmd = c(cmd, "&>>", output_log); | |
| 43 } | |
| 44 return(paste(new_cmd, collapse=" ")); | 41 return(paste(new_cmd, collapse=" ")); |
| 45 } | 42 } |
| 46 | 43 |
| 47 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) { | 44 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) { |
| 48 X = NULL; | 45 X = NULL; |
| 349 for (f in files) { | 346 for (f in files) { |
| 350 unlink(f); | 347 unlink(f); |
| 351 } | 348 } |
| 352 } | 349 } |
| 353 | 350 |
| 354 run_cmd <- function(cmd, save_ideas_log, output_log, default_log_name) { | 351 run_cmd <- function(cmd, save_ideas_log, output_log, output_dir) { |
| 352 cat("save_ideas_log: ", save_ideas_log, "\n"); | |
| 353 cat("output_log: ", output_log, "\n"); | |
| 354 cat("output_dir: ", output_dir, "\n"); | |
| 355 cat("\nRunning cmd:\n", cmd, "\n\n"); | |
| 355 rc = system(cmd); | 356 rc = system(cmd); |
| 356 if (rc != 0) { | 357 if (rc != 0) { |
| 357 if (is.null(save_ideas_log)) { | 358 if (is.null(save_ideas_log)) { |
| 358 file.rename(default_log_name, output_log); | 359 to_path = paste(output_dir, output_log, sep="/"); |
| 360 file.rename(output_log, to_path); | |
| 359 } | 361 } |
| 360 quit(save="no", status=rc); | 362 quit(save="no", status=rc); |
| 361 } | 363 } |
| 362 } | 364 } |
| 363 | 365 |
| 364 # Initialize values. | 366 # Initialize values. |
| 365 default_log_name = "ideas_log.txt"; | 367 if (is.null(opt$save_ideas_log)) { |
| 368 output_log = "ideas_log.txt"; | |
| 369 } else { | |
| 370 output_log = opt$output_log; | |
| 371 } | |
| 366 # Get full path of chromosomes.bed if not NULL. | 372 # Get full path of chromosomes.bed if not NULL. |
| 367 chrom_bed_input = get_file_path(opt$input_files_path, opt$chrom_bed_input); | 373 chrom_bed_input = get_file_path(opt$input_files_path, opt$chrom_bed_input); |
| 374 cat("chrom_bed_input: ", chrom_bed_input, "\n"); | |
| 368 # Get full path of chromosome_windows.txt if not NULL. | 375 # Get full path of chromosome_windows.txt if not NULL. |
| 369 chromosome_windows = get_file_path(opt$input_files_path, opt$chromosome_windows); | 376 chromosome_windows = get_file_path(opt$input_files_path, opt$chromosome_windows); |
| 377 cat("chromosome_windows: ", chromosome_windows, "\n"); | |
| 370 if (is.null(chromosome_windows)) { | 378 if (is.null(chromosome_windows)) { |
| 371 windows_by_chrom = NULL; | 379 windows_by_chrom = NULL; |
| 372 } else { | 380 } else { |
| 373 # Read chromosome_windows.txt into memory. | 381 # Read chromosome_windows.txt into memory. |
| 374 windows_by_chrom = get_windows_by_chrom(chromosome_windows); | 382 windows_by_chrom = get_windows_by_chrom(chromosome_windows); |
| 375 } | 383 } |
| 376 ideas_input_config = get_file_path(opt$input_files_path, opt$ideas_input_config); | 384 ideas_input_config = get_file_path(opt$input_files_path, opt$ideas_input_config); |
| 385 cat("ideas_input_config: ", ideas_input_config, "\n"); | |
| 377 base_cmd = get_base_cmd(ideas_input_config, chrom_bed_input, opt$training_iterations, opt$bychr, opt$hp, | 386 base_cmd = get_base_cmd(ideas_input_config, chrom_bed_input, opt$training_iterations, opt$bychr, opt$hp, |
| 378 opt$standardize_datasets, opt$log2, opt$max_states, opt$initial_states, opt$max_position_classes, | 387 opt$standardize_datasets, opt$log2, opt$max_states, opt$initial_states, opt$max_position_classes, |
| 379 opt$max_cell_type_clusters, opt$prior_concentration, opt$burnin_num, opt$mcmc_num, opt$minerr, | 388 opt$max_cell_type_clusters, opt$prior_concentration, opt$burnin_num, opt$mcmc_num, opt$minerr, |
| 380 opt$maxerr, opt$rseed, opt$thread); | 389 opt$maxerr, opt$rseed, opt$thread); |
| 390 cat("base_cmd: ", base_cmd, "\n"); | |
| 381 output_base_name = opt$project_name; | 391 output_base_name = opt$project_name; |
| 392 cat("output_base_name: ", output_base_name, "\n"); | |
| 382 | 393 |
| 383 if (is.null(opt$training_iterations)) { | 394 if (is.null(opt$training_iterations)) { |
| 384 # Not performing training. | 395 # Not performing training. |
| 385 if (is.null(windows_by_chrom)) { | 396 if (is.null(windows_by_chrom)) { |
| 386 # Not performing windows by chromosome. | 397 # Not performing windows by chromosome. |
| 387 output_name = output_base_name; | 398 output_name = output_base_name; |
| 388 cmd = paste(base_cmd, "-o", output_name, sep=" "); | 399 cmd = paste(base_cmd, "-o", output_name, sep=" "); |
| 389 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 400 cmd = add_output_redirect(cmd, output_log); |
| 390 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 401 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); |
| 391 } else { | 402 } else { |
| 392 # Performing windows by chromosome. | 403 # Performing windows by chromosome. |
| 393 for (i in 1:length(windows_by_chrom)) { | 404 for (i in 1:length(windows_by_chrom)) { |
| 394 line = windows_by_chrom[i]; | 405 line = windows_by_chrom[i]; |
| 395 items = strsplit(line, " ")[[1]]; | 406 items = strsplit(line, " ")[[1]]; |
| 397 window_start = items[2]; | 408 window_start = items[2]; |
| 398 window_end = items[3]; | 409 window_end = items[3]; |
| 399 output_name = paste(output_base_name, chrom, sep="."); | 410 output_name = paste(output_base_name, chrom, sep="."); |
| 400 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); | 411 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); |
| 401 cmd = paste(cmd, "-o", output_name, sep=" "); | 412 cmd = paste(cmd, "-o", output_name, sep=" "); |
| 402 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 413 cmd = add_output_redirect(cmd, output_log); |
| 403 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 414 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); |
| 404 } | 415 } |
| 405 } | 416 } |
| 406 } else { | 417 } else { |
| 407 # performing training. | 418 # performing training. |
| 408 output_para0 = paste(output_base_name, "para0", sep="."); | 419 output_para0 = paste(output_base_name, "para0", sep="."); |
| 409 output_profile0 = paste(output_base_name, "profile0", sep="."); | 420 output_profile0 = paste(output_base_name, "profile0", sep="."); |
| 410 for (i in 1:opt$training_iterations) { | 421 for (i in 1:opt$training_iterations) { |
| 411 cmd = paste(base_cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" "); | 422 cmd = paste(base_cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" "); |
| 412 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 423 cmd = add_output_redirect(cmd, output_log); |
| 413 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 424 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); |
| 414 } | 425 } |
| 415 tpara = combine_state(paste(output_base_name, "tmp", (1:opt$training_iterations), "para", sep="."), mycut=0.5); | 426 tpara = combine_state(paste(output_base_name, "tmp", (1:opt$training_iterations), "para", sep="."), mycut=0.5); |
| 416 write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F); | 427 write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F); |
| 417 para = tpara$para; | 428 para = tpara$para; |
| 418 para = apply(para, 1, function(x){paste(x, collapse=" ")}); | 429 para = apply(para, 1, function(x){paste(x, collapse=" ")}); |
| 421 # Now run IDEAS based on the files produced during training. | 432 # Now run IDEAS based on the files produced during training. |
| 422 base_cmd = get_post_training_base_cmd(base_cmd, para); | 433 base_cmd = get_post_training_base_cmd(base_cmd, para); |
| 423 base_cmd = paste(base_cmd, "-otherpara", output_para0[[1]], output_profile0[[1]], sep=" "); | 434 base_cmd = paste(base_cmd, "-otherpara", output_para0[[1]], output_profile0[[1]], sep=" "); |
| 424 if (is.null(windows_by_chrom)) { | 435 if (is.null(windows_by_chrom)) { |
| 425 cmd = c(base_cmd, "-o", output_base_name); | 436 cmd = c(base_cmd, "-o", output_base_name); |
| 426 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 437 cmd = add_output_redirect(cmd, output_log); |
| 427 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 438 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); |
| 428 } else { | 439 } else { |
| 429 # Performing windows by chromosome. | 440 # Performing windows by chromosome. |
| 430 if (length(windows_by_chrom) == 1) { | 441 if (length(windows_by_chrom) == 1) { |
| 431 output_name = paste(output_base_name, i, sep="."); | 442 output_name = paste(output_base_name, i, sep="."); |
| 432 cmd = c(base_cmd, "-o", output_name); | 443 cmd = c(base_cmd, "-o", output_name); |
| 433 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 444 cmd = add_output_redirect(cmd, output_log); |
| 434 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 445 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); |
| 435 } else { | 446 } else { |
| 436 for (i in 1:length(windows_by_chrom)) { | 447 for (i in 1:length(windows_by_chrom)) { |
| 437 line = windows_by_chrom[i]; | 448 line = windows_by_chrom[i]; |
| 438 items = strsplit(line, " ")[[1]]; | 449 items = strsplit(line, " ")[[1]]; |
| 439 chrom = items[[1]]; | 450 chrom = items[[1]]; |
| 440 window_start = items[[2]]; | 451 window_start = items[[2]]; |
| 441 window_end = items[[3]]; | 452 window_end = items[[3]]; |
| 442 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); | 453 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); |
| 443 output_name = paste(output_base_name, chrom, sep="."); | 454 output_name = paste(output_base_name, chrom, sep="."); |
| 444 cmd = paste(cmd, "-o", output_name, sep=" "); | 455 cmd = paste(cmd, "-o", output_name, sep=" "); |
| 445 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 456 cmd = add_output_redirect(cmd, output_log); |
| 446 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 457 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); |
| 447 } | 458 } |
| 448 } | 459 } |
| 449 } | 460 } |
| 450 # Remove temporary outputs. | 461 # Remove temporary outputs. |
| 451 remove_files(path=".", pattern="tmp"); | 462 remove_files(path=".", pattern="tmp"); |
