changeset 83:2f49b6bddccc draft

Uploaded
author greg
date Thu, 04 Jan 2018 14:44:38 -0500
parents cf0a62d084dc
children daeba7e9dd11
files ideas_genome_tracks.R
diffstat 1 files changed, 54 insertions(+), 22 deletions(-) [+]
line wrap: on
line diff
--- a/ideas_genome_tracks.R	Thu Jan 04 14:44:24 2018 -0500
+++ b/ideas_genome_tracks.R	Thu Jan 04 14:44:38 2018 -0500
@@ -4,22 +4,22 @@
 suppressPackageStartupMessages(library("optparse"))
 
 option_list <- list(
-    make_option(c("--build"), action="store", dest="build", help="Genome build"),
-    make_option(c("--chrom_len_file"), action="store", dest="chrom_len_file", help="Chromosome length file"),
-    make_option(c("--email"),  action="store", dest="email", help="User email address"),
-    make_option(c("--galaxy_url"),  action="store", dest="galaxy_url", help="Galaxy instance base URL"),
-    make_option(c("--hub_long_label"), action="store", dest="hub_long_label", help="Hub long label"),
-    make_option(c("--hub_name"),  action="store", dest="hub_name", default=NULL, help="Hub name without spaces"),
-    make_option(c("--hub_short_label"), action="store", dest="hub_short_label", help="Hub short label"),
-    make_option(c("--input_dir_para"), action="store", dest="input_dir_para", help="Directory containing .para outputs from IDEAS"),
-    make_option(c("--input_dir_state"), action="store", dest="input_dir_state", help="Directory containing .state outputs from IDEAS"),
-    make_option(c("--output_trackhub"),  action="store", dest="output_trackhub", help="Output hub file"),
-    make_option(c("--output_trackhub_files_path"),  action="store", dest="output_trackhub_files_path", help="Output hub extra files path"),
-    make_option(c("--output_trackhub_id"),  action="store", dest="output_trackhub_id", help="Encoded output_trackhub dataset id"),
-    make_option(c("--script_dir"), action="store", dest="script_dir", help="R script source directory"),
-    make_option(c("--state_colors"), action="store", dest="state_colors", default=NULL, help="List of state colors"),
-    make_option(c("--state_indexes"), action="store", dest="state_indexes", default=NULL, help="List of state indexes"),
-    make_option(c("--state_names"), action="store", dest="state_names", default=NULL, help="List of state names")
+            make_option(c("--build"), action="store", dest="build", help="Genome build"),
+            make_option(c("--chrom_len_file"), action="store", dest="chrom_len_file", help="Chromosome length file"),
+            make_option(c("--email"),  action="store", dest="email", help="User email address"),
+            make_option(c("--galaxy_url"),  action="store", dest="galaxy_url", help="Galaxy instance base URL"),
+            make_option(c("--hub_long_label"), action="store", dest="hub_long_label", help="Hub long label"),
+            make_option(c("--hub_name"),  action="store", dest="hub_name", default=NULL, help="Hub name without spaces"),
+            make_option(c("--hub_short_label"), action="store", dest="hub_short_label", help="Hub short label"),
+            make_option(c("--input_dir_para"), action="store", dest="input_dir_para", help="Directory containing .para outputs from IDEAS"),
+            make_option(c("--input_dir_state"), action="store", dest="input_dir_state", help="Directory containing .state outputs from IDEAS"),
+            make_option(c("--output_trackhub"),  action="store", dest="output_trackhub", help="Output hub file"),
+            make_option(c("--output_trackhub_files_path"),  action="store", dest="output_trackhub_files_path", help="Output hub extra files path"),
+            make_option(c("--output_trackhub_id"),  action="store", dest="output_trackhub_id", help="Encoded output_trackhub dataset id"),
+            make_option(c("--script_dir"), action="store", dest="script_dir", help="R script source directory"),
+            make_option(c("--state_colors"), action="store", dest="state_colors", default=NULL, help="List of state colors"),
+            make_option(c("--state_indexes"), action="store", dest="state_indexes", default=NULL, help="List of state indexes"),
+            make_option(c("--state_names"), action="store", dest="state_names", default=NULL, help="List of state names")
 )
 
 parser <- OptionParser(usage="%prog [options] file", option_list=option_list)
@@ -41,7 +41,8 @@
 create_cells = function(input_dir_state, chrom_len_file, base_track_file_name, state_indexes, state_names, state_colors) {
     # Create everything needed, including the bigbed file,
     # to render the tracks within the UCSC track hub.
-    state_files <- list.files(path=input_dir_state, full.names=TRUE);
+    state_files = list.files(path=input_dir_state, full.names=TRUE);
+    cell_type_names = get_cell_type_names(state_files[1]);
     genome_size = read.table(chrom_len_file);
     g = NULL;
     for(i in state_files) {
@@ -70,8 +71,9 @@
     posst = as.numeric(g1[,3]);
     posed = as.numeric(g1[,4]);
     state = as.matrix(g1[,5:(dim(g1)[2]-1)]);
-    state_name = 0:max(state);
     L = dim(g1)[1];
+    # Here n will be the same length as the
+    # list of cell_type_names defined above.
     n = dim(state)[2];
     cells = as.character(colnames(g1)[5:(dim(g1)[2]-1)]);
     options(scipen=999);
@@ -79,6 +81,7 @@
     tt = c(tt, which(posst[2:L]!=posed[2:L-1]));
     tt = sort(unique(tt));
     for(i in 1:n) {
+        state_name = cell_type_names[i];
         tstate = state[,i];
         t = c(tt, which(tstate[2:L]!=tstate[2:L-1]));
         t = sort(unique(t));
@@ -88,7 +91,7 @@
         track_file_name_bed_unsorted <- get_track_file_name(base_track_file_name, i, "bed_unsorted");
         track_file_name_bed <- get_track_file_name(base_track_file_name, i, "bed");
         track_file_name_bigbed <- get_track_file_name(base_track_file_name, i, "bigbed");
-        x = cbind(np[, 1:3], state_name[as.integer(np[,4])+1], 1000, ".", np[,2:3]);
+        x = cbind(np[, 1:3], state_name, 1000, ".", np[,2:3]);
         write.table(as.matrix(x), track_file_name_bed_unsorted, quote=F, row.names=F, col.names=F);
         cmd = paste("LC_COLLATE=C sort -k1,1 -k2,2n < ", track_file_name_bed_unsorted, " > ", track_file_name_bed);
         system(cmd);
@@ -125,7 +128,7 @@
             s_indexes = c(s_indexes, as.integer(item));
         }
         # Split state_names into a list of strings.
-        name_str = as.character(state_colors);
+        name_str = as.character(state_names);
         items = strsplit(name_str, ",");
         item_index = 0;
         for (item in items) {
@@ -145,8 +148,16 @@
     }
     base_track_file_name <- paste(tracks_dir, hub_name, sep="");
     cells = create_cells(input_dir_state, chrom_len_file, base_track_file_name, s_indexes, s_names, s_colors);
+    cell_info = cbind(cells, cells, cells, "#000000");
+    cell_info = array(cell_info, dim=c(length(cells), 4));
+    cell_info = as.matrix(cell_info);
     track_db = NULL;
     for (i in 1:length(cells)) {
+        ii = which(cells[i] == cell_info[,1]);
+        if(length(ii) == 0) {
+            next;
+        }
+        ii = ii[1];
         # Get the color for the current state.
         color <- paste(c(col2rgb(s_colors[i])), collapse=",");
         # Get the bigDataUrl.
@@ -155,8 +166,8 @@
         track_db = c(track_db, paste("track ", hub_name, "_track_", i, sep=""));
         track_db = c(track_db, "type bigBed");
         track_db = c(track_db, paste("bigDataUrl", big_data_url, sep=" "));
-        track_db = c(track_db, paste("shortLabel", hub_short_label, sep=" "));
-        track_db = c(track_db, paste("longLabel", hub_long_label, sep=" "));
+        track_db = c(track_db, paste("shortLabel", cell_info[ii, 2], sep=" "));
+        track_db = c(track_db, paste("longLabel", paste(hub_name, cell_info[ii, 3], sep=" ")));
         track_db = c(track_db, paste("priority", i));
         track_db = c(track_db, "itemRgb on");
         track_db = c(track_db, "maxItems 100000");
@@ -173,6 +184,27 @@
     return(s)
 }
 
+get_cell_type_names = function(state_file) {
+    # The first line of a state file is a comment
+    # that looks something like this:
+    # ID CHR POSst POSed E001 E002 PosClass
+    # The cell type names are the elemets whose
+    # 1-based indexes start at 5 and end with -1,
+    # which in the above case are E001 and E002.
+    fh = file(state_file,"r");
+    line = readLines(fh, n=1);
+    close(fh);
+    # Split line into a list of strings.
+    items = strsplit(line, "\\s+")[[1]];
+    # Extract the cell type names into a list.
+    last_cell_type_name_index = length(items) -1;
+    cell_type_names = c();
+    for (i in 5:last_cell_type_name_index) {
+        cell_type_names = c(cell_type_names, items[i]);
+    }
+    return(cell_type_names);
+}
+
 get_track_file_name = function(base_track_file_name, index, ext) {
     track_file_name <- paste(base_track_file_name, index, ext, sep=".");
     return(track_file_name);