Mercurial > repos > peter-waltman > ucsc_cluster_tools2

--- a/cluster.tools/format.raw.TCGA.clinical.data.R	Fri Mar 01 10:16:53 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,184 +0,0 @@
-#!/usr/bin/env Rscript
-##
-## formats raw clinical data from TCGA to contain a single status & time colums
-##
-## Input (required):
-##    - clinical data
-## Input (optional):
-##    - status & time columns: (NOT USED IN THIS SCRIPT - see comment below)
-##         ideally, a better design would allow a user to specify 1 or more columns
-##         to check for the status & time columns - however, due to the necessities
-##         required to pre-process the TCGA clinical data, the script would not be
-##         generalizeable - and for this reason, the TCGA columns are hard-coded.
-##
-## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time
-##
-## Date: August 21, 2012
-## Author: Peter Waltman
-##
-
-##usage, options and doc goes here
-argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data
-and generates a KM-plot, along with the log-rank p-values
-
-        Usage:
-                format.raw.TCGA.clinical.data.R -c <clinical.file>
-        Options:
-                -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time))
-              ")
-args <- commandArgs(TRUE)
-if ( length( args ) == 1 && args =="--help") {
-  write(argspec, stderr())
-  q();
-}
-
-lib.load.quiet <- function( package ) {
-   package <- as.character(substitute(package))
-   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
-}
-lib.load.quiet(getopt)
-
-spec <- matrix( c( "clinical.fname", "d", 1, "character",
-                   "output.fname",   "o", 2, "character"
-                  ),
-                ncol=4,
-                byrow=TRUE
-               )
-opt <- getopt( spec=spec )
-
-##set some reasonable defaults for the options that are needed,
-##but were not specified.
-if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.clinical.data" ) }
-
-##orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE, row.names=1 )
-orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE )
-orig.clinical.data <- unique( orig.clinical.data )
-rownames( orig.clinical.data ) <- orig.clinical.data[,1]
-orig.clinical.data <- orig.clinical.data[, -1 ]
-
-##  ugh, some TCGA data sets have all NAs in the "days_to_..." columns
-if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) {
-  time.cols <- c( "days_to_death", "days_to_last_followup", "days_to_last_known_alive" )
-} else {
-  time.cols <- c( "days_to_death", "days_to_last_followup"  )
-}
-good.samps <- ! apply( orig.clinical.data[, time.cols ], 1, function(x) all( is.na(x) ) | all( x <= 0, na.rm=T ) )
-
-orig.clinical.data <- orig.clinical.data[ good.samps, ]
-
-if ( is.null(opt$status.column ) ) {
-  status.colname <- "vital_status"
-  if ( status.colname %in% colnames( orig.clinical.data ) ) {
-    opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname )
-    clinical.data <- orig.clinical.data[ , opt$status.column ]
-  }
-  else {
-    status.colname <- "days_to_death"
-    if ( status.colname %in% colnames( orig.clinical.data ) ) {
-      opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname )
-      clinical.data <- orig.clinical.data[ , opt$status.column ]
-    }
-    else {
-      stop( "can't find a valid entry with status info - have tried vital_status & days_to_death\n" )
-    }
-  }
-  clinical.data <- as.numeric( ! grepl( "(LIVING|Not)", clinical.data ) )
-}
-if ( is.null(opt$time.column ) ) {
-  time.colname <- "CDE.clinical_time"
-
-  if ( time.colname %in% colnames( orig.clinical.data ) ) {
-    opt$time.column <- which( colnames( orig.clinical.data ) %in% time.colname )
-    clinical.data <- cbind( clinical.data,
-                           as.numeric( orig.clinical.data[, opt$time.column ] ) )
-  }
-  else {
-    dec.mat <- matrix( NA,
-                       nc=length( time.cols ),
-                       nr=nrow( orig.clinical.data ),
-                       dimnames=list( rownames( orig.clinical.data ),
-                                       time.cols )
-                      )
-    for ( cname in colnames( dec.mat ) ) {
-      if ( cname %in% colnames( orig.clinical.data ) ) {
-        dec.mat[, cname ] <- as.numeric( orig.clinical.data[, cname ] )
-      }
-    }
-
-
-
-    if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) {
-
-      opt$time.column <- sapply( 1:length( clinical.data ),
-                                 function(i) {
-                                   if ( clinical.data[i] ) {
-                                     ## this is a deceased sample
-                                     return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ),
-                                                     dec.mat[ i, "days_to_death" ],
-                                                     ifelse( ( !is.na( dec.mat[ i, "days_to_last_known_alive" ] ) ),
-                                                             dec.mat[ i, "days_to_last_known_alive" ],
-                                                             dec.mat[ i, "days_to_last_followup" ] ) ) )
-
-                                   }
-                                   else {
-                                     return( max( dec.mat[ i, c( "days_to_last_followup","days_to_last_known_alive") ], na.rm=T ) )
-                                   }
-                                 }
-                                )
-    } else {
-      opt$time.column <- sapply( 1:length( clinical.data ),
-                                 function(i) {
-                                   if ( clinical.data[i] ) {
-                                     ## this is a deceased sample
-                                     return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ),
-                                                     dec.mat[ i, "days_to_death" ],
-                                                     dec.mat[ i, "days_to_last_followup" ] ) )
-
-                                   }
-                                   else {
-                                     return( max( dec.mat[ i, c( "days_to_last_followup") ], na.rm=T ) )
-                                   }
-                                 }
-                                )
-    }
-
-
-    clinical.data <- cbind( clinical.data,
-                           as.numeric( opt$time.column ) )
-  }
-}
-
-clinical.data <- as.data.frame( clinical.data )
-colnames( clinical.data ) <- c( "status", "time" )
-rownames( clinical.data ) <- rownames( orig.clinical.data )
-
-
-##  check to make sure that the id's are sync'd correctly
-## the default format is to use hyphens to separate the elt's of the name
-## and to only use the 1st 3 elements of the name
-## so we check to see if they're using something else as separators and/or using more than 3 elts
-reformat.ids <- function( ids ) {
-
-  if ( grepl( "TCGA\\.", ids[1] ) ) {
-    ids <- sapply( strsplit( ids, "\\." ), function(x) paste( x[1:3], collapse="-" ) )
-  } else {
-    ## do this just in case there's more than 3 elements to the names
-    if ( grepl( "TCGA-", ids[1] ) ) {
-      ids <- sapply( strsplit( ids, "-" ), function(x) paste( x[1:min( c(3,length(x) ) )], collapse="-" ) )
-    }
-  }
-  return( ids )
-}
-
-
-new.samp.ids <- reformat.ids( rownames( clinical.data ) )
-if ( any( duplicated( new.samp.ids ) ) ) {
-  ## in some cases, we have duplicate sample ids in the raw data after we truncate to
-  ##   the 1st 3 elts in the barcode, so just simplify the data
-  uniqs <- ! duplicated( new.samp.ids )
-  clinical.data <- clinical.data[ uniqs, ]
-  new.samp.ids <- new.samp.ids[ uniqs ]
-}
-
-rownames( clinical.data ) <- new.samp.ids
-write.table( clinical.data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )
--- a/cluster.tools/format.raw.TCGA.clinical.data.py	Fri Mar 01 10:16:53 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,14 +0,0 @@
-#!/usr/bin/env python
-import os
-import sys
-import subprocess
-
-select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "format.raw.TCGA.clinical.data.R")
-
-cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:]
-
-proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
-(stdoutdata, stderrdata) = proc.communicate()
-if proc.returncode:
-	sys.stderr.write(stderrdata)
-sys.stdout.write(stdoutdata)
--- a/cluster.tools/format.raw.TCGA.clinical.data.xml	Fri Mar 01 10:16:53 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,21 +0,0 @@
-<tool id="format_raw_TCGA_clinical_data" name="Format Raw TCGA Clinical Data" force_history_refresh="True">
-    <command interpreter="python">format.raw.TCGA.clinical.data.py
--d $dataset
--o ${output}
-
-</command>
-    <inputs>
-    	<param name="dataset" type="data" format='tabular' label="Raw Clinical Data"/>
-    </inputs>
-    <outputs>
-        <data format="tabular" name="output" label="Formatted Clinical Data"/>
-    </outputs>
-<help>
-.. class:: infomark
-
-**Format Raw TCGA Clinical Data** - Tool to convert a raw clinical TCGA data file into a the format expected by the Survival Analysis tools
-
-**OUTPUT:**  A new clinical data file that is a 2 column, tab-delimited file of the format that is expected by the Survival Analysis tools
-
-</help>
-</tool>