Mercurial > repos > peter-waltman > ucsc_cluster_tools2
changeset 2:b8f262149ee2 draft
Deleted selected files
author | peter-waltman |
---|---|
date | Fri, 01 Mar 2013 10:17:24 -0500 |
parents | dddfeedb85af |
children | 563832f48c08 |
files | cluster.tools/format.raw.TCGA.clinical.data.R cluster.tools/format.raw.TCGA.clinical.data.py cluster.tools/format.raw.TCGA.clinical.data.xml |
diffstat | 3 files changed, 0 insertions(+), 219 deletions(-) [+] |
line wrap: on
line diff
--- a/cluster.tools/format.raw.TCGA.clinical.data.R Fri Mar 01 10:16:53 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,184 +0,0 @@ -#!/usr/bin/env Rscript -## -## formats raw clinical data from TCGA to contain a single status & time colums -## -## Input (required): -## - clinical data -## Input (optional): -## - status & time columns: (NOT USED IN THIS SCRIPT - see comment below) -## ideally, a better design would allow a user to specify 1 or more columns -## to check for the status & time columns - however, due to the necessities -## required to pre-process the TCGA clinical data, the script would not be -## generalizeable - and for this reason, the TCGA columns are hard-coded. -## -## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time -## -## Date: August 21, 2012 -## Author: Peter Waltman -## - -##usage, options and doc goes here -argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data -and generates a KM-plot, along with the log-rank p-values - - Usage: - format.raw.TCGA.clinical.data.R -c <clinical.file> - Options: - -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time)) - ") -args <- commandArgs(TRUE) -if ( length( args ) == 1 && args =="--help") { - write(argspec, stderr()) - q(); -} - -lib.load.quiet <- function( package ) { - package <- as.character(substitute(package)) - suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) -} -lib.load.quiet(getopt) - -spec <- matrix( c( "clinical.fname", "d", 1, "character", - "output.fname", "o", 2, "character" - ), - ncol=4, - byrow=TRUE - ) -opt <- getopt( spec=spec ) - -##set some reasonable defaults for the options that are needed, -##but were not specified. -if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.clinical.data" ) } - -##orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE, row.names=1 ) -orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE ) -orig.clinical.data <- unique( orig.clinical.data ) -rownames( orig.clinical.data ) <- orig.clinical.data[,1] -orig.clinical.data <- orig.clinical.data[, -1 ] - -## ugh, some TCGA data sets have all NAs in the "days_to_..." columns -if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) { - time.cols <- c( "days_to_death", "days_to_last_followup", "days_to_last_known_alive" ) -} else { - time.cols <- c( "days_to_death", "days_to_last_followup" ) -} -good.samps <- ! apply( orig.clinical.data[, time.cols ], 1, function(x) all( is.na(x) ) | all( x <= 0, na.rm=T ) ) - -orig.clinical.data <- orig.clinical.data[ good.samps, ] - -if ( is.null(opt$status.column ) ) { - status.colname <- "vital_status" - if ( status.colname %in% colnames( orig.clinical.data ) ) { - opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname ) - clinical.data <- orig.clinical.data[ , opt$status.column ] - } - else { - status.colname <- "days_to_death" - if ( status.colname %in% colnames( orig.clinical.data ) ) { - opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname ) - clinical.data <- orig.clinical.data[ , opt$status.column ] - } - else { - stop( "can't find a valid entry with status info - have tried vital_status & days_to_death\n" ) - } - } - clinical.data <- as.numeric( ! grepl( "(LIVING|Not)", clinical.data ) ) -} -if ( is.null(opt$time.column ) ) { - time.colname <- "CDE.clinical_time" - - if ( time.colname %in% colnames( orig.clinical.data ) ) { - opt$time.column <- which( colnames( orig.clinical.data ) %in% time.colname ) - clinical.data <- cbind( clinical.data, - as.numeric( orig.clinical.data[, opt$time.column ] ) ) - } - else { - dec.mat <- matrix( NA, - nc=length( time.cols ), - nr=nrow( orig.clinical.data ), - dimnames=list( rownames( orig.clinical.data ), - time.cols ) - ) - for ( cname in colnames( dec.mat ) ) { - if ( cname %in% colnames( orig.clinical.data ) ) { - dec.mat[, cname ] <- as.numeric( orig.clinical.data[, cname ] ) - } - } - - - - if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) { - - opt$time.column <- sapply( 1:length( clinical.data ), - function(i) { - if ( clinical.data[i] ) { - ## this is a deceased sample - return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ), - dec.mat[ i, "days_to_death" ], - ifelse( ( !is.na( dec.mat[ i, "days_to_last_known_alive" ] ) ), - dec.mat[ i, "days_to_last_known_alive" ], - dec.mat[ i, "days_to_last_followup" ] ) ) ) - - } - else { - return( max( dec.mat[ i, c( "days_to_last_followup","days_to_last_known_alive") ], na.rm=T ) ) - } - } - ) - } else { - opt$time.column <- sapply( 1:length( clinical.data ), - function(i) { - if ( clinical.data[i] ) { - ## this is a deceased sample - return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ), - dec.mat[ i, "days_to_death" ], - dec.mat[ i, "days_to_last_followup" ] ) ) - - } - else { - return( max( dec.mat[ i, c( "days_to_last_followup") ], na.rm=T ) ) - } - } - ) - } - - - clinical.data <- cbind( clinical.data, - as.numeric( opt$time.column ) ) - } -} - -clinical.data <- as.data.frame( clinical.data ) -colnames( clinical.data ) <- c( "status", "time" ) -rownames( clinical.data ) <- rownames( orig.clinical.data ) - - -## check to make sure that the id's are sync'd correctly -## the default format is to use hyphens to separate the elt's of the name -## and to only use the 1st 3 elements of the name -## so we check to see if they're using something else as separators and/or using more than 3 elts -reformat.ids <- function( ids ) { - - if ( grepl( "TCGA\\.", ids[1] ) ) { - ids <- sapply( strsplit( ids, "\\." ), function(x) paste( x[1:3], collapse="-" ) ) - } else { - ## do this just in case there's more than 3 elements to the names - if ( grepl( "TCGA-", ids[1] ) ) { - ids <- sapply( strsplit( ids, "-" ), function(x) paste( x[1:min( c(3,length(x) ) )], collapse="-" ) ) - } - } - return( ids ) -} - - -new.samp.ids <- reformat.ids( rownames( clinical.data ) ) -if ( any( duplicated( new.samp.ids ) ) ) { - ## in some cases, we have duplicate sample ids in the raw data after we truncate to - ## the 1st 3 elts in the barcode, so just simplify the data - uniqs <- ! duplicated( new.samp.ids ) - clinical.data <- clinical.data[ uniqs, ] - new.samp.ids <- new.samp.ids[ uniqs ] -} - -rownames( clinical.data ) <- new.samp.ids -write.table( clinical.data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )
--- a/cluster.tools/format.raw.TCGA.clinical.data.py Fri Mar 01 10:16:53 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ -#!/usr/bin/env python -import os -import sys -import subprocess - -select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "format.raw.TCGA.clinical.data.R") - -cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] - -proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE ) -(stdoutdata, stderrdata) = proc.communicate() -if proc.returncode: - sys.stderr.write(stderrdata) -sys.stdout.write(stdoutdata)
--- a/cluster.tools/format.raw.TCGA.clinical.data.xml Fri Mar 01 10:16:53 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -<tool id="format_raw_TCGA_clinical_data" name="Format Raw TCGA Clinical Data" force_history_refresh="True"> - <command interpreter="python">format.raw.TCGA.clinical.data.py --d $dataset --o ${output} - -</command> - <inputs> - <param name="dataset" type="data" format='tabular' label="Raw Clinical Data"/> - </inputs> - <outputs> - <data format="tabular" name="output" label="Formatted Clinical Data"/> - </outputs> -<help> -.. class:: infomark - -**Format Raw TCGA Clinical Data** - Tool to convert a raw clinical TCGA data file into a the format expected by the Survival Analysis tools - -**OUTPUT:** A new clinical data file that is a 2 column, tab-delimited file of the format that is expected by the Survival Analysis tools - -</help> -</tool>