Mercurial > repos > peter-waltman > ucsc_cluster_tools2
comparison cluster.tools/format.raw.TCGA.RNASeq.data.R @ 1:dddfeedb85af draft
Uploaded
| author | peter-waltman |
|---|---|
| date | Fri, 01 Mar 2013 10:16:53 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:0decf3fd54bc | 1:dddfeedb85af |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 ## | |
| 3 ## formats raw clinical data from TCGA to contain a single status & time colums | |
| 4 ## | |
| 5 ## Input (required): | |
| 6 ## - clinical data | |
| 7 ## Input (optional): | |
| 8 ## - status & time columns: (NOT USED IN THIS SCRIPT - see comment below) | |
| 9 ## ideally, a better design would allow a user to specify 1 or more columns | |
| 10 ## to check for the status & time columns - however, due to the necessities | |
| 11 ## required to pre-process the TCGA clinical data, the script would not be | |
| 12 ## generalizeable - and for this reason, the TCGA columns are hard-coded. | |
| 13 ## | |
| 14 ## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time | |
| 15 ## | |
| 16 ## Date: August 21, 2012 | |
| 17 ## Author: Peter Waltman | |
| 18 ## | |
| 19 | |
| 20 ##usage, options and doc goes here | |
| 21 argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data | |
| 22 and generates a KM-plot, along with the log-rank p-values | |
| 23 | |
| 24 Usage: | |
| 25 format.raw.TCGA.clinical.data.R -c <clinical.file> | |
| 26 Options: | |
| 27 -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time)) | |
| 28 ") | |
| 29 args <- commandArgs(TRUE) | |
| 30 if ( length( args ) == 1 && args =="--help") { | |
| 31 write(argspec, stderr()) | |
| 32 q(); | |
| 33 } | |
| 34 | |
| 35 ## some helper fn's | |
| 36 write.2.tab <- function( mat, | |
| 37 fname ) { | |
| 38 mat <- rbind( colnames( mat ), mat ) | |
| 39 mat <- cbind( c( "ID", rownames( mat )[-1] ), | |
| 40 mat ) | |
| 41 write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) | |
| 42 } | |
| 43 | |
| 44 fix.genes <- function( mat ) { | |
| 45 ## filter out the unknowns | |
| 46 mat <- mat[ ( ! grepl( "^\\?", rownames( mat ) ) ), ] | |
| 47 | |
| 48 genes <- rownames( mat ) | |
| 49 ## select the HUGO name portion of the "gene-names" that are in the TCGA matrices | |
| 50 genes <- sapply( strsplit( genes, "\\|" ), function(x) x[1] ) | |
| 51 | |
| 52 if ( any( duplicated( genes ) ) ) { | |
| 53 dupes <- unique( genes[ duplicated( genes ) ] ) | |
| 54 no.duped.mat <- which( ! genes %in% dupes ) | |
| 55 no.duped.mat <- mat[ no.duped.mat, , drop=FALSE ] | |
| 56 | |
| 57 ## now merge in the duplicates | |
| 58 for ( dup in dupes ) { | |
| 59 duped.mat <- mat[ grepl( paste( "^", dup, sep="" ), rownames( mat ) ), ] | |
| 60 duped.mat <- apply( duped.mat, 2, mean, na.rm=TRUE ) | |
| 61 no.duped.mat <- rbind( no.duped.mat, duped.mat ) | |
| 62 ## weird bit of follow up to make sure the new row/gene is named correctly | |
| 63 rownames( no.duped.mat ) <- c( rownames( no.duped.mat )[ -nrow(no.duped.mat) ], | |
| 64 paste( dup, "|0000", sep="" ) ) | |
| 65 } | |
| 66 mat <- no.duped.mat; rm( no.duped.mat ); gc() | |
| 67 } | |
| 68 | |
| 69 rownames( mat ) <- sapply( strsplit( rownames( mat ), "\\|" ), function(x) x[1] ) | |
| 70 return( mat ) | |
| 71 } | |
| 72 | |
| 73 fix.sample.ids <- function( mat ) { | |
| 74 colnames( mat ) <- gsub( "\\.", "-", colnames( mat ) ) | |
| 75 return( mat ) | |
| 76 } | |
| 77 | |
| 78 lib.load.quiet <- function( package ) { | |
| 79 package <- as.character(substitute(package)) | |
| 80 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) | |
| 81 } | |
| 82 lib.load.quiet(getopt) | |
| 83 | |
| 84 spec <- matrix( c( "dataset", "d", 1, "character", | |
| 85 "log.tranform", "l", 2, "character", | |
| 86 "filter.low.variant", "f", 2, "integer", | |
| 87 "output.fname", "o", 2, "character" | |
| 88 ), | |
| 89 ncol=4, | |
| 90 byrow=TRUE | |
| 91 ) | |
| 92 opt <- getopt( spec=spec ) | |
| 93 | |
| 94 ##set some reasonable defaults for the options that are needed, | |
| 95 ##but were not specified. | |
| 96 if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.RNASeq.data" ) } | |
| 97 if ( is.null(opt$log.transform ) ) { | |
| 98 opt$log.transform <- TRUE | |
| 99 } else { | |
| 100 opt$log.transform <- ( tolower( opt$log.transform ) %in% "yes" ) | |
| 101 } | |
| 102 | |
| 103 if ( is.null(opt$filter.low.variant ) ) { opt$log.transform <- 10 } | |
| 104 | |
| 105 | |
| 106 data <- as.matrix( read.delim( opt$dataset, row.names=1, check.names=F ) ) | |
| 107 data <- fix.genes( data ) | |
| 108 data <- fix.sample.ids( data ) | |
| 109 | |
| 110 if ( opt$filter.low.variant > 0 ) { | |
| 111 rpkm.meds <- apply( data, 1, median, na.rm=TRUE ) | |
| 112 rpkm.meds <- rpkm.meds >= opt$filter.low.variant | |
| 113 data <- data[ rpkm.meds, ] | |
| 114 } | |
| 115 | |
| 116 if ( opt$log.transform ) { | |
| 117 data <- log( data + 1 ) | |
| 118 } | |
| 119 | |
| 120 write.2.tab( data, opt$output.fname ) |
