diff QDNAseq.R @ 59:bfe9d9b7e261 draft

Uploaded
author stef
date Thu, 07 Aug 2014 11:02:25 -0400
parents c962b56a2dd8
children 2f0af8970aea
line wrap: on
line diff
--- a/QDNAseq.R	Thu Aug 07 11:02:15 2014 -0400
+++ b/QDNAseq.R	Thu Aug 07 11:02:25 2014 -0400
@@ -54,27 +54,27 @@
 ## --------------------
 ## copied code for extracting the regions by segment call status
 ## --------------------
-fuse.regions_test <- function(cgh, onlyCalled=T) {
-	if (ncol(cgh) > 1) stop('Please specify which sample...')
+fuseRegions <- function( cgh, onlyCalled=T ) {
+	if ( ncol(cgh) > 1 ) stop('Please specify which sample...')
 
-	x <- data.frame(cgh@featureData@data[,1:3], calls(cgh), copynumber(cgh), segmented(cgh), check.names=FALSE, stringsAsFactors=FALSE)
-	colnames( x ) <- c( "chr", "start", "end", "call", "log2", "segmentval" )
+	data <- data.frame( cgh@featureData@data[,1:3], calls(cgh), copynumber(cgh), segmented(cgh), check.names=FALSE, stringsAsFactors=FALSE)
+	colnames( data ) <- c( "chr", "start", "end", "call", "log2", "segmentval" )
 	
 	fused.data <- data.frame()
 	curr.bin <- 1
-	for (chr in unique(x$chr)) {
+	for ( chr in unique( data$chr ) ) {
 		
-		chr.data <- x[x$chr == chr,]
-		prev.bin <- curr.bin
-		prev.call <- chr.data[1, 'call']
-		prev.log2 <- chr.data[1, 'log2']
-		prev.segm <- chr.data[1, 'segmentval']
-		start <- chr.data[1, 'start']
+		chr.data  <- data[ data$chr == chr, ]
+		prev.bin  <- curr.bin
+		prev.call <- chr.data[ 1, 'call' ]
+		prev.log2 <- chr.data[ 1, 'log2' ]
+		prev.segm <- chr.data[ 1, 'segmentval' ]
+		start     <- chr.data[ 1, 'start' ]
 
 		if ( nrow(chr.data) > 1) {
 			for ( i in 2:nrow(chr.data) ) {
 
-				curr.bin <- curr.bin + 1
+				curr.bin  <- curr.bin + 1
 				curr.call <- chr.data[ i, 'call']
 				curr.segm <- chr.data[ i, 'segmentval']
 
@@ -86,15 +86,16 @@
 					}
 					prev.call <- curr.call
 					prev.segm <- curr.segm
-					prev.bin <- curr.bin
-					start <- chr.data[ i, 'start']
+					prev.bin  <- curr.bin
+					start     <- chr.data[ i, 'start']
 				}
 			}
 			fused.data <- rbind( fused.data, data.frame( chr=chr, start=start, end=chr.data[ i-1, 'end'], call=prev.call, segmentval=round(prev.segm, digits=DECIMALS) ) )
-		}else {
+		}else{
 			fused.data <- rbind( fused.data, data.frame( chr=chr, start=start, end=chr.data[ i-1, 'end'], call=prev.call, segmentval=round(prev.segm, digits=DECIMALS) ) )
 		}
 	}
+	## if requested remove the regions with call status 0 (= normal)
 	if ( onlyCalled == T ){
 		fused.data <- fused.data[ fused.data$call != 0, ]
 	}
@@ -125,25 +126,16 @@
 		regionCount <- nrow( regionsList[[sample]] )
 		
 		outSampleBase   <- paste( outputBasename, '_', sample, '_QDNAseqRegions', sep='')
-		outBedFile      <- paste( outSampleBase, '.bed', sep="" )
-		outBedPath      <- paste( outputDir, '/', outBedFile, sep="" )
 		outBedgraphFile <- paste( outSampleBase, '.bedGraph', sep="" )
 		outBedgraphPath <- paste( outputDir, '/', outBedgraphFile, sep="" )
 
-		## ---------- BED ----------
-		txt <- "#"
-		sink( outBedPath )
-			cat( txt )
-		sink()
-		write.table( regionsList[[sample]], outBedPath, quote=F, sep="\t", row.names=F, append=T)
-
 		## ---------- BEDGRAPH ----------
 		txt <- paste( "track type=bedGraph color=0,100,0 altColor=255,0,0 name=", sample,"_segmReg description=segmented_regions_from_QDNAseq\n", sep="")
 		sink( outBedgraphPath )
 			cat( txt )
 		sink()
 		write.table( regionsList[[sample]][,bedgraphColumns], outBedgraphPath, quote=F, sep="\t", row.names=F, append=T, col.names=F)
-		outFiles[[sample]] <- c( outBedFile, outBedgraphFile )
+		outFiles[[sample]] <- c( outBedgraphFile )
 	}
 	outFiles
 }
@@ -156,6 +148,7 @@
 CSS_FILE  <- paste( TOOL_PATH, '/QDNAseq.css', sep="" )
 DECIMALS  <- 3
 WEB_LINK  <- 'http://www.bioconductor.org/packages/release/bioc/html/QDNAseq.html'
+PURE_CSS  <- 'http://yui.yahooapis.com/pure/0.5.0/pure-min.css'
 PAR_SET   <- list( pch=22 )
 
 catMsg( "Starting QDNAseq wrapper" )	
@@ -168,7 +161,7 @@
 config      <- cmdLineArgs[1]
 
 ## sourcing the config file will load all input params
-## many variables are created in sourced "config"
+## many variables are imported via sourced "config"
 source( config )
 
 PLOT_RES  <- min( PLOT_WIDTH, PLOT_HEIGHT ) / 6.3 # desparate try to make png text scale well, damn you R...!
@@ -176,9 +169,10 @@
 
 systemUser <- system("whoami",T)
 qdnaseqVersion <- packageDescription( "QDNAseq" )$Version
+rVersion <- R.version.string
 catMsg( c("Analysis run with user: ", systemUser ) )
 catMsg( c("QDNAseq version loaded: ", qdnaseqVersion) )
-catMsg( c( sessionInfo()$R.version$version.string ) )
+catMsg( c( rVersion ) )
 
 ## get the comma separated list of chromosomes to exclude
 excludeChrs <- unlist( strsplit( excludeChrsString, ",") )
@@ -192,11 +186,7 @@
 ## /DEBUG
 ## ------------------------
 
-## setup bam filelist for easy retrieval later
-fileList    <- makeBamFileList( bamsPaths, bamsNames )
-bamCount    <- length( fileList[[ 'all_paths' ]] )
-
-## help msg for running script separate still needs work!
+## help msg for running script without wrapper still needs work!
 #if ( length(cmdLineArgs) == 0 || cmdLineArgs[1] == "-h" || cmdLineArgs[1] == "--help"){ 
 #	catMsg( c( "Usage: ", paste(params_help,sep=",") ) )
 #	quit( save = 'no', status=0, runLast=F )
@@ -214,36 +204,40 @@
 ## construct output file-names and -paths
 ## ------------------------
 htmlOutputName <- 'index.html'
-gzipOutputName <- paste( 'QDNAseqResults_', outputName, '.zip', sep='' )
+gzipOutputName <- paste( 'QDNAseqResults_', binSize, 'kbp_', outputName, '.zip', sep='' )
 robjReadCoName <- paste( binSize, 'kbp_QDNAseqReadCounts.rds', sep='')
 robjCopyNrName <- paste( binSize, 'kbp_QDNAseqCopyNumbers.rds', sep='')
 robjCalledName <- paste( binSize, 'kbp_QDNAseqCalledSegments.rds', sep='')
 regiOutputName <- paste( binSize, 'kbp_QDNAseqRegions.rds', sep='')
-igvCalledName  <- paste( binSize, 'kbp_QDNAseq-calls.igv', sep='')
 igvCopyNrName  <- paste( binSize, 'kbp_QDNAseq-normalized.igv', sep='')
 noiseImgName   <- paste( 'QDNAseqNoisePlot.png',  sep='')
 freqImgName    <- paste( 'QDNAseqFrequencyPlot.png',  sep='')
 
-
 gzipOutputPath <- paste( outputPath, '/', gzipOutputName, sep='')
 htmlOutputPath <- paste( outputPath, '/', htmlOutputName, sep='')
 robjReadCoPath <- paste( outputPath, '/', robjReadCoName, sep='')
 robjCopyNrPath <- paste( outputPath, '/', robjCopyNrName, sep='')
 robjCalledPath <- paste( outputPath, '/', robjCalledName, sep='')
 robjRegionPath <- paste( outputPath, '/', regiOutputName, sep='')
-igvCalledPath  <- paste( outputPath, '/', igvCalledName, sep='')
 igvCopyNrPath  <- paste( outputPath, '/', igvCopyNrName, sep='')
 noiseImgPath   <- paste( outputPath, '/', noiseImgName, sep='' )
 freqImgPath    <- paste( outputPath, '/', freqImgName, sep='' )
 
+## setup bam filelist for easy retrieval later
+fileList    <- makeBamFileList( bamsPaths, bamsNames )
+bamCount    <- length( fileList[[ 'all_paths' ]] )
 
 ## ------------------------
 ## performing QDNAseq analysis steps
 ## ------------------------
-if ( debug ){
-	## in case of debug just use inbuilt LGG data for speedup
+if ( debug ){ ## in case of debug just use inbuilt LGG data for speedup
 	data(LGG150)
 	readCounts <- LGG150
+	binSize <- 15
+	bamsPaths  <- c( "BUILD_IN_DATA_LGG150")
+	bamsNames  <- c( "LGG150")
+	fileList   <- makeBamFileList( bamsPaths, bamsNames )
+	bamCount   <- length( fileList[[ 'all_paths' ]] )
 }else{
 	if ( nchar(binAnnotations) == 0 ){
 	binAnnotations <- getBinAnnotations( binSize=binSize, type=experimentType )
@@ -277,6 +271,9 @@
 if ( doOutputCopynumbersRds ){
 	saveRDS( copyNumbersSmooth, copyNumbersDatasetFile );
 }
+if ( doOutputCopynumbersIgv ){
+	saveRDS( copyNumbersSmooth, copyNumbersIgvDatasetFile );
+}
 
 ## proceed with calling if requested
 if ( doCall ){
@@ -288,8 +285,8 @@
 	if ( doOutputCallsRds ){
 		saveRDS( copyNumbersCalled, calledSegmentsDatasetFile );
 	}
-	exportBins( copyNumbersCalled, file=igvCalledPath, format="igv")
-	
+	## actual calls are not included in exportBins output, so not used now
+	#exportBins( copyNumbersCalled, file=igvCalledPath, format="igv")
 }
 
 
@@ -312,9 +309,7 @@
 	dev.off()	
 }
 
-
 for (i in 1:length(sampleNames) ){
-#for (sample in sampleNames(copyNumbersSmooth) ){
 	sample <- sampleNames[i]
 	usedReads  <- readCountsFiltered@phenoData@data$used.reads[i]
 	catMsg( c("Creating plots for sample: ", sample ) )	
@@ -339,7 +334,7 @@
 		plotted_images[[ sample ]][[ type ]] <- img_file
 
 		catMsg( c(" Fusing regions of sample: ", sample) )
-		regions[[ sample ]] <- fuse.regions_test( cgh[, sample] )
+		regions[[ sample ]] <- fuseRegions( cgh[, sample] )
 		region_count <- nrow( data.frame( regions[[ sample ]] ) )
 		catMsg( c( ' sample "', sample, '" has ', region_count, " regions" ) )
 		plotted_images[[ sample ]][[ 'region_count' ]] <- region_count
@@ -364,13 +359,12 @@
 ## ------------------------
 ## get filesizes for report
 ## ------------------------
-zippedSize <- paste( round( file.info( gzipOutputPath )[["size"]] / 1000000, digits=2 ), 'MB' )
-readCoSize <- paste( round( file.info( robjReadCoPath )[["size"]] / 1000000, digits=2 ), 'MB' )
-copyNrSize <- paste( round( file.info( robjCopyNrPath )[["size"]] / 1000000, digits=2 ), 'MB' )
-calledSize <- paste( round( file.info( robjCalledPath )[["size"]] / 1000000, digits=2 ), 'MB' )
-regionSize <- paste( round( file.info( robjRegionPath )[["size"]] / 1000000, digits=2 ), 'MB' )
-igvCopyNrSize <- paste( round( file.info( igvCopyNrPath )[["size"]] / 1000000, digits=2 ), 'MB' )
-igvCalledSize <- paste( round( file.info( igvCalledPath )[["size"]] / 1000000, digits=2 ), 'MB' )
+zippedSize    <- paste( round( file.info( gzipOutputPath )[["size"]] / 1e+6, digits=2 ), 'MB' )
+readCoSize    <- paste( round( file.info( robjReadCoPath )[["size"]] / 1e+6, digits=2 ), 'MB' )
+copyNrSize    <- paste( round( file.info( robjCopyNrPath )[["size"]] / 1e+6, digits=2 ), 'MB' )
+calledSize    <- paste( round( file.info( robjCalledPath )[["size"]] / 1e+6, digits=2 ), 'MB' )
+regionSize    <- paste( round( file.info( robjRegionPath )[["size"]] / 1e+6, digits=2 ), 'MB' )
+igvCopyNrSize <- paste( round( file.info( igvCopyNrPath  )[["size"]] / 1e+6, digits=2 ), 'MB' )
 
 ## ------------------------
 ## creating html output to be linked to from the middle galaxy pane
@@ -381,14 +375,14 @@
 		cat( "<html>\n")
 		cat( "<head>\n")
 			#cat( '<link rel="stylesheet" type="text/css" href="test.css" media="screen" />', "\n" )
-			cat( "\t", '<link rel="stylesheet" href="http://yui.yahooapis.com/pure/0.4.2/pure-min.css">', "\n" )
-			cat( "\t<style>\n")
-				## have to include CSS into html file, because css referencing outside own dir doesn't seem to work...
-				cat( paste( "\t\t", '/* the css here originates from ', CSS_FILE,' */', "\n") )
-				cat( paste( "\t\t", readLines( CSS_FILE, n = -1)), sep="\n" )
+			cat( "\t", '<title>QDNAseq Report | ', outputName,'</title>', "\n", sep='' )
+			cat( "\t", '<link rel="stylesheet" href="', PURE_CSS, '">', "\n", sep='' )
+			cat( "\t<style>\n", sep='')
+				## include CSS into html file, makes it more portable
+				cat( paste( "\t\t", '/* the css here originates from ', CSS_FILE,' */', "\n"), sep='' )
+				cat( "\t\t", readLines( CSS_FILE ), sep="\n\t\t" )
 				#cat( "\t\th1 {color:red;}", "\n")
-
-			cat( "\t</style>\n" )
+			cat( "\n\t</style>\n" )
 			
 		cat( "\n</head>\n")
 		cat( "\n<body>\n")
@@ -396,39 +390,41 @@
 		cat( "<h1>QDNAseq Report</h1>", "\n")
 		
 		cat( '<h3 class="qdnaseq">About this analysis</h3>', "\n")
-		cat( '<p>This page provides access to all results. To have a local copy of this report just download the <a href="', gzipOutputName, '" class="button-success button-small pure-button">zipfile</a> with all output (', zippedSize, ')</p>', "\n", sep='')		
+		cat( '<p>This page provides access to all results. To have a local copy of this report just download the <a href="', gzipOutputName, '" class="button">zipfile</a> with all output (', zippedSize, ')</p>', "\n", sep='')		
 		
 		## ------------------------
 		## table with general info
 		## ------------------------
 		cat( '<h3 class="qdnaseq">Settings</h3><p>', "\n")
-		cat( '<table class="pure-table pure-table-striped"><thead><tr><th>setting</th><th>value</th></tr></thead><tbody>' )
+		cat( '<table class="pure-table pure-table-striped"><tbody>' )
 			cat( htmlTableRow( c( "AnalysisName", outputName ) ) )
-			cat( htmlTableRow( c( "AnalysisDate", 'todo' ) ) )
+			cat( htmlTableRow( c( "AnalysisDate", as.character(Sys.time()) ) ) )
 			cat( htmlTableRow( c( "BinSize (kb)", binSize ) ) )
+			cat( htmlTableRow( c( "R info", rVersion ) ) )
+			cat( htmlTableRow( c( "QDNAseq info", qdnaseqVersion ) ) )
 			
+			sampleStrings <- c()
 			for ( galaxyName in fileList[[ 'all_files' ]] ){
 				sampleName <- fileList[[ galaxyName ]]
-				cat( htmlTableRow( c( "InputBam", paste( galaxyName, ' (', sampleName, ')', sep='' ) ) ) )
+				sampleStrings <- c( sampleStrings, paste( galaxyName, ' (', sampleName, ')', sep='' ) )
 			}
+			cat( htmlTableRow( c( "InputBams", paste( sampleStrings, collapse=", ") ) ) )
+
 		cat( "</tbody></table></p>", "\n")
 		
 		## ------------------------
 		## list with links to all output files
 		## ------------------------
-		r_code <- '<p>'
-		r_code <- paste( r_code, '<code class="code">## R code to load files</code><br />', sep="\n" )
-		r_code <- paste( r_code, '<code class="code">library( QDNAseq )</code><br />', sep="\n")
+
 
 		cat( '<h3 class="qdnaseq">Output files</h3><p>', "\n")
 		cat( '<dl>', "\n" )
             cat( '<dt>', htmlLink( path=robjReadCoName, robjReadCoName ), '</dt>', "\n" ) 
             cat( '<dd>QDNAseq object with read counts per bin, ', readCoSize,'</dd>', "\n" )
-            r_code <- paste( r_code, '<code class="code">readCounts <- readRDS(', robjReadCoName, ")</code><br />", sep="")
+            
 
             cat( '<dt>', htmlLink( path=robjCopyNrName, robjCopyNrName ), '</dt>', "\n" ) 
             cat( '<dd>QDNAseq object with copy numbers per bin, ', copyNrSize,'</dd>', "\n" )
-            r_code <- paste( r_code, '<code class="code">copyNumbersSmooth <- readRDS(', robjCopyNrName, ")</code><br />", sep="")
             
             cat( '<dt>', htmlLink( path=igvCopyNrName, igvCopyNrName ), '</dt>', "\n" ) 
             cat( '<dd>IGV formatted text file with copy numbers per bin, ', igvCopyNrSize,'</dd>', "\n" )
@@ -436,21 +432,20 @@
             if ( doCall ){
             	cat( '<dt>', htmlLink( path=robjCalledName, robjCalledName ), '</dt>', "\n" ) 
             	cat( '<dd>QDNAseq object with segment and call status per bin, ', calledSize,'</dd>', "\n" )
-            	r_code <- paste( r_code, '<code class="code">copyNumbersCalled <- readRDS(', robjCalledName, ")</code><br />", sep="")
 
             	cat( '<dt>', htmlLink( path=regiOutputName, regiOutputName ), '</dt>', "\n" ) 
             	cat( '<dd>list with segmented/called regions for each sample, ', regionSize, '</dd>', "\n" )
-            	r_code <- paste( r_code, '<code class="code">calledRegions <- readRDS(', regiOutputName, ")</code><br />", sep="")
-
-            	cat( '<dt>', htmlLink( path=igvCalledName, igvCalledName ), '</dt>', "\n" ) 
-            	cat( '<dd>IGV formatted text file with calls per bin , ', igvCalledSize,'</dd>', "\n" )
-
             }
         cat( '</dl></p>', "\n" )
 
+ 		r_code <- '## start own downstream analysis in R with:'
+ 		r_code <- c( r_code, 'library( QDNAseq )' )
+		r_code <- c( r_code, paste( 'readRDS( ', robjReadCoName, ' ) -> readCounts', sep="") )
+
         ## r_code shows example code on how to load output files
         #cat( r_code, "</p>\n", sep="\n")
         cat( '<p>See ', htmlLink( WEB_LINK, 'the bioconductor QDNAseq documentation' ), ' for more information on how to work with these files</p>', "\n", sep='' )
+        cat( '<pre>', paste( r_code, collapse="\n" ), '</pre>', "\n", sep='' )
 
 		## ------------------------
 		## table with links to files	
@@ -475,7 +470,6 @@
 				html_copy_img <- htmlLink( path=copy_img, paste('<img id="', copy_img,'" src="',copy_img,'" alt="',bam_file, '" width="', width, '" height="', height, '">', sep='') )
 				html_call_thumb <- 'NA'
 				html_call_img <- ''
-				html_bed <- 'NA'
 				html_bedGraph <- 'NA'
 				region_count <- 'NA'
 
@@ -484,14 +478,13 @@
 					region_count <- plotted_images[[ bam_file ]][[ 'region_count' ]]
 					html_call_thumb <- htmlLink( path=paste('#', call_img, sep=''), paste('<img src="', call_img, '" alt="', bam_file, '" width="', width_t,'" height="', height_t,'">', sep='') )
 
-					files <- printedFiles[[ bam_file ]]
-					html_bed <- htmlLink( path=files[1], 'bed' )	
-					html_bedGraph <- htmlLink( path=files[2], 'bedGraph' )	
+					## list structure with output files just is an array per BAM
+					html_bedGraph <- htmlLink( path=printedFiles[[ bam_file ]][1], 'bedGraph' )	
 					html_call_img <- htmlLink( path=call_img, paste('<img id="', call_img,'" src="', call_img,'" alt="', bam_file, '" width="', width, '" height="', height,'">', sep='') )
 				}
 
 				## add info to overview table, including small thumbnails
-				cat( htmlTableRow( c(bam_file, html_copy_thumb, html_call_thumb, usedReads, region_count, paste( html_bed, html_bedGraph, sep=", ")) ), "\n" )
+				cat( htmlTableRow( c(bam_file, html_copy_thumb, html_call_thumb, usedReads, region_count, paste( html_bedGraph, sep=", ")) ), "\n" )
 				## now include (large) images in html page
 				plots_html <- paste( plots_html, html_copy_img, "\n", html_call_img, "\n<hr \\>\n", sep='' )
 			}
@@ -510,11 +503,8 @@
 		)
 		## there only is a frequency plot when data is called
 		if ( !doCall ){ html_freq_img <- '' }
-		
-		extra_plots_html <- paste( 
-			html_noise_img, " \n", 
-			html_freq_img, "\n", sep='' 
-		)
+
+		extra_plots_html <- paste( html_noise_img, " \n", html_freq_img, "\n", sep='' )
 		
 		## ------------------------
 		## section with various output shown
@@ -533,7 +523,7 @@
 sink( file = outputHtml, type = "output" )
 		
 		cat( "<head>", "\n")
-			cat( "\t", '<link rel="stylesheet" href="http://yui.yahooapis.com/pure/0.4.2/pure-min.css">', "\n" )
+			cat( "\t", '<link rel="stylesheet" href="', PURE_CSS, '">', "\n", sep='' )
 
 			cat( "<style>", "\n")
 				## include CSS directly into html file
@@ -543,8 +533,8 @@
 		cat( "</head>", "\n")
 
 		cat( '<h1>QDNAseq Results (', outputName,')</h1>', "\n", sep="")
-		cat( '<p>Explore <a href="', htmlOutputName, '" class="button-success button-small pure-button">the results</a> directly within galaxy</p>', "\n", sep="")
-		cat( '<p>Or download a <a href="', gzipOutputName, '" class="button-success button-small pure-button">zipfile</a> with all output (', zippedSize, ')</p>', "\n", sep="" )
+		cat( '<p>Explore <a href="', htmlOutputName, '" class="button">the results</a> directly within galaxy</p>', "\n", sep="")
+		cat( '<p>Or download a <a href="', gzipOutputName, '" class="button">zipfile</a> with all output (', zippedSize, ')</p>', "\n", sep="" )
 
 		cat( '<p>The zip file contains all output files, including *.rds files allowing you to load the R copyNumber object(s) and perform further detailed analysis or create your own output for further processing. You can load the rds file with <code class="code">loadRDS(file.rds)</code></p>', "\n", sep="")