Mercurial > repos > shians > shrnaseq
changeset 1:cad1cc829e2e
- test commit
author | shian_su <registertonysu@gmail.com> |
---|---|
date | Tue, 18 Mar 2014 12:45:36 +1100 |
parents | 8848246b5612 |
children | 02d484876935 |
files | hairpinTool.R hairpinTool.xml |
diffstat | 2 files changed, 188 insertions(+), 50 deletions(-) [+] |
line wrap: on
line diff
--- a/hairpinTool.R Fri Feb 21 12:49:45 2014 +1100 +++ b/hairpinTool.R Tue Mar 18 12:45:36 2014 +1100 @@ -1,3 +1,51 @@ +# ARGS: 1.inputType -String specifying format of input (fastq or table) +# IF inputType is "fastQ": +# 2*.fastqPath -One or more strings specifying path to fastq files +# 2.annoPath -String specifying path to hairpin annotation table +# 3.samplePath -String specifying path to sample annotation table +# 4.barStart -Integer specifying starting position of barcode +# 5.barEnd -Integer specifying ending position of barcode +# 6.hpStart -Integer specifying startins position of hairpin +# unique region +# 7.hpEnd -Integer specifying ending position of hairpin +# unique region +# ### +# IF inputType is "counts": +# 2.countPath -String specifying path to count table +# 3.annoPath -String specifying path to hairpin annotation table +# 4.samplePath -String specifying path to sample annotation table +# ### +# 8.cpmReq -Float specifying cpm requirement +# 9.sampleReq -Integer specifying cpm requirement +# 10.fdrThresh -Float specifying the FDR requirement +# 11.lfcThresh -Float specifying the log-fold-change requirement +# 12.workMode -String specifying exact test or GLM usage +# 13.htmlPath -String specifying path to HTML file +# 14.folderPath -STring specifying path to folder for output +# IF workMode is "classic" (exact test) +# 15.pairData[2] -String specifying first group for exact test +# 16.pairData[1] -String specifying second group for exact test +# ### +# IF workMode is "glm" +# 15.contrastData -String specifying contrasts to be made +# 16.roastOpt -String specifying usage of gene-wise tests +# 17.hairpinReq -String specifying hairpin requirement for gene- +# wise test +# 18.selectOpt -String specifying type of selection for barcode +# plots +# 19.selectVals -String specifying members selected for barcode +# plots +# +# OUT: Bar Plot of Counts Per Index +# Bar Plot of Counts Per Hairpin +# MDS Plot +# Smear Plot +# Barcode Plots (If Genewise testing was selected) +# Top Expression Table +# HTML file linking to the ouputs +# +# Author: Shian Su - registertonysu@gmail.com - Jan 2014 + # Record starting time timeStart <- as.character(Sys.time()) @@ -115,7 +163,7 @@ fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], fixed=TRUE)) argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] - hairpinPath <- as.character(argv[2]) + annoPath <- as.character(argv[2]) samplePath <- as.character(argv[3]) barStart <- as.numeric(argv[4]) barEnd <- as.numeric(argv[5]) @@ -147,14 +195,13 @@ } # Read in inputs -if (inputType=="fastq") { - samples <- read.table(samplePath, header=TRUE, sep="\t") - hairpins <- read.table(hairpinPath, header=TRUE, sep="\t") -} else if (inputType=="counts") { - samples <- read.table(samplePath, header=TRUE, sep="\t") + +samples <- read.table(samplePath, header=TRUE, sep="\t") +anno <- read.table(annoPath, header=TRUE, sep="\t") +if (inputType=="counts") { counts <- read.table(countPath, header=TRUE, sep="\t") - anno <- read.table(annoPath, header=TRUE, sep="\t") } + ###################### Check inputs for correctness ############################ samples$ID <- make.names(samples$ID) @@ -166,32 +213,41 @@ tab <- table(samples$ID) offenders <- paste(names(tab[tab>1]), collapse=", ") offenders <- unmake.names(offenders) - stop("ID column of sample annotation must have unique values, values ", + stop("'ID' column of sample annotation must have unique values, values ", offenders, " are repeated") } # Check that IDs in sample annotation are unique -if (any(is.na(match(samples$ID, colnames(counts))))) { - stop("not all samples have groups specified") -} # Check that a group has be specifed for each sample - if (inputType=="fastq") { - - if (any(table(hairpins$ID)>1)){ - tab <- table(hairpins$ID) + + if (any(table(anno$ID)>1)){ + tab <- table(anno$ID) offenders <- paste(names(tab[tab>1]), collapse=", ") - stop("ID column of hairpin annotation must have unique values, values ", + stop("'ID' column of hairpin annotation must have unique values, values ", offenders, " are repeated") } # Check that IDs in hairpin annotation are unique - + } else if (inputType=="counts") { + if (any(is.na(match(samples$ID, colnames(counts))))) { + stop("not all samples have groups specified") + } # Check that a group has be specifed for each sample if (any(table(counts$ID)>1)){ tab <- table(counts$ID) offenders <- paste(names(tab[tab>1]), collapse=", ") - stop("ID column of count table must have unique values, values ", + stop("'ID' column of count table must have unique values, values ", offenders, " are repeated") } # Check that IDs in count table are unique } +if (workMode=="glm") { + if (roastOpt == "yes") { + if (is.na(match("Gene", colnames(anno)))) { + tempStr <- paste("Gene-wise tests selected but'Gene' column not", + "specified in hairpin annotation file") + stop(tempStr) + } + } +} + ################################################################################ # Process arguments @@ -277,23 +333,22 @@ } if (inputType=="fastq") { -# Use EdgeR hairpin process and capture outputs -hpReadout <- capture.output( - data <- processHairpinReads(fastqPath, samplePath, hairpinPath, + # Use EdgeR hairpin process and capture outputs + hpReadout <- capture.output( + data <- processHairpinReads(fastqPath, samplePath, annoPath, hairpinStart=hpStart, hairpinEnd=hpEnd, verbose=TRUE) -) - -# Remove function output entries that show processing data or is empty -hpReadout <- hpReadout[hpReadout!=""] -hpReadout <- hpReadout[!grepl("Processing", hpReadout)] -hpReadout <- hpReadout[!grepl("in file", hpReadout)] -hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) - - -# Make the names of groups syntactically valid (replace spaces with periods) -data$samples$group <- make.names(data$samples$group) -} else { + ) + + # Remove function output entries that show processing data or is empty + hpReadout <- hpReadout[hpReadout!=""] + hpReadout <- hpReadout[!grepl("Processing", hpReadout)] + hpReadout <- hpReadout[!grepl("in file", hpReadout)] + hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) + + # Make the names of groups syntactically valid (replace spaces with periods) + data$samples$group <- make.names(data$samples$group) +} else if (inputType=="counts") { # Process counts information, set ID column to be row names rownames(counts) <- counts$ID counts <- counts[ , !(colnames(counts)=="ID")] @@ -319,14 +374,17 @@ # Create DGEList data <- DGEList(counts=counts, lib.size=colSums(counts), norm.factors=rep(1,ncol(counts)), genes=anno, group=factors) - + # Make the names of groups syntactically valid (replace spaces with periods) data$samples$group <- make.names(data$samples$group) } # Filter hairpins with low counts +preFilterCount <- nrow(data) sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq data <- data[sel, ] +postFilterCount <- nrow(data) +filteredCount <- preFilterCount-postFilterCount # Estimate dispersions data <- estimateDisp(data) @@ -541,8 +599,10 @@ } } -# Record ending time +# Record ending time and calculate total run time timeEnd <- as.character(Sys.time()) +timeTaken <- capture.output(round(difftime(timeEnd,timeStart), digits=3)) +timeTaken <- gsub("Time difference of ", "", timeTaken, fixed=TRUE) ################################################################################ ### HTML Generation ################################################################################ @@ -620,12 +680,79 @@ cata("or other spreadsheet programs</p>\n") cata("<table border=\"0\">\n") +cata("<h4>Additional Information:</h4>\n") + +if (inputType == "fastq") { + ListItem("Data was gathered from fastq raw read file(s).") +} else if (inputType == "counts") { + ListItem("Data was gathered from a table of counts.") +} + +if (cpmReq!=0 && sampleReq!=0) { + tempStr <- paste("Hairpins that do not have more than", cpmReq, + "CPM in at least", sampleReq, "samples are considered", + "insignificant and filtered out.") + ListItem(tempStr) + filterProp <- round(filteredCount/preFilterCount*100, digits=2) + tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp, + "%) hairpins were filtered out for low count-per-million.") + ListItem(tempStr) +} + +if (workMode == "classic") { + ListItem("An exact test was performed on each hairpin.") +} else if (workMode == "glm") { + ListItem("A generalised linear model was fitted to each hairpin.") +} + + + +cit <- character() +link <-character() +link[1] <- paste0("<a href=\"", + "http://www.bioconductor.org/packages/release/bioc/", + "vignettes/limma/inst/doc/usersguide.pdf", + "\">", "limma User's Guide", "</a>.") +link[2] <- paste0("<a href=\"", + "http://www.bioconductor.org/packages/release/bioc/", + "vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf", + "\">", "edgeR User's Guide", "</a>") + +cit[1] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010).", + "edgeR: a Bioconductor package for differential", + "expression analysis of digital gene expression", + "data. Bioinformatics 26, 139-140") +cit[2] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests", + "for assessing differences in tag abundance. Bioinformatics", + "23, 2881-2887") +cit[3] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of", + "negative binomial dispersion, with applications to SAGE data.", + "Biostatistics, 9, 321-332") + +cit[4] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential", + "expression analysis of multifactor RNA-Seq experiments with", + "respect to biological variation. Nucleic Acids Research 40,", + "4288-4297") + +cata("<h4>Citations</h4>") +cata("<ol>\n") +ListItem(cit[1]) +ListItem(cit[2]) +ListItem(cit[3]) +ListItem(cit[4]) +cata("</ol>\n") + +cata("<table border=\"0\">\n") cata("<tr>\n") TableItem("Task started at:"); TableItem(timeStart) cata("</tr>\n") cata("<tr>\n") TableItem("Task ended at:"); TableItem(timeEnd) cata("</tr>\n") +cata("<tr>\n") +TableItem("Task run time:"); TableItem(timeTaken) +cata("<tr>\n") +cata("</table>\n") cata("</body>\n") cata("</html>")
--- a/hairpinTool.xml Fri Feb 21 12:49:45 2014 +1100 +++ b/hairpinTool.xml Tue Mar 18 12:45:36 2014 +1100 @@ -1,4 +1,4 @@ -<tool id="shRNAseq" name="shRNAseq Tool" version="1.0.5"> +<tool id="shRNAseq" name="shRNAseq Tool" version="1.0.6"> <description> Analyse hairpin differential representation using edgeR </description> @@ -13,8 +13,8 @@ </stdio> <command interpreter="Rscript"> - hairpinTool.R $inputOpt.type - #if $inputOpt.type=="fastq": + hairpinTool.R $inputOpt.inputType + #if $inputOpt.inputType=="fastq": #for $i, $fas in enumerate($inputOpt.fastq): fastq::$fas.file #end for @@ -22,7 +22,7 @@ $inputOpt.hairpin $inputOpt.samples - #if $inputOpt.positions.option=="yes": + #if $inputOpt.positions.posOption=="yes": $inputOpt.positions.barstart $inputOpt.positions.barend $inputOpt.positions.hpstart @@ -36,11 +36,11 @@ #else: $inputOpt.counts $inputOpt.anno - "$inputOpt.factors" + $inputOpt.factors 0 0 0 #end if - #if $filterCPM.option=="yes": + #if $filterCPM.filtOption=="yes": $filterCPM.cpmReq $filterCPM.sampleReq #else: @@ -59,10 +59,10 @@ "$workMode.pair2" #else: "$workMode.contrast" - $workMode.roast.option - #if $workMode.roast.option=="yes": + $workMode.roast.roastOption + #if $workMode.roast.roastOption=="yes": $workMode.roast.hairpinReq - $workMode.roast.select.option + $workMode.roast.select.selOption "$workMode.roast.select.selection" #else: 0 @@ -74,7 +74,7 @@ <inputs> <conditional name="inputOpt"> - <param name="type" type="select" label="Input File Type"> + <param name="inputType" type="select" label="Input File Type"> <option value="fastq">FastQ File</option> <option value="counts">Table of Counts</option> </param> @@ -92,7 +92,7 @@ </repeat> <conditional name="positions"> - <param name="option" type="select" + <param name="posOption" type="select" label="Specify Barcode and Hairpin Locations?" help="Default Positions: Barcode: 1 to 5, Hairpin: 37 to 57."> <option value="no" selected="True">No</option> @@ -126,7 +126,7 @@ </conditional> <conditional name="filterCPM"> - <param name="option" type="select" label="Filter Low CPM?" + <param name="filtOption" type="select" label="Filter Low CPM?" help="Ignore hairpins with very low representation when performing analysis."> <option value="yes">Yes</option> @@ -172,7 +172,7 @@ expression."/> <conditional name="roast"> - <param name="option" type="select" + <param name="roastOption" type="select" label="Perform Gene Level Analysis?" help="Analyse LogFC tendencies for hairpins belonging to the same gene."> @@ -187,7 +187,7 @@ be analysed."/> <conditional name="select"> - <param name="option" type="select" + <param name="selOption" type="select" label="Gene Selection Method"> <option value="rank">By p-value Rank</option> <option value="geneID">By Gene Identifier</option> @@ -232,7 +232,18 @@ <outputs> <data format="html" name="outFile" label="shRNAseq Analysis"/> </outputs> - + <!-- Tests under construction + <tests> + <test> + <param name="counts" value="zuber_count.txt"/> + <param name="anno" value="zuber_anno.txt"/> + <param name="factors" value="zuber_samples.txt"/> + <param name="fdr" value="0.05"/> + <output name="output" file="shRNAseq_Analysis_html" lines_diff="50"/> + </test> + + </tests> + --> <help> .. class:: infomark