Mercurial > repos > mvdbeek > ruvseq
comparison ruvseq.R @ 0:958ed8091d7b draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ruvseq commit 545e858c8f600fa6e12a0a38546e155f22019dcb-dirty
| author | mvdbeek |
|---|---|
| date | Mon, 03 Sep 2018 01:18:09 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:958ed8091d7b |
|---|---|
| 1 # setup R error handling to go to stderr | |
| 2 library("getopt") | |
| 3 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | |
| 4 options(stringAsFactors = FALSE, useFancyQuotes = FALSE) | |
| 5 | |
| 6 setup_cmdline_options <- function() { | |
| 7 args <- commandArgs(trailingOnly = TRUE) | |
| 8 spec <- matrix(c( | |
| 9 "help", "h", 0, "logical", | |
| 10 "alpha", "a", 1, "double", | |
| 11 "min_mean_count", "min_c", 1, "double", | |
| 12 "min_k", "min_k", 1, "double", | |
| 13 "max_k", "max_k", 1, "double", | |
| 14 "sample_json", "s", 1, "character", | |
| 15 "plots" , "p", 1, "character", | |
| 16 "header", "H", 0, "logical", | |
| 17 "txtype", "y", 1, "character", | |
| 18 "tx2gene", "x", 1, "character"), # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF) | |
| 19 byrow=TRUE, ncol=4) | |
| 20 | |
| 21 opt <- getopt(spec) | |
| 22 # if help was asked for print a friendly message | |
| 23 # and exit with a non-zero error code | |
| 24 if (!is.null(opt$help)) { | |
| 25 cat(getopt(spec, usage=TRUE)) | |
| 26 q(status=1) | |
| 27 } else { | |
| 28 load_libraries() | |
| 29 } | |
| 30 return(opt) | |
| 31 } | |
| 32 | |
| 33 load_libraries <- function() { | |
| 34 # Allows displaying help without waiting for libraries to load | |
| 35 library("tools") | |
| 36 library("jsonlite") | |
| 37 library("reshape2") | |
| 38 library("RUVSeq") | |
| 39 library("RColorBrewer") | |
| 40 library("tximport") | |
| 41 library("DESeq2") | |
| 42 library("ggrepel") | |
| 43 } | |
| 44 | |
| 45 source_local <- function(fname){ | |
| 46 argv <- commandArgs(trailingOnly = FALSE) | |
| 47 base_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) | |
| 48 source(paste(base_dir, fname, sep="/")) | |
| 49 } | |
| 50 | |
| 51 # Source get_deseq_dataset.R for getting deseq dataset from htseq/featurecounts/tximport | |
| 52 source_local('get_deseq_dataset.R') | |
| 53 | |
| 54 # RUVseq function definitions | |
| 55 | |
| 56 plot_pca_rle <- function (set, title) { | |
| 57 x <- pData(set)[,1] | |
| 58 colors <- brewer.pal(3, "Set2") | |
| 59 label <- paste0(' for ', title) | |
| 60 plotRLE(set, outline=FALSE, ylim=c(-4, 4), col=colors[x]) | |
| 61 title(main=paste0("RLE", label)) | |
| 62 plotPCA(set, col=colors[x], cex=1.2) | |
| 63 title(main=paste0("PCA", label)) | |
| 64 } | |
| 65 | |
| 66 plot_factors_of_unwanted_variation <- function(set, method, k){ | |
| 67 pd <- pData(set) | |
| 68 pd['sample'] <- row.names(pd) | |
| 69 colnames(pd)[1] <- 'condition' | |
| 70 d = melt(pd, id.vars = c('sample', 'condition')) | |
| 71 d['x'] <- 1 # There is no information on the X, so we just fake it to be able to do a scatterplot | |
| 72 print(ggplot(d, aes(x=x, y=value, color=condition, label=sample)) + | |
| 73 geom_point() + | |
| 74 ggtitle(paste0('Factors of unwanted variation for method: ', method, ", k=", k)) + | |
| 75 facet_wrap( ~ variable, scales = "free_x") + | |
| 76 geom_text_repel() + | |
| 77 theme(axis.title.x=element_blank(), | |
| 78 axis.text.x=element_blank(), | |
| 79 axis.ticks.x=element_blank(), | |
| 80 plot.title = element_text(hjust = 0.5)) | |
| 81 ) | |
| 82 } | |
| 83 | |
| 84 create_seq_expression_set <- function (dds, min_mean_count) { | |
| 85 count_values <- counts(dds) | |
| 86 filter <- apply(count_values, 1, function(x) mean(x) > min_mean_count) | |
| 87 filtered <- count_values[filter,] | |
| 88 set = newSeqExpressionSet(as.matrix(count_values), | |
| 89 phenoData = data.frame(colData(dds)$condition, row.names=colnames(filtered))) | |
| 90 plot_pca_rle(set = set, title = 'raw data') | |
| 91 set <- betweenLaneNormalization(set, which="upper") | |
| 92 plot_pca_rle(set = set, title = 'upper quartile normalized') | |
| 93 return(set) | |
| 94 } | |
| 95 | |
| 96 get_empirical_control_genes <- function(set, cutoff_p) { | |
| 97 x <- pData(set)[,1] | |
| 98 design <- model.matrix(~x, data=pData(set)) | |
| 99 y <- DGEList(counts=counts(set), group=x) | |
| 100 y <- calcNormFactors(y, method="upperquartile") | |
| 101 y <- estimateGLMCommonDisp(y, design) | |
| 102 y <- estimateGLMTagwiseDisp(y, design) | |
| 103 fit <- glmFit(y, design) | |
| 104 lrt <- glmLRT(fit, coef=2) | |
| 105 top <- topTags(lrt, n=nrow(set))$table | |
| 106 top_rows <- rownames(top)[which(top$PValue > cutoff_p)] | |
| 107 empirical <- rownames(set)[which(!(rownames(set) %in% top_rows))] | |
| 108 return(empirical) | |
| 109 } | |
| 110 | |
| 111 ruv_control_gene_method <- function(set, k, control_genes='empirical', cutoff_p=0.2) { | |
| 112 if (control_genes == 'empirical') { | |
| 113 control_genes = get_empirical_control_genes(set, cutoff_p=cutoff_p) | |
| 114 } | |
| 115 set <- RUVg(set, control_genes, k=k) | |
| 116 plot_pca_rle(set, paste0("RUVg with empirical control genes, k=", k)) | |
| 117 plot_factors_of_unwanted_variation(set, method="RUVg with empirical control genes", k=k) | |
| 118 return(set) | |
| 119 } | |
| 120 | |
| 121 ruv_residual_method <- function(set, k) { | |
| 122 genes <- rownames(counts(set)) | |
| 123 x <- pData(set)[,1] | |
| 124 # Initial edger residuals | |
| 125 design <- model.matrix(~x, data=pData(set)) | |
| 126 y <- DGEList(counts=counts(set), group=x) | |
| 127 y <- calcNormFactors(y, method="upperquartile") | |
| 128 y <- estimateGLMCommonDisp(y, design) | |
| 129 y <- estimateGLMTagwiseDisp(y, design) | |
| 130 fit <- glmFit(y, design) | |
| 131 res <- residuals(fit, type="deviance") | |
| 132 set <- RUVr(set, genes, k=k, res) | |
| 133 plot_pca_rle(set = set, title = paste0('RUVr using residuals, k=', k)) | |
| 134 plot_factors_of_unwanted_variation(set, method="RUVr using residuals", k=k) | |
| 135 return(set) | |
| 136 } | |
| 137 | |
| 138 ruv_replicate_method <- function (set, k) { | |
| 139 genes <- rownames(counts(set)) | |
| 140 x <- pData(set)[,1] | |
| 141 differences <- makeGroups(x) | |
| 142 set <- RUVs(set, genes, k=k, differences) | |
| 143 plot_pca_rle(set, paste0('RUVs with replicate samples, k=', k)) | |
| 144 plot_factors_of_unwanted_variation(set, method="RUVs using replicates", k=k) | |
| 145 return(set) | |
| 146 } | |
| 147 | |
| 148 get_differentially_expressed_genes <- function(dds, contrast, alpha=0.01) { | |
| 149 r <- results(dds, contrast=contrast, alpha=alpha) | |
| 150 return(rownames(r[which(r$padj < alpha),])) | |
| 151 } | |
| 152 | |
| 153 opt <- setup_cmdline_options() | |
| 154 alpha <- opt$alpha | |
| 155 min_k <- opt$min_k | |
| 156 max_k <- opt$max_k | |
| 157 sample_json <- fromJSON(opt$sample_json) | |
| 158 sample_paths <- sample_json$path | |
| 159 sample_names <- sample_json$label | |
| 160 condition <- as.factor(sample_json$condition) | |
| 161 sampleTable <- data.frame(samplename=sample_names, | |
| 162 filename = sample_paths, | |
| 163 condition=condition) | |
| 164 rownames(sampleTable) <- sample_names | |
| 165 | |
| 166 dds <- get_deseq_dataset(sampleTable, header=opt$header, designFormula= ~ condition, tximport=opt$txtype, txtype=opt$txtype, tx2gene=opt$tx2gene) | |
| 167 if (!is.null(opt$plots)) { | |
| 168 pdf(opt$plots) | |
| 169 } | |
| 170 | |
| 171 # Run through the ruvseq variants | |
| 172 set <- create_seq_expression_set(dds, min_mean_count = opt$min_mean_count) | |
| 173 result <- list(no_correction = set) | |
| 174 for (k in seq(min_k, max_k)) { | |
| 175 result[[paste0('residual_method_k', k)]] <- ruv_residual_method(set, k=k) | |
| 176 result[[paste0('replicate_method_k', k)]] <- ruv_replicate_method(set, k=k) | |
| 177 result[[paste0('control_method_k', k)]] <- ruv_control_gene_method(set, k=k, cutoff_p=0.5) | |
| 178 } | |
| 179 | |
| 180 for (name in names(result)) { | |
| 181 if (!startsWith(name, "no_correction")) { | |
| 182 set <- result[[name]] | |
| 183 unwanted_variation <- pData(set) | |
| 184 df <- data.frame(identifier = rownames(unwanted_variation)) | |
| 185 df <- cbind(df, unwanted_variation) | |
| 186 colnames(df)[2] <- 'condition' | |
| 187 write.table(df, file=paste0("batch_effects_", name, ".tabular"), sep="\t", quote=F, row.names=F) | |
| 188 } | |
| 189 } | |
| 190 | |
| 191 # close the plot device | |
| 192 if (!is.null(opt$plots)) { | |
| 193 cat("closing plot device\n") | |
| 194 dev.off() | |
| 195 } | |
| 196 | |
| 197 cat("Session information:\n\n") | |
| 198 sessionInfo() |
