Mercurial > repos > artbio > gsc_signature_score
view signature_score.R @ 0:bf99be04d098 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_signature_score commit 70c7abb20f76651e61325e3aa90809f6874f8b85
author | artbio |
---|---|
date | Mon, 24 Jun 2019 07:22:06 -0400 |
parents | |
children | 11ff7bd04308 |
line wrap: on
line source
######################### # Signature score # ######################### # Compute the signature score based on the geometric mean of the target gene expression # and split cells in 2 groups (high/low) using this signature score. # Example of command # Rscript 4-signature_score.R --input <input.tsv> --genes ARNT2,SALL2,SOX9,OLIG2,POU3F2 # --output <output.tab> --pdf <output.pdf> # load packages that are provided in the conda env options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") warnings() library(optparse) library(psych) library(ggplot2) library(gridExtra) # Arguments option_list = list( make_option( "--input", default = NA, type = 'character', help = "Input file that contains log2(CPM +1) values" ), make_option( "--sep", default = '\t', type = 'character', help = "File separator [default : '%default' ]" ), make_option( "--colnames", default = TRUE, type = 'logical', help = "Consider first line as header ? [default : '%default' ]" ), make_option( "--genes", default = NA, type = 'character', help = "List of genes comma separated" ), make_option( "--percentile_threshold", default = 20, type = 'integer', help = "detection threshold to keep a gene in signature set [default : '%default' ]" ), make_option( "--output", default = "./output.tab", type = 'character', help = "Output path [default : '%default' ]" ), make_option( "--stats", default = "./statistics.tab", type = 'character', help = "statistics path [default : '%default' ]" ), make_option( "--pdf", default = "~/output.pdf", type = 'character', help = "pdf path [default : '%default' ]" ) ) opt = parse_args(OptionParser(option_list = option_list), args = commandArgs(trailingOnly = TRUE)) if (opt$sep == "tab") {opt$sep = "\t"} if (opt$sep == "comma") {opt$sep = ","} # Take input data data.counts <- read.table( opt$input, h = opt$colnames, row.names = 1, sep = opt$sep, check.names = F ) # Get vector of target genes genes <- unlist(strsplit(opt$genes, ",")) if (length(unique(genes %in% rownames(data.counts))) == 1) { if (unique(genes %in% rownames(data.counts)) == F) stop("None of these genes are in your dataset: ", opt$genes) } logical_genes <- rownames(data.counts) %in% genes # Retrieve target genes in counts data signature.counts <- subset(data.counts, logical_genes) ## Descriptive Statistics Function descriptive_stats = function(InputData) { SummaryData = data.frame( mean = rowMeans(InputData), SD = apply(InputData, 1, sd), Variance = apply(InputData, 1, var), Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { (sum(x != 0) / ncol(y)) * 100 }) ) return(SummaryData) } signature_stats <- descriptive_stats(signature.counts) # Find poorly detected genes from the signature kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold # Add warnings if (length(unique(kept_genes)) > 1) { cat( "WARNINGS ! Following genes were removed from further analysis due to low gene expression :", paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "), "\n" ) } else { if (unique(kept_genes) == F) { stop( "None of these genes are detected in ", opt$percent, "% of your cells: ", paste(rownames(signature_stats), collapse = ", "), ". You can be less stringent thanks to --percent parameter." ) } } # Remove genes poorly detected in the dataset signature.counts <- signature.counts[kept_genes,] # Replace 0 by 1 counts signature.counts[signature.counts == 0] <- 1 # Geometric mean by cell score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych # Add results in signature_output signature_output <- data.frame( cell = names(score), score = score, rate = ifelse(score > mean(score), "HIGH", "LOW"), nGenes = colSums(data.counts != 0), total_counts = colSums(data.counts) ) # statistics of input genes, signature genes first lines statistics.counts <- rbind(subset(data.counts, logical_genes), subset(data.counts, !logical_genes)) statistics <- descriptive_stats(statistics.counts) statistics <- cbind(gene=rownames(statistics), statistics) # Re-arrange score matrix for plots score <- data.frame(score = score, order = rank(score, ties.method = "first"), signature = signature_output$rate, stringsAsFactors = F) pdf(file = opt$pdf) ggplot(score, aes(x = order, y = score)) + geom_line() + geom_segment(x = 0, xend = max(score$order[score$signature == "LOW"]), y = mean(score$score), yend = mean(score$score)) + geom_area(aes(fill = signature), alpha = .7) + scale_fill_manual(values=c("#ff0000", "#08661e")) + geom_text(aes(x = 1, y = mean(score)), label = "Mean", vjust = -0.3, colour = "black") + labs(title = "Ordered cell signature scores", x = "Cell index", y = "Score") density_score <- density(score$score) ggplot(data.frame(density_score[1:2]), aes(x, y, fill = ifelse(x < mean(score$score), "LOW", "HIGH"))) + geom_line() + geom_vline(xintercept = mean(score$score)) + geom_text(x = mean(score$score), y = max(density_score$y), label = "Mean", hjust = -0.3, colour = "black") + geom_area(alpha = .7) + scale_fill_manual(values=c("#ff0000", "#08661e")) + ylim(0, max(density_score$y)) + labs( title = "Distribution of Cell signature scores", x = paste("N =", density_score$n, "Bandwidth =", density_score$bw), y = "Density", fill = "Signature" ) # Check score independant of low expression p_gene <- ggplot(signature_output, aes(rate, nGenes)) + geom_violin(aes(fill = rate), alpha = .5, trim = F, show.legend = F) + scale_fill_manual(values=c("#ff0000", "#08661e")) + geom_jitter() + labs(y = "Number of detected genes", x = "Signature") p_counts <- ggplot(signature_output, aes(rate, total_counts)) + geom_violin(aes(fill = rate), alpha = .5, trim = F, show.legend = F) + scale_fill_manual(values=c("#ff0000", "#08661e")) + geom_jitter() + labs(y = "Total counts", x = "Signature") grid.arrange(p_gene, p_counts, ncol = 2, top = "Influence of library sequencing depth on cell signature scores") dev.off() # Save file write.table( signature_output, opt$output, sep = "\t", quote = F, col.names = T, row.names = F ) write.table( statistics, opt$stats, sep = "\t", quote = F, col.names = T, row.names = F )