Mercurial > repos > yhoogstrate > edger_with_design_matrix
changeset 26:8b7bd6e290c4 draft
Deleted selected files
author | yhoogstrate |
---|---|
date | Tue, 20 May 2014 05:26:51 -0400 |
parents | 7cb518091b18 |
children | c6463da87893 |
files | edgeR_DGE.xml edgeR_Design_Matrix.listing.py edgeR_Design_Matrix.py edgeR_Design_Matrix.xml |
diffstat | 4 files changed, 0 insertions(+), 491 deletions(-) [+] |
line wrap: on
line diff
--- a/edgeR_DGE.xml Tue May 20 05:26:34 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,212 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<tool id="edger_dge" name="edgeR Differential GeneExpression Analysis"> - <description>RNA-Seq expression analysis using edgeR (R package)</description> - - <command> - <!-- - The following script is written in the "Cheetah" language: - http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html - --> - - R --vanilla --slave -f $R_script '--args - $design_matrix - $contrast - - $output_count_edgeR - $output_cpm - output_FPXM - $output_raw_counts - - $qc - $output_MDSplot - $output_BCVplot - $output_MAplot - smearPlot ' - > $output_R - 2> stderr.txt - && - grep -v 'Calculating library sizes from column' stderr.txt 1>&2 - - </command> - - <inputs> - <param name="design_matrix" type="data" format="tabular" help="Design matrix" /> - - <param name="contrast" type="text" label="Contrast (biological question)" help="e.g. 'tumor-normal' or '(G1+G2)/2-G3' using the factors chosen in the design matrix. Read the 'makeContrasts' manual from Limma package for more info." /> - - <param name="qc" type="select" label="Quality control reports"> - <option value="true" selected="true">Yes</option> - <option value="false">No</option> - </param> - - <param name="debug" type="select" label="R Debug output"> - <option value="true"> Yes</option> - <option value="false" selected="true">No</option> - </param> - </inputs> - - <configfiles> - <configfile name="R_script"> -library(edgeR,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping for loading limma - -## Fetch commandline arguments -args <- commandArgs(trailingOnly = TRUE) -designmatrix = args[1] -contrast = args[2] - -output_1 = args[3] -output_2 = args[4] -output_3 = args[5] ##FPKM file - yet to be implemented -output_4 = args[6] - -QC = nchar(args[7]) > 0 - -output_5 = args[8] -output_6 = args[9] -output_7 = args[10] - -output_8 = args[11] - - -library(edgeR) -raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T) - -## Obtain read-counts - -header = read.delim(as.character(raw_data[1,1]),header=F,stringsAsFactors=F,row.names=1,nrows=1) -has_header = (class(header[1,1]) == "character") - -read_counts = read.delim(as.character(raw_data[1,1]),header=has_header,stringsAsFactors=F,row.names=1)[1] - -for(i in 2:length(raw_data[,1])) { - write("parsing counts from:",stdout()) - write(raw_data[i,1],stdout()) - - header = read.delim(as.character(raw_data[i,1]),header=F,stringsAsFactors=F,row.names=1,nrows=1) - has_header = (class(header[1,1]) == "character") - table = read.delim(as.character(raw_data[i,1]),header=has_header,stringsAsFactors=F,row.names=1)[1] - - read_counts = cbind(read_counts,table) -} - -colnames(read_counts) = as.character(raw_data[,2]) - - - -## Filter for HTSeq predifined counts: -exclude_HTSeq = c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique") -exclude_DEXSeq = c("_ambiguous","_empty","_lowaqual","_notaligned") - -exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts)) -exclude = exclude[is.na(exclude)==0] -if(length(exclude) != 0) { - read_counts = read_counts[-exclude,] -} - - - -empty_samples = apply(read_counts,2,function(x) sum(x) == 0) -if(sum(empty_samples) > 0) { - write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr()) - write(colnames(read_counts)[empty_samples],stderr()) -} else { - dge = DGEList(counts=read_counts,genes=rownames(read_counts)) - - design_tmp <- raw_data[3:length(raw_data)] - rownames(design_tmp) <- colnames(dge) - formula = paste(c("~0",colnames(design_tmp)),collapse = " + ") - design <- model.matrix(as.formula(formula),design_tmp) - - prefixes = colnames(design_tmp)[attr(design,"assign")] - avoid = nchar(prefixes) == nchar(colnames(design)) - replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design))) - replacements[avoid] = colnames(design)[avoid] - colnames(design) = replacements - - - - write("Calculating normalization factors...",stdout()) - dge = calcNormFactors(dge) - write("Estimating common dispersion...",stdout()) - dge = estimateGLMCommonDisp(dge,design) - write("Estimating trended dispersion...",stdout()) - dge = estimateGLMTrendedDisp(dge,design) - write("Estimating tagwise dispersion...",stdout()) - dge = estimateGLMTagwiseDisp(dge,design) - - - - - if(QC == TRUE) { - write("Creating QC plots...",stdout()) - #### MDS Plot - pdf(output_5) - plotMDS(dge, main="edgeR MDS Plot") - dev.off() - #### Biological coefficient of variation plot - pdf(output_6) - plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance") - dev.off() - } - - - - write("Fitting GLM...",stdout()) - fit = glmFit(dge,design) - - write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout()) - cont <- c(contrast) - cont <- makeContrasts(contrasts=cont, levels=design) - - lrt <- glmLRT(fit, contrast=cont[,1]) - write(paste("Exporting to file: ",output_1,sep=""),stdout()) - write.table(file=output_1,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=T) - write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t") - ## todo EXPORT FPKM - write.table(file=output_4,dge\$counts,sep="\t") - - - - if(QC == TRUE) { - write("Creating MA plots...",stdout()) - - etable <- topTags(lrt, n=nrow(dge))\$table - etable <- etable[order(etable\$FDR), ] - pdf(output_7) - with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance")) - with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red")) - abline(h=c(-1,1), col="blue") - dev.off() - } - write("Done!",stdout()) -} - </configfile> - </configfiles> - - <outputs> - <data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - table" /> - <data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" /> - <data format="tabular" name="output_raw_counts" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts" /> - - <data format="txt" name="output_R" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R output" > - <filter>(debug == "true")</filter> - </data> - - <data format="pdf" name="output_MDSplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot"> - <filter>(qc == "true")</filter> - </data> - - <data format="pdf" name="output_BCVplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - BCV-plot"> - <filter>(qc == "true")</filter> - </data> - - <data format="pdf" name="output_MAplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MA-plot"> - <filter>(qc == "true")</filter> - </data> - </outputs> - - <help> - input: Design matrix using "create Design matrix" tool - input: contrast - </help> -</tool>
--- a/edgeR_Design_Matrix.listing.py Tue May 20 05:26:34 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ - -def listfiles(unpaired_samples,paired_samples): - file_list = [] - - if(unpaired_samples != None): - if(type(unpaired_samples) == type([])): - for sample in unpaired_samples: - file_list.append([sample.name,sample.file_name,False]) - - ## The following takes care of a bug in galaxy; - ## if only one history object is selected, it isn't returned as a list but as single object. - ## This is probably a rudimentair part of code that originates from the time that multiple data-objects were not implemented. - - else: - file_list.append([unpaired_samples.name,unpaired_samples.file_name,False]) - - if(paired_samples != None): - pair_id = 0 - for pair in paired_samples: - pair_id += 1 - sample_id = 0 - for sample in pair['samples']: - sample_id += 1 - file_list.append([sample['sample'].name+" *[Pair "+str(pair_id)+"; sample "+str(sample_id)+"]",sample['sample'].file_name,False]) - - return file_list
--- a/edgeR_Design_Matrix.py Tue May 20 05:26:34 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,178 +0,0 @@ -#!/usr/bin/env python -import argparse, os, shutil, sys, tempfile, subprocess - - -class sampleContainer: - def __init__(self): - self.samples = [] - self.pairs = [] - self.pair_index = {} - self.treatments = {} - self.treatment_types = {} - self.names = {} - - def add_sample(self,sample): - if(sample in self.samples): - sys.stderr.write("Error:\n* Sample has been added multiple times: "+sample+"\n") - sys.exit(1) - else: - self.samples.append(sample) - print " - Added: "+sample - - def add_pair(self,pair): - print " - Adding pair:" - for sample in pair: - self.add_sample(sample) - self.pairs.append(pair) - pair_id = len(self.pairs) - - for sample in pair: - self.pair_index[sample] = pair_id - - def add_treatment_argument(self,treatment_argument,name=None): - print " - Parsing treatment" - - treatment = {"factor_index":{},"sample_index":{}} - only_integers = True - - i = 1 - for item in treatment_argument: - if(i % 2): - factor = item - if(treatment['factor_index'].has_key(factor)): - sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n") - else: - print " - Adding factor: "+factor - treatment["factor_index"][factor] = [] - if(not factor.isdigit()): - only_integers = False - else: - for sample in item.split(","): - treatment["factor_index"][factor].append(sample) - if(treatment["sample_index"].has_key(sample)): - sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n") - else: - treatment["sample_index"][sample] = factor - i += 1 - - treatment_factors = sorted(treatment["factor_index"].keys()) - - if(name == None): - treatment["name"] = "_vs_".join(treatment_factors) - else: - treatment["name"] = str(name) - - if(len(treatment["sample_index"]) != len(self.samples)): - sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n") - - if(only_integers): - treatment_type = "integer" - else: - treatment_type = "string" - - if(self.treatments.has_key(treatment["name"])): - sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n") - else: - self.treatments[treatment["name"]] = treatment - self.treatment_types[treatment["name"]] = treatment_type - print " - Treatment of type: "+treatment_type+" is valid: "+treatment["name"] - - def add_names_argument(self,names_argument): - for sample in names_argument: - self.add_name(sample) - - def add_name(self,argument): - argument = argument.split(":",1) - self.names[argument[0]] = argument[1] - - def add_unpaired_samples_argument(self,argument): - print " - Adding unpaired samples" - for sample in argument: - self.add_sample(sample) - - def add_paired_samples_argument(self,argument): - print " - Adding paired samples" - pair = [] - for potential_sample in argument: - if(potential_sample == ":"): - self.add_pair(pair) - pair = [] - else: - pair.append(potential_sample) - - def add_treatments_argument(self,argument): - print " - Adding treatments" - treatment_argument = [] - for potential_treatment in argument: - if(potential_treatment == ":"): - self.add_treatment_argument(treatment_argument) - treatment_argument = [] - else: - treatment_argument.append(potential_treatment) - - def add_pairings(self): - samples = sorted(self.samples) - treatment_arguments = [] - - if len(self.pairs) > 0: - for pair_id in range(len(self.pairs)): - # add to treatments - treatment_arguments.append(str(pair_id+1)) - treatment_arguments.append(",".join(self.pairs[pair_id])) - - # remove from samples - for sample in self.pairs[pair_id]: - samples.remove(sample) - - # Add remaining samples as separate factors - i = len(self.pairs)+1 - for sample in samples: - treatment_arguments.append(str(i)) - treatment_arguments.append(sample) - i += 1 - - treatment_arguments.append(":") - - self.add_treatments_argument(treatment_arguments) - - def export(self,output): - # Open file stream - if(args.output == "-"): - fh = sys.stdout - else: - fh = open(args.output,"w") - - # Write header: - fh.write("filename\tname\t"+"\t".join(self.treatments.keys())+"\n") - - #fh.write("#\ttype") - #for treatment in self.treatments.keys(): - # fh.write("\t"+self.treatment_types[treatment]) - #fh.write("\n") - - # Write body: - for sample in self.samples: - fh.write(sample+"\t"+self.names[sample]) - for treatment in self.treatments.keys(): - fh.write("\t"+self.treatments[treatment]["sample_index"][sample]) - fh.write("\n") - - fh.close() - -if __name__=="__main__": - parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.") - parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True) - parser.add_argument("-u","--unpaired-samples", nargs="*", help='Unpaired sample files.') - parser.add_argument("-p","--paired-samples", nargs="*", help='Paired sample files: pat_x_sample_1.txt, .') - parser.add_argument("-t","--treatments", nargs="+", help='Treatment conditions',required=True) - parser.add_argument("-n","--sample-names", nargs="+", help='Paired sample files: pat_x_sample_1.txt:NAME .') - - args = parser.parse_args() - - s = sampleContainer() - s.add_unpaired_samples_argument(args.unpaired_samples) - s.add_paired_samples_argument(args.paired_samples) - s.add_treatments_argument(args.treatments) - s.add_names_argument(args.sample_names) - s.add_pairings() - s.export(args.output)
--- a/edgeR_Design_Matrix.xml Tue May 20 05:26:34 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<tool id="edger_design_matrix_creator" name="edgeR Design Matrix Creator"> - <description>Create the experimental design for edgeR by a few clicks in Galaxy</description> - - <command interpreter="python"> - edgeR_Design_Matrix.py - -o $contrast_matrix - -u - #for $unpaired_sample in $unpaired_samples - ${unpaired_sample} - #end for - -p - #for $paired_sample in $paired_samples - #for $sample in $paired_sample.samples - ${sample.sample} - #end for - : - #end for - -t - #for $treatment in $check.treatments - #for $factor in $treatment.factors - "$factor.name" - $factor.samples - #end for - : - #end for - -n - #for $unpaired_sample in $unpaired_samples - "${unpaired_sample}:${unpaired_sample.name}" - #end for - #for $paired_sample in $paired_samples - #for $sample in $paired_sample.samples - "${sample.sample}:${sample.sample.name}" - #end for - #end for - </command> - - <code file="edgeR_Design_Matrix.listing.py" /> - - <inputs> - <param name="unpaired_samples" type="data" format="tabular" label="Unpaired read-count dataset" help="from featureCounts/DEXSeq-count/HTSeq-count, etc. Press [ctrl] and click the sample to unselect it." multiple="true" /> - - <repeat name="paired_samples" title="Add paired samples; per patient"> - <repeat name="samples" title="Add sample for patient" min="2"> - <param name="sample" type="data" format="tabular" label="Read-count dataset that belongs to a pair" help="from featureCounts/DEXSeq-count/HTSeq-count, etc." /> - </repeat> - </repeat> - - <conditional name="check"> - <param name="continue" help="When clicking this variable, the treatment menu will appear" type="boolean" truevalue="yes" /> - - <when value="yes"> - <repeat name="treatments" title="Treatments" min="1"> - <repeat name="factors" title="Factor" min="2" help="e.g. 'normal' or 'tumor', 'untreated', 'recurrent', 'metastatic', or 'group_1',...,'group_n' etc."> - <param name="name" type="text" /> - <param name="samples" type="select" label="Corresponding samples" multiple="true" dynamic_options="listfiles(unpaired_samples,paired_samples)" /> - </repeat> - </repeat> - </when> - </conditional> - - </inputs> - - <outputs> - <data format="tabular" name="contrast_matrix" label="Contrast matrix" /> - </outputs> - - <help> -Examples:: - 2 group, unpaired:: - Group name 1 = Healthy - Group name 2 = Tumor - - </help> -</tool>