# HG changeset patch
# User yhoogstrate
# Date 1400578011 14400
# Node ID 8b7bd6e290c4fc646bfc62f7fa66ee9d0c1b93f5
# Parent 7cb518091b1826c2a5022da0d6bb1d1f519078e3
Deleted selected files
diff -r 7cb518091b18 -r 8b7bd6e290c4 edgeR_DGE.xml
--- a/edgeR_DGE.xml Tue May 20 05:26:34 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,212 +0,0 @@
-
-
- RNA-Seq expression analysis using edgeR (R package)
-
-
-
-
- R --vanilla --slave -f $R_script '--args
- $design_matrix
- $contrast
-
- $output_count_edgeR
- $output_cpm
- output_FPXM
- $output_raw_counts
-
- $qc
- $output_MDSplot
- $output_BCVplot
- $output_MAplot
- smearPlot '
- > $output_R
- 2> stderr.txt
- &&
- grep -v 'Calculating library sizes from column' stderr.txt 1>&2
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-library(edgeR,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping for loading limma
-
-## Fetch commandline arguments
-args <- commandArgs(trailingOnly = TRUE)
-designmatrix = args[1]
-contrast = args[2]
-
-output_1 = args[3]
-output_2 = args[4]
-output_3 = args[5] ##FPKM file - yet to be implemented
-output_4 = args[6]
-
-QC = nchar(args[7]) > 0
-
-output_5 = args[8]
-output_6 = args[9]
-output_7 = args[10]
-
-output_8 = args[11]
-
-
-library(edgeR)
-raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T)
-
-## Obtain read-counts
-
-header = read.delim(as.character(raw_data[1,1]),header=F,stringsAsFactors=F,row.names=1,nrows=1)
-has_header = (class(header[1,1]) == "character")
-
-read_counts = read.delim(as.character(raw_data[1,1]),header=has_header,stringsAsFactors=F,row.names=1)[1]
-
-for(i in 2:length(raw_data[,1])) {
- write("parsing counts from:",stdout())
- write(raw_data[i,1],stdout())
-
- header = read.delim(as.character(raw_data[i,1]),header=F,stringsAsFactors=F,row.names=1,nrows=1)
- has_header = (class(header[1,1]) == "character")
- table = read.delim(as.character(raw_data[i,1]),header=has_header,stringsAsFactors=F,row.names=1)[1]
-
- read_counts = cbind(read_counts,table)
-}
-
-colnames(read_counts) = as.character(raw_data[,2])
-
-
-
-## Filter for HTSeq predifined counts:
-exclude_HTSeq = c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
-exclude_DEXSeq = c("_ambiguous","_empty","_lowaqual","_notaligned")
-
-exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts))
-exclude = exclude[is.na(exclude)==0]
-if(length(exclude) != 0) {
- read_counts = read_counts[-exclude,]
-}
-
-
-
-empty_samples = apply(read_counts,2,function(x) sum(x) == 0)
-if(sum(empty_samples) > 0) {
- write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr())
- write(colnames(read_counts)[empty_samples],stderr())
-} else {
- dge = DGEList(counts=read_counts,genes=rownames(read_counts))
-
- design_tmp <- raw_data[3:length(raw_data)]
- rownames(design_tmp) <- colnames(dge)
- formula = paste(c("~0",colnames(design_tmp)),collapse = " + ")
- design <- model.matrix(as.formula(formula),design_tmp)
-
- prefixes = colnames(design_tmp)[attr(design,"assign")]
- avoid = nchar(prefixes) == nchar(colnames(design))
- replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
- replacements[avoid] = colnames(design)[avoid]
- colnames(design) = replacements
-
-
-
- write("Calculating normalization factors...",stdout())
- dge = calcNormFactors(dge)
- write("Estimating common dispersion...",stdout())
- dge = estimateGLMCommonDisp(dge,design)
- write("Estimating trended dispersion...",stdout())
- dge = estimateGLMTrendedDisp(dge,design)
- write("Estimating tagwise dispersion...",stdout())
- dge = estimateGLMTagwiseDisp(dge,design)
-
-
-
-
- if(QC == TRUE) {
- write("Creating QC plots...",stdout())
- #### MDS Plot
- pdf(output_5)
- plotMDS(dge, main="edgeR MDS Plot")
- dev.off()
- #### Biological coefficient of variation plot
- pdf(output_6)
- plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
- dev.off()
- }
-
-
-
- write("Fitting GLM...",stdout())
- fit = glmFit(dge,design)
-
- write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout())
- cont <- c(contrast)
- cont <- makeContrasts(contrasts=cont, levels=design)
-
- lrt <- glmLRT(fit, contrast=cont[,1])
- write(paste("Exporting to file: ",output_1,sep=""),stdout())
- write.table(file=output_1,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=T)
- write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t")
- ## todo EXPORT FPKM
- write.table(file=output_4,dge\$counts,sep="\t")
-
-
-
- if(QC == TRUE) {
- write("Creating MA plots...",stdout())
-
- etable <- topTags(lrt, n=nrow(dge))\$table
- etable <- etable[order(etable\$FDR), ]
- pdf(output_7)
- with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
- with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red"))
- abline(h=c(-1,1), col="blue")
- dev.off()
- }
- write("Done!",stdout())
-}
-
-
-
-
-
-
-
-
-
- (debug == "true")
-
-
-
- (qc == "true")
-
-
-
- (qc == "true")
-
-
-
- (qc == "true")
-
-
-
-
- input: Design matrix using "create Design matrix" tool
- input: contrast
-
-
diff -r 7cb518091b18 -r 8b7bd6e290c4 edgeR_Design_Matrix.listing.py
--- a/edgeR_Design_Matrix.listing.py Tue May 20 05:26:34 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-
-def listfiles(unpaired_samples,paired_samples):
- file_list = []
-
- if(unpaired_samples != None):
- if(type(unpaired_samples) == type([])):
- for sample in unpaired_samples:
- file_list.append([sample.name,sample.file_name,False])
-
- ## The following takes care of a bug in galaxy;
- ## if only one history object is selected, it isn't returned as a list but as single object.
- ## This is probably a rudimentair part of code that originates from the time that multiple data-objects were not implemented.
-
- else:
- file_list.append([unpaired_samples.name,unpaired_samples.file_name,False])
-
- if(paired_samples != None):
- pair_id = 0
- for pair in paired_samples:
- pair_id += 1
- sample_id = 0
- for sample in pair['samples']:
- sample_id += 1
- file_list.append([sample['sample'].name+" *[Pair "+str(pair_id)+"; sample "+str(sample_id)+"]",sample['sample'].file_name,False])
-
- return file_list
diff -r 7cb518091b18 -r 8b7bd6e290c4 edgeR_Design_Matrix.py
--- a/edgeR_Design_Matrix.py Tue May 20 05:26:34 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,178 +0,0 @@
-#!/usr/bin/env python
-import argparse, os, shutil, sys, tempfile, subprocess
-
-
-class sampleContainer:
- def __init__(self):
- self.samples = []
- self.pairs = []
- self.pair_index = {}
- self.treatments = {}
- self.treatment_types = {}
- self.names = {}
-
- def add_sample(self,sample):
- if(sample in self.samples):
- sys.stderr.write("Error:\n* Sample has been added multiple times: "+sample+"\n")
- sys.exit(1)
- else:
- self.samples.append(sample)
- print " - Added: "+sample
-
- def add_pair(self,pair):
- print " - Adding pair:"
- for sample in pair:
- self.add_sample(sample)
- self.pairs.append(pair)
- pair_id = len(self.pairs)
-
- for sample in pair:
- self.pair_index[sample] = pair_id
-
- def add_treatment_argument(self,treatment_argument,name=None):
- print " - Parsing treatment"
-
- treatment = {"factor_index":{},"sample_index":{}}
- only_integers = True
-
- i = 1
- for item in treatment_argument:
- if(i % 2):
- factor = item
- if(treatment['factor_index'].has_key(factor)):
- sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n")
- else:
- print " - Adding factor: "+factor
- treatment["factor_index"][factor] = []
- if(not factor.isdigit()):
- only_integers = False
- else:
- for sample in item.split(","):
- treatment["factor_index"][factor].append(sample)
- if(treatment["sample_index"].has_key(sample)):
- sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n")
- else:
- treatment["sample_index"][sample] = factor
- i += 1
-
- treatment_factors = sorted(treatment["factor_index"].keys())
-
- if(name == None):
- treatment["name"] = "_vs_".join(treatment_factors)
- else:
- treatment["name"] = str(name)
-
- if(len(treatment["sample_index"]) != len(self.samples)):
- sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n")
-
- if(only_integers):
- treatment_type = "integer"
- else:
- treatment_type = "string"
-
- if(self.treatments.has_key(treatment["name"])):
- sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n")
- else:
- self.treatments[treatment["name"]] = treatment
- self.treatment_types[treatment["name"]] = treatment_type
- print " - Treatment of type: "+treatment_type+" is valid: "+treatment["name"]
-
- def add_names_argument(self,names_argument):
- for sample in names_argument:
- self.add_name(sample)
-
- def add_name(self,argument):
- argument = argument.split(":",1)
- self.names[argument[0]] = argument[1]
-
- def add_unpaired_samples_argument(self,argument):
- print " - Adding unpaired samples"
- for sample in argument:
- self.add_sample(sample)
-
- def add_paired_samples_argument(self,argument):
- print " - Adding paired samples"
- pair = []
- for potential_sample in argument:
- if(potential_sample == ":"):
- self.add_pair(pair)
- pair = []
- else:
- pair.append(potential_sample)
-
- def add_treatments_argument(self,argument):
- print " - Adding treatments"
- treatment_argument = []
- for potential_treatment in argument:
- if(potential_treatment == ":"):
- self.add_treatment_argument(treatment_argument)
- treatment_argument = []
- else:
- treatment_argument.append(potential_treatment)
-
- def add_pairings(self):
- samples = sorted(self.samples)
- treatment_arguments = []
-
- if len(self.pairs) > 0:
- for pair_id in range(len(self.pairs)):
- # add to treatments
- treatment_arguments.append(str(pair_id+1))
- treatment_arguments.append(",".join(self.pairs[pair_id]))
-
- # remove from samples
- for sample in self.pairs[pair_id]:
- samples.remove(sample)
-
- # Add remaining samples as separate factors
- i = len(self.pairs)+1
- for sample in samples:
- treatment_arguments.append(str(i))
- treatment_arguments.append(sample)
- i += 1
-
- treatment_arguments.append(":")
-
- self.add_treatments_argument(treatment_arguments)
-
- def export(self,output):
- # Open file stream
- if(args.output == "-"):
- fh = sys.stdout
- else:
- fh = open(args.output,"w")
-
- # Write header:
- fh.write("filename\tname\t"+"\t".join(self.treatments.keys())+"\n")
-
- #fh.write("#\ttype")
- #for treatment in self.treatments.keys():
- # fh.write("\t"+self.treatment_types[treatment])
- #fh.write("\n")
-
- # Write body:
- for sample in self.samples:
- fh.write(sample+"\t"+self.names[sample])
- for treatment in self.treatments.keys():
- fh.write("\t"+self.treatments[treatment]["sample_index"][sample])
- fh.write("\n")
-
- fh.close()
-
-if __name__=="__main__":
- parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.")
- parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True)
- parser.add_argument("-u","--unpaired-samples", nargs="*", help='Unpaired sample files.')
- parser.add_argument("-p","--paired-samples", nargs="*", help='Paired sample files: pat_x_sample_1.txt, .')
- parser.add_argument("-t","--treatments", nargs="+", help='Treatment conditions',required=True)
- parser.add_argument("-n","--sample-names", nargs="+", help='Paired sample files: pat_x_sample_1.txt:NAME .')
-
- args = parser.parse_args()
-
- s = sampleContainer()
- s.add_unpaired_samples_argument(args.unpaired_samples)
- s.add_paired_samples_argument(args.paired_samples)
- s.add_treatments_argument(args.treatments)
- s.add_names_argument(args.sample_names)
- s.add_pairings()
- s.export(args.output)
diff -r 7cb518091b18 -r 8b7bd6e290c4 edgeR_Design_Matrix.xml
--- a/edgeR_Design_Matrix.xml Tue May 20 05:26:34 2014 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-
-
- Create the experimental design for edgeR by a few clicks in Galaxy
-
-
- edgeR_Design_Matrix.py
- -o $contrast_matrix
- -u
- #for $unpaired_sample in $unpaired_samples
- ${unpaired_sample}
- #end for
- -p
- #for $paired_sample in $paired_samples
- #for $sample in $paired_sample.samples
- ${sample.sample}
- #end for
- :
- #end for
- -t
- #for $treatment in $check.treatments
- #for $factor in $treatment.factors
- "$factor.name"
- $factor.samples
- #end for
- :
- #end for
- -n
- #for $unpaired_sample in $unpaired_samples
- "${unpaired_sample}:${unpaired_sample.name}"
- #end for
- #for $paired_sample in $paired_samples
- #for $sample in $paired_sample.samples
- "${sample.sample}:${sample.sample.name}"
- #end for
- #end for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Examples::
- 2 group, unpaired::
- Group name 1 = Healthy
- Group name 2 = Tumor
-
-
-