# HG changeset patch # User yhoogstrate # Date 1441096336 14400 # Node ID 049d8bc2214e600c9039e5b288cc3cf851ffa658 # Parent 56117c9db7137f28109c35c00a940a37b3998dd8 planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools/raw/master/edger_with_design_matrix commit 2700e500a4fb135a20ede7d52221a9d31f1aaa5e-dirty diff -r 56117c9db713 -r 049d8bc2214e bin/design_matrix_creator --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/design_matrix_creator Tue Sep 01 04:32:16 2015 -0400 @@ -0,0 +1,222 @@ +#!/usr/bin/env python + +import argparse, os, shutil, sys, tempfile, subprocess + +__version_info__ = ('1', '0', '0')#, 'beta') +__version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3])+"-"+__version_info__[3] +__author__ = 'Youri Hoogstrate' +__homepage__ = 'https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools' +__license__ = 'GNU General Public License v3 (GPLv3)' + + +class sampleContainer: + def __init__(self): + self.samples = [] + self.treatments = {} + self.treatment_index = [] + self.treatment_types = {} + + def do_decode(self,encoded_str): + return encoded_str.decode("base64").strip().replace("\t",'') + + def add_samples(self,argument): + print " - Adding samples" + for sample in argument: + self.add_sample(self.do_decode(sample)) + + def add_sample(self,sample): + if(sample in self.samples): + sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n") + sys.exit(1) + else: + self.samples.append(sample) + print " - Added: "+sample + + def add_blocking(self,argument): + print " - Adding paired samples" + pair = [] + for block in argument: + self.add_block(block) + + def add_block(self,blocks): + blocks = blocks.split(":") + as_treatment = blocks[0] + blocks = blocks[1:] + + used_samples = [] + indexed_samples = {} + + for i in range(len(blocks)): + block = blocks[i] + samples = self.get_samples_from_block(block) + indexed_samples[i+1] = [] + for sample in samples: + if(sample in used_samples): + sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n") + sys.exit(0) + else: + indexed_samples[i+1] = block + used_samples.append(sample) + + for sample in self.samples: + if(sample not in used_samples): + i = i + 1 + indexed_samples[i+1] = str(sample).encode('base64').strip() + + for index in indexed_samples.keys(): + key = str(index).encode('base64').strip() + as_treatment += ":"+key+":"+indexed_samples[index] + + self.add_treatment(as_treatment) + + def get_samples_from_block(self,decoded_block): + return [ self.do_decode(x) for x in decoded_block.split(",")] + + def add_treatments(self,argument): + print " - Adding treatments" + for treatment in argument: + self.add_treatment(treatment) + + def add_treatment(self,treatment_argument): + print " - Parsing treatment" + + + treatment_argument = treatment_argument.split(":") + name = self.do_decode(treatment_argument[0]) + treatment_argument = treatment_argument[1:] + + + treatment = {"factor_index":{},"sample_index":{}} + only_integers = True + + i = 1 + for item in treatment_argument: + if(i % 2): + factor = self.do_decode(item) + + if(treatment['factor_index'].has_key(factor)): + sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n") + sys.exit(0) + else: + print " - Adding factor: "+factor + treatment["factor_index"][factor] = [] + if(not factor.isdigit()): + only_integers = False + else: + for sample in item.split(","): + sample = self.do_decode(sample) + + if(not sample in self.samples): + sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n") + sys.exit(0) + + treatment["factor_index"][factor].append(sample) + if(treatment["sample_index"].has_key(sample)): + sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n") + sys.exit(0) + else: + treatment["sample_index"][sample] = factor + i += 1 + + treatment_factors = sorted(treatment["factor_index"].keys()) + + if(name == None): + treatment["name"] = "_vs_".join(treatment_factors) + else: + treatment["name"] = str(name) + + if(len(treatment["sample_index"]) != len(self.samples)): + sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n") + + if(only_integers): + treatment_type = "integer" + else: + treatment_type = "string" + + if(self.treatments.has_key(treatment["name"])): + sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n") + else: + self.treatments[treatment["name"]] = treatment + self.treatment_index.append(treatment["name"]) + self.treatment_types[treatment["name"]] = treatment_type + print " - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid" + + def export(self,output): + # Open file stream + if(args.output == "-"): + fh = sys.stdout + else: + fh = open(args.output,"w") + + # Write header: + fh.write("sample-name\t"+"\t".join(self.treatment_index)+"\n") + + # Write body: + for sample in self.samples: + fh.write(sample) + for treatment_id in self.treatment_index: + treatment = self.treatments[treatment_id] + fh.write("\t"+treatment["sample_index"][sample]) + fh.write("\n") + + fh.close() + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.") + parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True) + parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)') + parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)') + parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True) + parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"') + + args = parser.parse_args() + + columns = None + if(args.columns_file): + with open(args.columns_file, "r") as f: + listed_columns = [None] + f.readline().strip("\n").split("\t") + for i in range(1,len(listed_columns)): + listed_columns[i] = listed_columns[i].encode('base64').replace('\n','') + + s = sampleContainer() + + if(listed_columns): + columns = [] + for sample in args.sample_names: + columns.append(listed_columns[int(sample)]) + + + treatments = [] + for treatment in args.treatments: + treatment = treatment.split(":") + for i in range(1,len(treatment)): + if(i%2 == 0): + treatment_tmp = treatment[i].split(",") + for j in range(len(treatment_tmp)): + treatment_tmp[j] = listed_columns[int(treatment_tmp[j])] + treatment[i] = ",".join(treatment_tmp) + + treatments.append(":".join(treatment)) + + blockings = [] + if(args.blocking): + for blocking in args.blocking: + blocking = blocking.split(":") + for i in range(1,len(blocking)): + block = blocking[i].split(",") + for j in range(len(block)): + block[j] = listed_columns[int(block[j])] + blocking[i] = ",".join(block) + blockings.append(":".join(blocking)) + + s.add_samples(columns) + s.add_treatments(treatments) + s.add_blocking(blockings) + + else: + s.add_samples(args.sample_names) + s.add_treatments(args.treatments) + if(args.blocking): + s.add_blocking(args.blocking) + + s.export(args.output) diff -r 56117c9db713 -r 049d8bc2214e bin/edger_dge_table_to_bedgraph --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/edger_dge_table_to_bedgraph Tue Sep 01 04:32:16 2015 -0400 @@ -0,0 +1,222 @@ +#!/usr/bin/env python + +import re,sys,os,os.path,argparse,textwrap,datetime + +class GTF: + def __init__(self,filename,features=["exon"],symbol="gene_id"): + self.features = features + if(re.match("^[a-zA-Z0-9_\-]+$",symbol)): + self.symbol = symbol + else: + raise ValueError('False symbol matching symbol: '+str(symbol)) + self.index = {} + self.parse(filename) + + def parse(self,filename): + with open(filename) as infile: + for line in infile: + self.parse_line(line) + + def parse_line(self,line): + line = line.strip().split("\t") + if(len(line) == 9): + if(line[2] in self.features): + gene_id = self.parse_column_9(line[8]) + if(gene_id): + if(not self.index.has_key(gene_id)): + self.index[gene_id] = [] + self.index[gene_id].append([line[0],line[3],line[4]]) + + def parse_column_9(self,line): + query = self.symbol+'[ =]+([^ ;]+)' + m = re.search(query, line) + + if(m): + return m.group(1).strip("'").strip('"') + else: + return None + + def get(self,key): + try: + return self.index[key] + except: + return False + + +class EdgeR_table: + def __init__(self,table,gtf,columns=[3,7]): + self.index = {} + self.gtf = gtf + self.columns = columns + self.parse(table) + + def parse(self,filename): + i = 0 + with open(filename) as infile: + for line in infile: + if(i == 0): + self.parse_header(line) + else: + self.parse_line(line) + i += 1 + + def parse_header(self,line): + params = line.strip().split("\t") + if(params[1].lower().find("genes") == -1): + raise ValueError('False header in file - no "genes" in 2nd colum: '+line) + if(params[2].lower().find("logfc") == -1): + raise ValueError('False header in file - no "logfc" in 3rd colum: '+line) + if(params[3].lower().find("logcpm") == -1): + raise ValueError('False header in file - no "logcpm" in 4th colum: '+line) + if(params[4].lower().find("lr") == -1): + raise ValueError('False header in file - no "lr" in 5th colum: '+line) + if(params[5].lower().find("pvalue") == -1): + raise ValueError('False header in file - no "pvalue" in 6th colum: '+line) + if(params[6].lower().find("fdr") == -1): + raise ValueError('False header in file - no "fdr" in 7th colum: '+line) + + def parse_line(self,line): + line = line.strip().split("\t") + + if(len(line) == 7): + gene_id = line[1].strip('"').strip("'") + column_data = {} + for column in self.columns: + if(column in [6,7]): + column_data[column] = str(1.0 - float(line[column-1])) + else: + column_data[column] = line[column-1] + + locations = self.gtf.get(gene_id) + if(not locations): + print "Warning: no location found for gene "+gene_id + else: + for location in locations: + self.insert(location,column_data) + + def insert(self,location,data): + chrom = location[0] + start = location[1] + end = location[2] + + if(not self.index.has_key(chrom)): + self.index[chrom] = {} + + if(not self.index[chrom].has_key(start)): + self.index[chrom][start] = {} + + if(not self.index[chrom][start].has_key(end)): + self.index[chrom][start][end] = [] + + self.index[chrom][start][end].append(data) + + def export(self,filenames={3:"log_cpm.txt",7:"fdr.txt"}): + for column in self.columns: + fh = open(filenames[column],"w") + + buf = False + + for chrom in sorted(self.index.keys()): + for start in sorted(self.index[chrom].keys()): + for end in sorted(self.index[chrom][start].keys()): + fh.write(chrom+"\t"+start+"\t"+end+"\t"+self.index[chrom][start][end][0][column]+"\n") + + fh.close() + + os.system("sort -k1,1V -k2,2g -k3,3g '"+filenames[column]+"' > '"+filenames[column]+".sorted'") + + +def remove_overlap_in_bedgraph(bedgraph_file_dirty,bedgraph_file_clean): + fh = open(bedgraph_file_clean,"w") + buf = False + + with open(bedgraph_file_dirty,"r") as f: + for line in f: + cur = line.strip().split("\t") + cur[1] = int(cur[1]) + cur[2] = int(cur[2]) + + if(not buf): + buf = cur + else: + if(cur[0] == buf[0] and cur[1] <= buf[2] ): + if(buf[1] == cur[1]): #is subset + newscore = (float(buf[3])+float(cur[3]))/2 + buf[2] = cur[2] + buf[3] = newscore + else: + c1 = buf[1] + c2 = cur[1] + c3 = min(buf[2],cur[2]) + c4 = max(buf[2],cur[2]) + + fh.write(buf[0]+"\t"+str(c1)+"\t"+str(c2-1)+"\t"+str(buf[3])+"\n") + + newscore = (float(buf[3])+float(cur[3]))/2 + #fh.write(buf[0]+"\t"+str(c2+1)+"\t"+str(c3)+"\t"+str(newscore)+"\tp2\n") + #buf = [buf[0], c3+1 , c4 , cur[3]] + + buf = [buf[0], c2 , c4 , cur[3]] + + # find if buf is a subset -> if so, merge and send to buffer + # or find the overlapping region + + # if current is overlapping with buffer; merge: + ## [ ] < buf + ## [ ] < cur + ## + ## [ ] < buf + ## [ ] < cur + ## 111112222333333 << write 1 and 2 and keep 3 in buf + else: + fh.write(buf[0]+"\t"+str(buf[1])+"\t"+str(buf[2])+"\t"+str(buf[3])+"\n") + buf=cur + + fh.write(buf[0]+"\t"+str(buf[1])+"\t"+str(buf[2])+"\t"+str(buf[3])+"\n") + fh.close() + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,epilog="For more info please visit:\n") + + parser.add_argument("-t",help="CPM table to extract columns from",nargs=1,required=True) + parser.add_argument("-g",help="GTF file used to extract genomic location",nargs=1,required=True) + + parser.add_argument("-c3",help="Output (bedgraph) for column 3 (logFC)",nargs="?",required=False) + parser.add_argument("-c4",help="Output (bedgraph) for column 4 (logCPM)",nargs="?",required=False) + parser.add_argument("-c5",help="Output (bedgraph) for column 5 (LR)",nargs="?",required=False) + parser.add_argument("-c6",help="Output (bedgraph) for column 6 (PValue)",nargs="?",required=False) + parser.add_argument("-c7",help="Output (bedgraph) for column 7 (FDR)",nargs="?",required=False) + + args = parser.parse_args() + + #files = {3:"VCAP_logFC.hg19.bedgraph",7:"VCAP_fdr.hg19.bedgraph"} + files = {} + + if(args.c3): + files[3] = args.c3 + if(args.c4): + files[4] = args.c4 + if(args.c5): + files[5] = args.c5 + if(args.c6): + files[6] = args.c6 + if(args.c7): + files[7] = args.c7 + + print "Parsing GTF file" + g = GTF(args.g[0]) + + print "Parsing EdgeR table" + e = EdgeR_table(args.t[0],g,files.keys()) + + print "Exporting raw bedgraph(s)" + e.export(files) + + print "Removing overlapping entries in bedgraph(s)" + for key in files.keys(): + remove_overlap_in_bedgraph(files[key]+".sorted",files[key]) + os.system("rm '"+files[key]+".sorted'") diff -r 56117c9db713 -r 049d8bc2214e design_matrix_creator --- a/design_matrix_creator Tue Sep 01 04:25:37 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,222 +0,0 @@ -#!/usr/bin/env python - -import argparse, os, shutil, sys, tempfile, subprocess - -__version_info__ = ('1', '0', '0')#, 'beta') -__version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3])+"-"+__version_info__[3] -__author__ = 'Youri Hoogstrate' -__homepage__ = 'https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools' -__license__ = 'GNU General Public License v3 (GPLv3)' - - -class sampleContainer: - def __init__(self): - self.samples = [] - self.treatments = {} - self.treatment_index = [] - self.treatment_types = {} - - def do_decode(self,encoded_str): - return encoded_str.decode("base64").strip().replace("\t",'') - - def add_samples(self,argument): - print " - Adding samples" - for sample in argument: - self.add_sample(self.do_decode(sample)) - - def add_sample(self,sample): - if(sample in self.samples): - sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n") - sys.exit(1) - else: - self.samples.append(sample) - print " - Added: "+sample - - def add_blocking(self,argument): - print " - Adding paired samples" - pair = [] - for block in argument: - self.add_block(block) - - def add_block(self,blocks): - blocks = blocks.split(":") - as_treatment = blocks[0] - blocks = blocks[1:] - - used_samples = [] - indexed_samples = {} - - for i in range(len(blocks)): - block = blocks[i] - samples = self.get_samples_from_block(block) - indexed_samples[i+1] = [] - for sample in samples: - if(sample in used_samples): - sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n") - sys.exit(0) - else: - indexed_samples[i+1] = block - used_samples.append(sample) - - for sample in self.samples: - if(sample not in used_samples): - i = i + 1 - indexed_samples[i+1] = str(sample).encode('base64').strip() - - for index in indexed_samples.keys(): - key = str(index).encode('base64').strip() - as_treatment += ":"+key+":"+indexed_samples[index] - - self.add_treatment(as_treatment) - - def get_samples_from_block(self,decoded_block): - return [ self.do_decode(x) for x in decoded_block.split(",")] - - def add_treatments(self,argument): - print " - Adding treatments" - for treatment in argument: - self.add_treatment(treatment) - - def add_treatment(self,treatment_argument): - print " - Parsing treatment" - - - treatment_argument = treatment_argument.split(":") - name = self.do_decode(treatment_argument[0]) - treatment_argument = treatment_argument[1:] - - - treatment = {"factor_index":{},"sample_index":{}} - only_integers = True - - i = 1 - for item in treatment_argument: - if(i % 2): - factor = self.do_decode(item) - - if(treatment['factor_index'].has_key(factor)): - sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n") - sys.exit(0) - else: - print " - Adding factor: "+factor - treatment["factor_index"][factor] = [] - if(not factor.isdigit()): - only_integers = False - else: - for sample in item.split(","): - sample = self.do_decode(sample) - - if(not sample in self.samples): - sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n") - sys.exit(0) - - treatment["factor_index"][factor].append(sample) - if(treatment["sample_index"].has_key(sample)): - sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n") - sys.exit(0) - else: - treatment["sample_index"][sample] = factor - i += 1 - - treatment_factors = sorted(treatment["factor_index"].keys()) - - if(name == None): - treatment["name"] = "_vs_".join(treatment_factors) - else: - treatment["name"] = str(name) - - if(len(treatment["sample_index"]) != len(self.samples)): - sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n") - - if(only_integers): - treatment_type = "integer" - else: - treatment_type = "string" - - if(self.treatments.has_key(treatment["name"])): - sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n") - else: - self.treatments[treatment["name"]] = treatment - self.treatment_index.append(treatment["name"]) - self.treatment_types[treatment["name"]] = treatment_type - print " - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid" - - def export(self,output): - # Open file stream - if(args.output == "-"): - fh = sys.stdout - else: - fh = open(args.output,"w") - - # Write header: - fh.write("sample-name\t"+"\t".join(self.treatment_index)+"\n") - - # Write body: - for sample in self.samples: - fh.write(sample) - for treatment_id in self.treatment_index: - treatment = self.treatments[treatment_id] - fh.write("\t"+treatment["sample_index"][sample]) - fh.write("\n") - - fh.close() - -if __name__=="__main__": - parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.") - parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True) - parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)') - parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)') - parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True) - parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"') - - args = parser.parse_args() - - columns = None - if(args.columns_file): - with open(args.columns_file, "r") as f: - listed_columns = [None] + f.readline().strip("\n").split("\t") - for i in range(1,len(listed_columns)): - listed_columns[i] = listed_columns[i].encode('base64').replace('\n','') - - s = sampleContainer() - - if(listed_columns): - columns = [] - for sample in args.sample_names: - columns.append(listed_columns[int(sample)]) - - - treatments = [] - for treatment in args.treatments: - treatment = treatment.split(":") - for i in range(1,len(treatment)): - if(i%2 == 0): - treatment_tmp = treatment[i].split(",") - for j in range(len(treatment_tmp)): - treatment_tmp[j] = listed_columns[int(treatment_tmp[j])] - treatment[i] = ",".join(treatment_tmp) - - treatments.append(":".join(treatment)) - - blockings = [] - if(args.blocking): - for blocking in args.blocking: - blocking = blocking.split(":") - for i in range(1,len(blocking)): - block = blocking[i].split(",") - for j in range(len(block)): - block[j] = listed_columns[int(block[j])] - blocking[i] = ",".join(block) - blockings.append(":".join(blocking)) - - s.add_samples(columns) - s.add_treatments(treatments) - s.add_blocking(blockings) - - else: - s.add_samples(args.sample_names) - s.add_treatments(args.treatments) - if(args.blocking): - s.add_blocking(args.blocking) - - s.export(args.output) diff -r 56117c9db713 -r 049d8bc2214e edgeR_Differential_Gene_Expression.xml --- a/edgeR_Differential_Gene_Expression.xml Tue Sep 01 04:25:37 2015 -0400 +++ b/edgeR_Differential_Gene_Expression.xml Tue Sep 01 04:32:16 2015 -0400 @@ -29,11 +29,6 @@ echo $(R --version | grep version | grep -v GNU) " , EdgeR version" $(R --vanilla --slave -e "library(edgeR) ; cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null | grep -v -i "WARNING: ") - - R --vanilla --slave -f $R_script '--args $expression_matrix $design_matrix @@ -111,48 +106,47 @@ -library(limma,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping -library(edgeR,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping -library(splines,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping + 0) { write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr()) write(colnames(read_counts)[empty_samples],stderr()) } else { - dge <- DGEList(counts=read_counts,genes=rownames(read_counts)) + dge <- DGEList(counts=read_counts,genes=rownames(read_counts)) - formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ") - design_matrix_tmp <- design_matrix - colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp)) - design <- model.matrix(as.formula(formula),design_matrix_tmp) + formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ") + design_matrix_tmp <- design_matrix + colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp)) + design <- model.matrix(as.formula(formula),design_matrix_tmp) rm(design_matrix_tmp) # Filter prefixes @@ -211,18 +205,18 @@ # Do normalization write("Calculating normalization factors...",stdout()) - dge <- calcNormFactors(dge) + dge <- calcNormFactors(dge) write("Estimating common dispersion...",stdout()) - dge <- estimateGLMCommonDisp(dge,design) + dge <- estimateGLMCommonDisp(dge,design) write("Estimating trended dispersion...",stdout()) - dge <- estimateGLMTrendedDisp(dge,design) + dge <- estimateGLMTrendedDisp(dge,design) write("Estimating tagwise dispersion...",stdout()) - dge <- estimateGLMTagwiseDisp(dge,design) + dge <- estimateGLMTagwiseDisp(dge,design) if(output_MDSplot_logFC != "/dev/null") { write("Creating MDS plot (logFC method)",stdout()) - points <- plotMDS.DGEList(dge,top=500,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot + points <- plotMDS.DGEList(dge,top=500,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot dev.off()# Kill it if(output_format_images == "pdf") { @@ -237,7 +231,7 @@ } - diff_x <- abs(max(points\$x)-min(points\$x)) + diff_x <- abs(max(points\$x)-min(points\$x)) diff_y <-(max(points\$y)-min(points\$y)) plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR logFC-MDS Plot on top 500 genes",type="n", xlab="Leading logFC dim 1", ylab="Leading logFC dim 2") points(points\$x,points\$y,pch=20) @@ -252,7 +246,7 @@ ## 1. First create a virtual plot to obtain the desired coordinates pdf("bcvmds.pdf") - points <- plotMDS.DGEList(dge,method="bcv",top=500,labels=rep("",nrow(dge\$samples))) + points <- plotMDS.DGEList(dge,method="bcv",top=500,labels=rep("",nrow(dge\$samples))) dev.off()# Kill it ## 2. Re-plot the coordinates in a new figure with the size and settings. @@ -267,8 +261,8 @@ bitmap(output_MDSplot_bcv,type="png16m",height=14,width=14) } - diff_x <- abs(max(points\$x)-min(points\$x)) - diff_y <-(max(points\$y)-min(points\$y)) + diff_x <- abs(max(points\$x)-min(points\$x)) + diff_y <- (max(points\$y)-min(points\$y)) plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR BCV-MDS Plot",type="n", xlab="Leading BCV dim 1", ylab="Leading BCV dim 2") points(points\$x,points\$y,pch=20) text(points\$x, points\$y,rownames(dge\$samples),cex=1.25,col="gray",pos=4) @@ -298,13 +292,13 @@ write("Fitting GLM...",stdout()) - fit <- glmFit(dge,design) + fit <- glmFit(dge,design) write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout()) - cont <- c(contrast) - cont <- makeContrasts(contrasts=cont, levels=design) + cont <- c(contrast) + cont <- makeContrasts(contrasts=cont, levels=design) - lrt <- glmLRT(fit, contrast=cont[,1]) + lrt <- glmLRT(fit, contrast=cont[,1]) write(paste("Exporting to file: ",output_count_edgeR,sep=""),stdout()) write.table(file=output_count_edgeR,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=TRUE,col.names=NA) write.table(file=output_cpm,cpm(dge,normalized.lib.sizes=TRUE),sep="\t",row.names=TRUE,col.names=NA) @@ -313,8 +307,8 @@ write.table(file=output_raw_counts,dge\$counts,sep="\t",row.names=TRUE,col.names=NA) if(output_MAplot != "/dev/null" || output_PValue_distribution_plot != "/dev/null") { - etable <- topTags(lrt, n=nrow(dge))\$table - etable <- etable[order(etable\$FDR), ] + etable <- topTags(lrt, n=nrow(dge))\$table + etable <- etable[order(etable\$FDR), ] if(output_MAplot != "/dev/null") { write("Creating MA plot...",stdout()) @@ -350,16 +344,16 @@ bitmap(output_PValue_distribution_plot,type="png16m",width=14,height=14) } - expressed_genes <- subset(etable, PValue < 0.99) - h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)") - center <- sum(h\$counts) / length(h\$counts) + expressed_genes <- subset(etable, PValue < 0.99) + h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)") + center <- sum(h\$counts) / length(h\$counts) lines(c(0,1),c(center,center),lty=2,col="red",lwd=2) - k <- ksmooth(h\$mid, h\$counts) + k <- ksmooth(h\$mid, h\$counts) lines(k\$x,k\$y,col="red",lwd=2) - rmsd <- (h\$counts) - center - rmsd <- rmsd^2 - rmsd <- sum(rmsd) - rmsd <- sqrt(rmsd) + rmsd <- (h\$counts) - center + rmsd <- rmsd^2 + rmsd <- sum(rmsd) + rmsd <- sqrt(rmsd) text(0,max(h\$counts),paste("e=",round(rmsd,2),sep=""),pos=4,col="blue") ## change e into epsilon somehow dev.off() @@ -379,9 +373,9 @@ bitmap(output_heatmap_plot,type="png16m",width=10.5) } - etable2 <- topTags(lrt, n=100)\$table - order <- rownames(etable2) - cpm_sub <- cpm(dge,normalized.lib.sizes=TRUE,log=TRUE)[as.numeric(order),] + etable2 <- topTags(lrt, n=100)\$table + order <- rownames(etable2) + cpm_sub <- cpm(dge,normalized.lib.sizes=TRUE,log=TRUE)[as.numeric(order),] heatmap(t(cpm_sub)) dev.off() } @@ -394,6 +388,7 @@ write("Done!",stdout()) } +]]> diff -r 56117c9db713 -r 049d8bc2214e edger_dge_table_to_bedgraph --- a/edger_dge_table_to_bedgraph Tue Sep 01 04:25:37 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,222 +0,0 @@ -#!/usr/bin/env python - -import re,sys,os,os.path,argparse,textwrap,datetime - -class GTF: - def __init__(self,filename,features=["exon"],symbol="gene_id"): - self.features = features - if(re.match("^[a-zA-Z0-9_\-]+$",symbol)): - self.symbol = symbol - else: - raise ValueError('False symbol matching symbol: '+str(symbol)) - self.index = {} - self.parse(filename) - - def parse(self,filename): - with open(filename) as infile: - for line in infile: - self.parse_line(line) - - def parse_line(self,line): - line = line.strip().split("\t") - if(len(line) == 9): - if(line[2] in self.features): - gene_id = self.parse_column_9(line[8]) - if(gene_id): - if(not self.index.has_key(gene_id)): - self.index[gene_id] = [] - self.index[gene_id].append([line[0],line[3],line[4]]) - - def parse_column_9(self,line): - query = self.symbol+'[ =]+([^ ;]+)' - m = re.search(query, line) - - if(m): - return m.group(1).strip("'").strip('"') - else: - return None - - def get(self,key): - try: - return self.index[key] - except: - return False - - -class EdgeR_table: - def __init__(self,table,gtf,columns=[3,7]): - self.index = {} - self.gtf = gtf - self.columns = columns - self.parse(table) - - def parse(self,filename): - i = 0 - with open(filename) as infile: - for line in infile: - if(i == 0): - self.parse_header(line) - else: - self.parse_line(line) - i += 1 - - def parse_header(self,line): - params = line.strip().split("\t") - if(params[1].lower().find("genes") == -1): - raise ValueError('False header in file - no "genes" in 2nd colum: '+line) - if(params[2].lower().find("logfc") == -1): - raise ValueError('False header in file - no "logfc" in 3rd colum: '+line) - if(params[3].lower().find("logcpm") == -1): - raise ValueError('False header in file - no "logcpm" in 4th colum: '+line) - if(params[4].lower().find("lr") == -1): - raise ValueError('False header in file - no "lr" in 5th colum: '+line) - if(params[5].lower().find("pvalue") == -1): - raise ValueError('False header in file - no "pvalue" in 6th colum: '+line) - if(params[6].lower().find("fdr") == -1): - raise ValueError('False header in file - no "fdr" in 7th colum: '+line) - - def parse_line(self,line): - line = line.strip().split("\t") - - if(len(line) == 7): - gene_id = line[1].strip('"').strip("'") - column_data = {} - for column in self.columns: - if(column in [6,7]): - column_data[column] = str(1.0 - float(line[column-1])) - else: - column_data[column] = line[column-1] - - locations = self.gtf.get(gene_id) - if(not locations): - print "Warning: no location found for gene "+gene_id - else: - for location in locations: - self.insert(location,column_data) - - def insert(self,location,data): - chrom = location[0] - start = location[1] - end = location[2] - - if(not self.index.has_key(chrom)): - self.index[chrom] = {} - - if(not self.index[chrom].has_key(start)): - self.index[chrom][start] = {} - - if(not self.index[chrom][start].has_key(end)): - self.index[chrom][start][end] = [] - - self.index[chrom][start][end].append(data) - - def export(self,filenames={3:"log_cpm.txt",7:"fdr.txt"}): - for column in self.columns: - fh = open(filenames[column],"w") - - buf = False - - for chrom in sorted(self.index.keys()): - for start in sorted(self.index[chrom].keys()): - for end in sorted(self.index[chrom][start].keys()): - fh.write(chrom+"\t"+start+"\t"+end+"\t"+self.index[chrom][start][end][0][column]+"\n") - - fh.close() - - os.system("sort -k1,1V -k2,2g -k3,3g '"+filenames[column]+"' > '"+filenames[column]+".sorted'") - - -def remove_overlap_in_bedgraph(bedgraph_file_dirty,bedgraph_file_clean): - fh = open(bedgraph_file_clean,"w") - buf = False - - with open(bedgraph_file_dirty,"r") as f: - for line in f: - cur = line.strip().split("\t") - cur[1] = int(cur[1]) - cur[2] = int(cur[2]) - - if(not buf): - buf = cur - else: - if(cur[0] == buf[0] and cur[1] <= buf[2] ): - if(buf[1] == cur[1]): #is subset - newscore = (float(buf[3])+float(cur[3]))/2 - buf[2] = cur[2] - buf[3] = newscore - else: - c1 = buf[1] - c2 = cur[1] - c3 = min(buf[2],cur[2]) - c4 = max(buf[2],cur[2]) - - fh.write(buf[0]+"\t"+str(c1)+"\t"+str(c2-1)+"\t"+str(buf[3])+"\n") - - newscore = (float(buf[3])+float(cur[3]))/2 - #fh.write(buf[0]+"\t"+str(c2+1)+"\t"+str(c3)+"\t"+str(newscore)+"\tp2\n") - #buf = [buf[0], c3+1 , c4 , cur[3]] - - buf = [buf[0], c2 , c4 , cur[3]] - - # find if buf is a subset -> if so, merge and send to buffer - # or find the overlapping region - - # if current is overlapping with buffer; merge: - ## [ ] < buf - ## [ ] < cur - ## - ## [ ] < buf - ## [ ] < cur - ## 111112222333333 << write 1 and 2 and keep 3 in buf - else: - fh.write(buf[0]+"\t"+str(buf[1])+"\t"+str(buf[2])+"\t"+str(buf[3])+"\n") - buf=cur - - fh.write(buf[0]+"\t"+str(buf[1])+"\t"+str(buf[2])+"\t"+str(buf[3])+"\n") - fh.close() - - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,epilog="For more info please visit:\n") - - parser.add_argument("-t",help="CPM table to extract columns from",nargs=1,required=True) - parser.add_argument("-g",help="GTF file used to extract genomic location",nargs=1,required=True) - - parser.add_argument("-c3",help="Output (bedgraph) for column 3 (logFC)",nargs="?",required=False) - parser.add_argument("-c4",help="Output (bedgraph) for column 4 (logCPM)",nargs="?",required=False) - parser.add_argument("-c5",help="Output (bedgraph) for column 5 (LR)",nargs="?",required=False) - parser.add_argument("-c6",help="Output (bedgraph) for column 6 (PValue)",nargs="?",required=False) - parser.add_argument("-c7",help="Output (bedgraph) for column 7 (FDR)",nargs="?",required=False) - - args = parser.parse_args() - - #files = {3:"VCAP_logFC.hg19.bedgraph",7:"VCAP_fdr.hg19.bedgraph"} - files = {} - - if(args.c3): - files[3] = args.c3 - if(args.c4): - files[4] = args.c4 - if(args.c5): - files[5] = args.c5 - if(args.c6): - files[6] = args.c6 - if(args.c7): - files[7] = args.c7 - - print "Parsing GTF file" - g = GTF(args.g[0]) - - print "Parsing EdgeR table" - e = EdgeR_table(args.t[0],g,files.keys()) - - print "Exporting raw bedgraph(s)" - e.export(files) - - print "Removing overlapping entries in bedgraph(s)" - for key in files.keys(): - remove_overlap_in_bedgraph(files[key]+".sorted",files[key]) - os.system("rm '"+files[key]+".sorted'") diff -r 56117c9db713 -r 049d8bc2214e tool_dependencies.xml --- a/tool_dependencies.xml Tue Sep 01 04:25:37 2015 -0400 +++ b/tool_dependencies.xml Tue Sep 01 04:32:16 2015 -0400 @@ -7,7 +7,7 @@ - ls -als ; echo "###############" ; ls -als $REPOSITORY_INSTALL_DIR ; pwd ; mkdir $INSTALL_DIR/bin ; cp $REPOSITORY_INSTALL_DIR/design_matrix_creator $INSTALL_DIR/bin/ + mkdir $INSTALL_DIR/bin ; cp $REPOSITORY_INSTALL_DIR/bin/design_matrix_creator $INSTALL_DIR/bin/ $INSTALL_DIR/bin/design_matrix_creator @@ -22,7 +22,7 @@ - mkdir $INSTALL_DIR/bin ; cp $REPOSITORY_INSTALL_DIR/edger_dge_table_to_bedgraph $INSTALL_DIR/bin/ + mkdir $INSTALL_DIR/bin ; cp $REPOSITORY_INSTALL_DIR/bin/edger_dge_table_to_bedgraph $INSTALL_DIR/bin/ $INSTALL_DIR/bin/edger_dge_table_to_bedgraph