Mercurial > repos > yhoogstrate > edger_with_design_matrix
changeset 107:049d8bc2214e draft
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools/raw/master/edger_with_design_matrix commit 2700e500a4fb135a20ede7d52221a9d31f1aaa5e-dirty
author | yhoogstrate |
---|---|
date | Tue, 01 Sep 2015 04:32:16 -0400 |
parents | 56117c9db713 |
children | a02794bb9073 |
files | bin/design_matrix_creator bin/edger_dge_table_to_bedgraph design_matrix_creator edgeR_Differential_Gene_Expression.xml edger_dge_table_to_bedgraph tool_dependencies.xml |
diffstat | 6 files changed, 521 insertions(+), 526 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/design_matrix_creator Tue Sep 01 04:32:16 2015 -0400 @@ -0,0 +1,222 @@ +#!/usr/bin/env python + +import argparse, os, shutil, sys, tempfile, subprocess + +__version_info__ = ('1', '0', '0')#, 'beta') +__version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3])+"-"+__version_info__[3] +__author__ = 'Youri Hoogstrate' +__homepage__ = 'https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools' +__license__ = 'GNU General Public License v3 (GPLv3)' + + +class sampleContainer: + def __init__(self): + self.samples = [] + self.treatments = {} + self.treatment_index = [] + self.treatment_types = {} + + def do_decode(self,encoded_str): + return encoded_str.decode("base64").strip().replace("\t",'') + + def add_samples(self,argument): + print " - Adding samples" + for sample in argument: + self.add_sample(self.do_decode(sample)) + + def add_sample(self,sample): + if(sample in self.samples): + sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n") + sys.exit(1) + else: + self.samples.append(sample) + print " - Added: "+sample + + def add_blocking(self,argument): + print " - Adding paired samples" + pair = [] + for block in argument: + self.add_block(block) + + def add_block(self,blocks): + blocks = blocks.split(":") + as_treatment = blocks[0] + blocks = blocks[1:] + + used_samples = [] + indexed_samples = {} + + for i in range(len(blocks)): + block = blocks[i] + samples = self.get_samples_from_block(block) + indexed_samples[i+1] = [] + for sample in samples: + if(sample in used_samples): + sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n") + sys.exit(0) + else: + indexed_samples[i+1] = block + used_samples.append(sample) + + for sample in self.samples: + if(sample not in used_samples): + i = i + 1 + indexed_samples[i+1] = str(sample).encode('base64').strip() + + for index in indexed_samples.keys(): + key = str(index).encode('base64').strip() + as_treatment += ":"+key+":"+indexed_samples[index] + + self.add_treatment(as_treatment) + + def get_samples_from_block(self,decoded_block): + return [ self.do_decode(x) for x in decoded_block.split(",")] + + def add_treatments(self,argument): + print " - Adding treatments" + for treatment in argument: + self.add_treatment(treatment) + + def add_treatment(self,treatment_argument): + print " - Parsing treatment" + + + treatment_argument = treatment_argument.split(":") + name = self.do_decode(treatment_argument[0]) + treatment_argument = treatment_argument[1:] + + + treatment = {"factor_index":{},"sample_index":{}} + only_integers = True + + i = 1 + for item in treatment_argument: + if(i % 2): + factor = self.do_decode(item) + + if(treatment['factor_index'].has_key(factor)): + sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n") + sys.exit(0) + else: + print " - Adding factor: "+factor + treatment["factor_index"][factor] = [] + if(not factor.isdigit()): + only_integers = False + else: + for sample in item.split(","): + sample = self.do_decode(sample) + + if(not sample in self.samples): + sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n") + sys.exit(0) + + treatment["factor_index"][factor].append(sample) + if(treatment["sample_index"].has_key(sample)): + sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n") + sys.exit(0) + else: + treatment["sample_index"][sample] = factor + i += 1 + + treatment_factors = sorted(treatment["factor_index"].keys()) + + if(name == None): + treatment["name"] = "_vs_".join(treatment_factors) + else: + treatment["name"] = str(name) + + if(len(treatment["sample_index"]) != len(self.samples)): + sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n") + + if(only_integers): + treatment_type = "integer" + else: + treatment_type = "string" + + if(self.treatments.has_key(treatment["name"])): + sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n") + else: + self.treatments[treatment["name"]] = treatment + self.treatment_index.append(treatment["name"]) + self.treatment_types[treatment["name"]] = treatment_type + print " - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid" + + def export(self,output): + # Open file stream + if(args.output == "-"): + fh = sys.stdout + else: + fh = open(args.output,"w") + + # Write header: + fh.write("sample-name\t"+"\t".join(self.treatment_index)+"\n") + + # Write body: + for sample in self.samples: + fh.write(sample) + for treatment_id in self.treatment_index: + treatment = self.treatments[treatment_id] + fh.write("\t"+treatment["sample_index"][sample]) + fh.write("\n") + + fh.close() + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.") + parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True) + parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)') + parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)') + parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True) + parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"') + + args = parser.parse_args() + + columns = None + if(args.columns_file): + with open(args.columns_file, "r") as f: + listed_columns = [None] + f.readline().strip("\n").split("\t") + for i in range(1,len(listed_columns)): + listed_columns[i] = listed_columns[i].encode('base64').replace('\n','') + + s = sampleContainer() + + if(listed_columns): + columns = [] + for sample in args.sample_names: + columns.append(listed_columns[int(sample)]) + + + treatments = [] + for treatment in args.treatments: + treatment = treatment.split(":") + for i in range(1,len(treatment)): + if(i%2 == 0): + treatment_tmp = treatment[i].split(",") + for j in range(len(treatment_tmp)): + treatment_tmp[j] = listed_columns[int(treatment_tmp[j])] + treatment[i] = ",".join(treatment_tmp) + + treatments.append(":".join(treatment)) + + blockings = [] + if(args.blocking): + for blocking in args.blocking: + blocking = blocking.split(":") + for i in range(1,len(blocking)): + block = blocking[i].split(",") + for j in range(len(block)): + block[j] = listed_columns[int(block[j])] + blocking[i] = ",".join(block) + blockings.append(":".join(blocking)) + + s.add_samples(columns) + s.add_treatments(treatments) + s.add_blocking(blockings) + + else: + s.add_samples(args.sample_names) + s.add_treatments(args.treatments) + if(args.blocking): + s.add_blocking(args.blocking) + + s.export(args.output)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/edger_dge_table_to_bedgraph Tue Sep 01 04:32:16 2015 -0400 @@ -0,0 +1,222 @@ +#!/usr/bin/env python + +import re,sys,os,os.path,argparse,textwrap,datetime + +class GTF: + def __init__(self,filename,features=["exon"],symbol="gene_id"): + self.features = features + if(re.match("^[a-zA-Z0-9_\-]+$",symbol)): + self.symbol = symbol + else: + raise ValueError('False symbol matching symbol: '+str(symbol)) + self.index = {} + self.parse(filename) + + def parse(self,filename): + with open(filename) as infile: + for line in infile: + self.parse_line(line) + + def parse_line(self,line): + line = line.strip().split("\t") + if(len(line) == 9): + if(line[2] in self.features): + gene_id = self.parse_column_9(line[8]) + if(gene_id): + if(not self.index.has_key(gene_id)): + self.index[gene_id] = [] + self.index[gene_id].append([line[0],line[3],line[4]]) + + def parse_column_9(self,line): + query = self.symbol+'[ =]+([^ ;]+)' + m = re.search(query, line) + + if(m): + return m.group(1).strip("'").strip('"') + else: + return None + + def get(self,key): + try: + return self.index[key] + except: + return False + + +class EdgeR_table: + def __init__(self,table,gtf,columns=[3,7]): + self.index = {} + self.gtf = gtf + self.columns = columns + self.parse(table) + + def parse(self,filename): + i = 0 + with open(filename) as infile: + for line in infile: + if(i == 0): + self.parse_header(line) + else: + self.parse_line(line) + i += 1 + + def parse_header(self,line): + params = line.strip().split("\t") + if(params[1].lower().find("genes") == -1): + raise ValueError('False header in file - no "genes" in 2nd colum: '+line) + if(params[2].lower().find("logfc") == -1): + raise ValueError('False header in file - no "logfc" in 3rd colum: '+line) + if(params[3].lower().find("logcpm") == -1): + raise ValueError('False header in file - no "logcpm" in 4th colum: '+line) + if(params[4].lower().find("lr") == -1): + raise ValueError('False header in file - no "lr" in 5th colum: '+line) + if(params[5].lower().find("pvalue") == -1): + raise ValueError('False header in file - no "pvalue" in 6th colum: '+line) + if(params[6].lower().find("fdr") == -1): + raise ValueError('False header in file - no "fdr" in 7th colum: '+line) + + def parse_line(self,line): + line = line.strip().split("\t") + + if(len(line) == 7): + gene_id = line[1].strip('"').strip("'") + column_data = {} + for column in self.columns: + if(column in [6,7]): + column_data[column] = str(1.0 - float(line[column-1])) + else: + column_data[column] = line[column-1] + + locations = self.gtf.get(gene_id) + if(not locations): + print "Warning: no location found for gene "+gene_id + else: + for location in locations: + self.insert(location,column_data) + + def insert(self,location,data): + chrom = location[0] + start = location[1] + end = location[2] + + if(not self.index.has_key(chrom)): + self.index[chrom] = {} + + if(not self.index[chrom].has_key(start)): + self.index[chrom][start] = {} + + if(not self.index[chrom][start].has_key(end)): + self.index[chrom][start][end] = [] + + self.index[chrom][start][end].append(data) + + def export(self,filenames={3:"log_cpm.txt",7:"fdr.txt"}): + for column in self.columns: + fh = open(filenames[column],"w") + + buf = False + + for chrom in sorted(self.index.keys()): + for start in sorted(self.index[chrom].keys()): + for end in sorted(self.index[chrom][start].keys()): + fh.write(chrom+"\t"+start+"\t"+end+"\t"+self.index[chrom][start][end][0][column]+"\n") + + fh.close() + + os.system("sort -k1,1V -k2,2g -k3,3g '"+filenames[column]+"' > '"+filenames[column]+".sorted'") + + +def remove_overlap_in_bedgraph(bedgraph_file_dirty,bedgraph_file_clean): + fh = open(bedgraph_file_clean,"w") + buf = False + + with open(bedgraph_file_dirty,"r") as f: + for line in f: + cur = line.strip().split("\t") + cur[1] = int(cur[1]) + cur[2] = int(cur[2]) + + if(not buf): + buf = cur + else: + if(cur[0] == buf[0] and cur[1] <= buf[2] ): + if(buf[1] == cur[1]): #is subset + newscore = (float(buf[3])+float(cur[3]))/2 + buf[2] = cur[2] + buf[3] = newscore + else: + c1 = buf[1] + c2 = cur[1] + c3 = min(buf[2],cur[2]) + c4 = max(buf[2],cur[2]) + + fh.write(buf[0]+"\t"+str(c1)+"\t"+str(c2-1)+"\t"+str(buf[3])+"\n") + + newscore = (float(buf[3])+float(cur[3]))/2 + #fh.write(buf[0]+"\t"+str(c2+1)+"\t"+str(c3)+"\t"+str(newscore)+"\tp2\n") + #buf = [buf[0], c3+1 , c4 , cur[3]] + + buf = [buf[0], c2 , c4 , cur[3]] + + # find if buf is a subset -> if so, merge and send to buffer + # or find the overlapping region + + # if current is overlapping with buffer; merge: + ## [ ] < buf + ## [ ] < cur + ## + ## [ ] < buf + ## [ ] < cur + ## 111112222333333 << write 1 and 2 and keep 3 in buf + else: + fh.write(buf[0]+"\t"+str(buf[1])+"\t"+str(buf[2])+"\t"+str(buf[3])+"\n") + buf=cur + + fh.write(buf[0]+"\t"+str(buf[1])+"\t"+str(buf[2])+"\t"+str(buf[3])+"\n") + fh.close() + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,epilog="For more info please visit:\n<https://github.com/yhoogstrate/fuma>") + + parser.add_argument("-t",help="CPM table to extract columns from",nargs=1,required=True) + parser.add_argument("-g",help="GTF file used to extract genomic location",nargs=1,required=True) + + parser.add_argument("-c3",help="Output (bedgraph) for column 3 (logFC)",nargs="?",required=False) + parser.add_argument("-c4",help="Output (bedgraph) for column 4 (logCPM)",nargs="?",required=False) + parser.add_argument("-c5",help="Output (bedgraph) for column 5 (LR)",nargs="?",required=False) + parser.add_argument("-c6",help="Output (bedgraph) for column 6 (PValue)",nargs="?",required=False) + parser.add_argument("-c7",help="Output (bedgraph) for column 7 (FDR)",nargs="?",required=False) + + args = parser.parse_args() + + #files = {3:"VCAP_logFC.hg19.bedgraph",7:"VCAP_fdr.hg19.bedgraph"} + files = {} + + if(args.c3): + files[3] = args.c3 + if(args.c4): + files[4] = args.c4 + if(args.c5): + files[5] = args.c5 + if(args.c6): + files[6] = args.c6 + if(args.c7): + files[7] = args.c7 + + print "Parsing GTF file" + g = GTF(args.g[0]) + + print "Parsing EdgeR table" + e = EdgeR_table(args.t[0],g,files.keys()) + + print "Exporting raw bedgraph(s)" + e.export(files) + + print "Removing overlapping entries in bedgraph(s)" + for key in files.keys(): + remove_overlap_in_bedgraph(files[key]+".sorted",files[key]) + os.system("rm '"+files[key]+".sorted'")
--- a/design_matrix_creator Tue Sep 01 04:25:37 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,222 +0,0 @@ -#!/usr/bin/env python - -import argparse, os, shutil, sys, tempfile, subprocess - -__version_info__ = ('1', '0', '0')#, 'beta') -__version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3])+"-"+__version_info__[3] -__author__ = 'Youri Hoogstrate' -__homepage__ = 'https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools' -__license__ = 'GNU General Public License v3 (GPLv3)' - - -class sampleContainer: - def __init__(self): - self.samples = [] - self.treatments = {} - self.treatment_index = [] - self.treatment_types = {} - - def do_decode(self,encoded_str): - return encoded_str.decode("base64").strip().replace("\t",'') - - def add_samples(self,argument): - print " - Adding samples" - for sample in argument: - self.add_sample(self.do_decode(sample)) - - def add_sample(self,sample): - if(sample in self.samples): - sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n") - sys.exit(1) - else: - self.samples.append(sample) - print " - Added: "+sample - - def add_blocking(self,argument): - print " - Adding paired samples" - pair = [] - for block in argument: - self.add_block(block) - - def add_block(self,blocks): - blocks = blocks.split(":") - as_treatment = blocks[0] - blocks = blocks[1:] - - used_samples = [] - indexed_samples = {} - - for i in range(len(blocks)): - block = blocks[i] - samples = self.get_samples_from_block(block) - indexed_samples[i+1] = [] - for sample in samples: - if(sample in used_samples): - sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n") - sys.exit(0) - else: - indexed_samples[i+1] = block - used_samples.append(sample) - - for sample in self.samples: - if(sample not in used_samples): - i = i + 1 - indexed_samples[i+1] = str(sample).encode('base64').strip() - - for index in indexed_samples.keys(): - key = str(index).encode('base64').strip() - as_treatment += ":"+key+":"+indexed_samples[index] - - self.add_treatment(as_treatment) - - def get_samples_from_block(self,decoded_block): - return [ self.do_decode(x) for x in decoded_block.split(",")] - - def add_treatments(self,argument): - print " - Adding treatments" - for treatment in argument: - self.add_treatment(treatment) - - def add_treatment(self,treatment_argument): - print " - Parsing treatment" - - - treatment_argument = treatment_argument.split(":") - name = self.do_decode(treatment_argument[0]) - treatment_argument = treatment_argument[1:] - - - treatment = {"factor_index":{},"sample_index":{}} - only_integers = True - - i = 1 - for item in treatment_argument: - if(i % 2): - factor = self.do_decode(item) - - if(treatment['factor_index'].has_key(factor)): - sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n") - sys.exit(0) - else: - print " - Adding factor: "+factor - treatment["factor_index"][factor] = [] - if(not factor.isdigit()): - only_integers = False - else: - for sample in item.split(","): - sample = self.do_decode(sample) - - if(not sample in self.samples): - sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n") - sys.exit(0) - - treatment["factor_index"][factor].append(sample) - if(treatment["sample_index"].has_key(sample)): - sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n") - sys.exit(0) - else: - treatment["sample_index"][sample] = factor - i += 1 - - treatment_factors = sorted(treatment["factor_index"].keys()) - - if(name == None): - treatment["name"] = "_vs_".join(treatment_factors) - else: - treatment["name"] = str(name) - - if(len(treatment["sample_index"]) != len(self.samples)): - sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n") - - if(only_integers): - treatment_type = "integer" - else: - treatment_type = "string" - - if(self.treatments.has_key(treatment["name"])): - sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n") - else: - self.treatments[treatment["name"]] = treatment - self.treatment_index.append(treatment["name"]) - self.treatment_types[treatment["name"]] = treatment_type - print " - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid" - - def export(self,output): - # Open file stream - if(args.output == "-"): - fh = sys.stdout - else: - fh = open(args.output,"w") - - # Write header: - fh.write("sample-name\t"+"\t".join(self.treatment_index)+"\n") - - # Write body: - for sample in self.samples: - fh.write(sample) - for treatment_id in self.treatment_index: - treatment = self.treatments[treatment_id] - fh.write("\t"+treatment["sample_index"][sample]) - fh.write("\n") - - fh.close() - -if __name__=="__main__": - parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.") - parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True) - parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)') - parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)') - parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True) - parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"') - - args = parser.parse_args() - - columns = None - if(args.columns_file): - with open(args.columns_file, "r") as f: - listed_columns = [None] + f.readline().strip("\n").split("\t") - for i in range(1,len(listed_columns)): - listed_columns[i] = listed_columns[i].encode('base64').replace('\n','') - - s = sampleContainer() - - if(listed_columns): - columns = [] - for sample in args.sample_names: - columns.append(listed_columns[int(sample)]) - - - treatments = [] - for treatment in args.treatments: - treatment = treatment.split(":") - for i in range(1,len(treatment)): - if(i%2 == 0): - treatment_tmp = treatment[i].split(",") - for j in range(len(treatment_tmp)): - treatment_tmp[j] = listed_columns[int(treatment_tmp[j])] - treatment[i] = ",".join(treatment_tmp) - - treatments.append(":".join(treatment)) - - blockings = [] - if(args.blocking): - for blocking in args.blocking: - blocking = blocking.split(":") - for i in range(1,len(blocking)): - block = blocking[i].split(",") - for j in range(len(block)): - block[j] = listed_columns[int(block[j])] - blocking[i] = ",".join(block) - blockings.append(":".join(blocking)) - - s.add_samples(columns) - s.add_treatments(treatments) - s.add_blocking(blockings) - - else: - s.add_samples(args.sample_names) - s.add_treatments(args.treatments) - if(args.blocking): - s.add_blocking(args.blocking) - - s.export(args.output)
--- a/edgeR_Differential_Gene_Expression.xml Tue Sep 01 04:25:37 2015 -0400 +++ b/edgeR_Differential_Gene_Expression.xml Tue Sep 01 04:32:16 2015 -0400 @@ -29,11 +29,6 @@ <version_command>echo $(R --version | grep version | grep -v GNU) " , EdgeR version" $(R --vanilla --slave -e "library(edgeR) ; cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null | grep -v -i "WARNING: ")</version_command> <command> - <!-- - The following script is written in the "Cheetah" language: - http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html - --> - R --vanilla --slave -f $R_script '--args $expression_matrix $design_matrix @@ -111,48 +106,47 @@ <configfiles> <configfile name="R_script"> -library(limma,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping -library(edgeR,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping -library(splines,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping +<![CDATA[ + +library(limma,quietly=TRUE) ## quietly to avoid unnecessaity stderr messages +library(edgeR,quietly=TRUE) ## quietly to avoid unnecessaity stderr messages +library(splines,quietly=TRUE)## quietly to avoid unnecessaity stderr messages ## Fetch commandline arguments -args <- commandArgs(trailingOnly = TRUE) +args <- commandArgs(trailingOnly = TRUE) -expression_matrix_file = args[1] -design_matrix_file = args[2] -contrast = args[3] +expression_matrix_file <- args[1] +design_matrix_file <- args[2] +contrast <- args[3] -fdr = args[4] +fdr <- args[4] -output_count_edgeR = args[5] -output_cpm = args[6] +output_count_edgeR <- args[5] +output_cpm <- args[6] -output_xpkm = args[7] ##FPKM file - yet to be implemented +output_xpkm <- args[7] ##FPKM file - to be implemented -output_raw_counts = args[8] -output_MDSplot_logFC = args[9] -output_MDSplot_bcv = args[10] -output_BCVplot = args[11] -output_MAplot = args[12] -output_PValue_distribution_plot = args[13] -output_hierarchical_clustering_plot = args[14] -output_heatmap_plot = args[15] -output_RData_obj = args[16] -output_format_images = args[17] +output_raw_counts <- args[8] +output_MDSplot_logFC <- args[9] +output_MDSplot_bcv <- args[10] +output_BCVplot <- args[11] +output_MAplot <- args[12] +output_PValue_distribution_plot <- args[13] +output_hierarchical_clustering_plot <- args[14] +output_heatmap_plot <- args[15] +output_RData_obj <- args[16] +output_format_images <- args[17] -library(edgeR) -##raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T) ## Obtain read-counts +expression_matrix <- read.delim(expression_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c("")) +design_matrix <- read.delim(design_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c("")) -expression_matrix <- read.delim(expression_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c("")) -design_matrix <- read.delim(design_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c("")) - -colnames(design_matrix) <- make.names(colnames(design_matrix)) +colnames(design_matrix) <- make.names(colnames(design_matrix)) for(i in 1:ncol(design_matrix)) { - old <- design_matrix[,i] - design_matrix[,i] <- make.names(design_matrix[,i]) + old <- design_matrix[,i] + design_matrix[,i] <- make.names(design_matrix[,i]) if(paste(design_matrix[,i],collapse="\t") != paste(old,collapse="\t")) { print("Renaming of factors:") print(old) @@ -160,46 +154,46 @@ print(design_matrix[,i]) } ## The following line seems to malfunction the script: - ##design_matrix[,i] <- as.factor(design_matrix[,i]) + ##design_matrix[,i] <- as.factor(design_matrix[,i]) } ## 1) In the expression matrix, you only want to have the samples described in the design matrix -columns <- match(rownames(design_matrix),colnames(expression_matrix)) -columns <- columns[!is.na(columns)] -read_counts <- expression_matrix[,columns] +columns <- match(rownames(design_matrix),colnames(expression_matrix)) +columns <- columns[!is.na(columns)] +read_counts <- expression_matrix[,columns] ## 2) In the design matrix, you only want to have samples of which you really have the counts -columns <- match(colnames(read_counts),rownames(design_matrix)) -columns <- columns[!is.na(columns)] -design_matrix <- design_matrix[columns,,drop=FALSE] +columns <- match(colnames(read_counts),rownames(design_matrix)) +columns <- columns[!is.na(columns)] +design_matrix <- design_matrix[columns,,drop=FALSE] ## Filter for HTSeq predifined counts: -exclude_HTSeq <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique") -exclude_DEXSeq <- c("_ambiguous","_empty","_lowaqual","_notaligned") +exclude_HTSeq <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique") +exclude_DEXSeq <- c("_ambiguous","_empty","_lowaqual","_notaligned") -exclude <- match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts)) -exclude <- exclude[is.na(exclude)==0] +exclude <- match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts)) +exclude <- exclude[is.na(exclude)==0] if(length(exclude) != 0) { - read_counts <- read_counts[-exclude,] + read_counts <- read_counts[-exclude,] } ## sorting expression matrix with the order of the read_counts -##order <- match(colnames(read_counts) , rownames(design_matrix)) -##read_counts_ordered <- read_counts[,order2] +##order <- match(colnames(read_counts) , rownames(design_matrix)) +##read_counts_ordered <- read_counts[,order2] -empty_samples <- apply(read_counts,2,function(x) sum(x) == 0) +empty_samples <- apply(read_counts,2,function(x) sum(x) == 0) if(sum(empty_samples) > 0) { write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr()) write(colnames(read_counts)[empty_samples],stderr()) } else { - dge <- DGEList(counts=read_counts,genes=rownames(read_counts)) + dge <- DGEList(counts=read_counts,genes=rownames(read_counts)) - formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ") - design_matrix_tmp <- design_matrix - colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp)) - design <- model.matrix(as.formula(formula),design_matrix_tmp) + formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ") + design_matrix_tmp <- design_matrix + colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp)) + design <- model.matrix(as.formula(formula),design_matrix_tmp) rm(design_matrix_tmp) # Filter prefixes @@ -211,18 +205,18 @@ # Do normalization write("Calculating normalization factors...",stdout()) - dge <- calcNormFactors(dge) + dge <- calcNormFactors(dge) write("Estimating common dispersion...",stdout()) - dge <- estimateGLMCommonDisp(dge,design) + dge <- estimateGLMCommonDisp(dge,design) write("Estimating trended dispersion...",stdout()) - dge <- estimateGLMTrendedDisp(dge,design) + dge <- estimateGLMTrendedDisp(dge,design) write("Estimating tagwise dispersion...",stdout()) - dge <- estimateGLMTagwiseDisp(dge,design) + dge <- estimateGLMTagwiseDisp(dge,design) if(output_MDSplot_logFC != "/dev/null") { write("Creating MDS plot (logFC method)",stdout()) - points <- plotMDS.DGEList(dge,top=500,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot + points <- plotMDS.DGEList(dge,top=500,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot dev.off()# Kill it if(output_format_images == "pdf") { @@ -237,7 +231,7 @@ } - diff_x <- abs(max(points\$x)-min(points\$x)) + diff_x <- abs(max(points\$x)-min(points\$x)) diff_y <-(max(points\$y)-min(points\$y)) plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR logFC-MDS Plot on top 500 genes",type="n", xlab="Leading logFC dim 1", ylab="Leading logFC dim 2") points(points\$x,points\$y,pch=20) @@ -252,7 +246,7 @@ ## 1. First create a virtual plot to obtain the desired coordinates pdf("bcvmds.pdf") - points <- plotMDS.DGEList(dge,method="bcv",top=500,labels=rep("",nrow(dge\$samples))) + points <- plotMDS.DGEList(dge,method="bcv",top=500,labels=rep("",nrow(dge\$samples))) dev.off()# Kill it ## 2. Re-plot the coordinates in a new figure with the size and settings. @@ -267,8 +261,8 @@ bitmap(output_MDSplot_bcv,type="png16m",height=14,width=14) } - diff_x <- abs(max(points\$x)-min(points\$x)) - diff_y <-(max(points\$y)-min(points\$y)) + diff_x <- abs(max(points\$x)-min(points\$x)) + diff_y <- (max(points\$y)-min(points\$y)) plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR BCV-MDS Plot",type="n", xlab="Leading BCV dim 1", ylab="Leading BCV dim 2") points(points\$x,points\$y,pch=20) text(points\$x, points\$y,rownames(dge\$samples),cex=1.25,col="gray",pos=4) @@ -298,13 +292,13 @@ write("Fitting GLM...",stdout()) - fit <- glmFit(dge,design) + fit <- glmFit(dge,design) write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout()) - cont <- c(contrast) - cont <- makeContrasts(contrasts=cont, levels=design) + cont <- c(contrast) + cont <- makeContrasts(contrasts=cont, levels=design) - lrt <- glmLRT(fit, contrast=cont[,1]) + lrt <- glmLRT(fit, contrast=cont[,1]) write(paste("Exporting to file: ",output_count_edgeR,sep=""),stdout()) write.table(file=output_count_edgeR,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=TRUE,col.names=NA) write.table(file=output_cpm,cpm(dge,normalized.lib.sizes=TRUE),sep="\t",row.names=TRUE,col.names=NA) @@ -313,8 +307,8 @@ write.table(file=output_raw_counts,dge\$counts,sep="\t",row.names=TRUE,col.names=NA) if(output_MAplot != "/dev/null" || output_PValue_distribution_plot != "/dev/null") { - etable <- topTags(lrt, n=nrow(dge))\$table - etable <- etable[order(etable\$FDR), ] + etable <- topTags(lrt, n=nrow(dge))\$table + etable <- etable[order(etable\$FDR), ] if(output_MAplot != "/dev/null") { write("Creating MA plot...",stdout()) @@ -350,16 +344,16 @@ bitmap(output_PValue_distribution_plot,type="png16m",width=14,height=14) } - expressed_genes <- subset(etable, PValue < 0.99) - h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)") - center <- sum(h\$counts) / length(h\$counts) + expressed_genes <- subset(etable, PValue < 0.99) + h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)") + center <- sum(h\$counts) / length(h\$counts) lines(c(0,1),c(center,center),lty=2,col="red",lwd=2) - k <- ksmooth(h\$mid, h\$counts) + k <- ksmooth(h\$mid, h\$counts) lines(k\$x,k\$y,col="red",lwd=2) - rmsd <- (h\$counts) - center - rmsd <- rmsd^2 - rmsd <- sum(rmsd) - rmsd <- sqrt(rmsd) + rmsd <- (h\$counts) - center + rmsd <- rmsd^2 + rmsd <- sum(rmsd) + rmsd <- sqrt(rmsd) text(0,max(h\$counts),paste("e=",round(rmsd,2),sep=""),pos=4,col="blue") ## change e into epsilon somehow dev.off() @@ -379,9 +373,9 @@ bitmap(output_heatmap_plot,type="png16m",width=10.5) } - etable2 <- topTags(lrt, n=100)\$table - order <- rownames(etable2) - cpm_sub <- cpm(dge,normalized.lib.sizes=TRUE,log=TRUE)[as.numeric(order),] + etable2 <- topTags(lrt, n=100)\$table + order <- rownames(etable2) + cpm_sub <- cpm(dge,normalized.lib.sizes=TRUE,log=TRUE)[as.numeric(order),] heatmap(t(cpm_sub)) dev.off() } @@ -394,6 +388,7 @@ write("Done!",stdout()) } +]]> </configfile> </configfiles>
--- a/edger_dge_table_to_bedgraph Tue Sep 01 04:25:37 2015 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,222 +0,0 @@ -#!/usr/bin/env python - -import re,sys,os,os.path,argparse,textwrap,datetime - -class GTF: - def __init__(self,filename,features=["exon"],symbol="gene_id"): - self.features = features - if(re.match("^[a-zA-Z0-9_\-]+$",symbol)): - self.symbol = symbol - else: - raise ValueError('False symbol matching symbol: '+str(symbol)) - self.index = {} - self.parse(filename) - - def parse(self,filename): - with open(filename) as infile: - for line in infile: - self.parse_line(line) - - def parse_line(self,line): - line = line.strip().split("\t") - if(len(line) == 9): - if(line[2] in self.features): - gene_id = self.parse_column_9(line[8]) - if(gene_id): - if(not self.index.has_key(gene_id)): - self.index[gene_id] = [] - self.index[gene_id].append([line[0],line[3],line[4]]) - - def parse_column_9(self,line): - query = self.symbol+'[ =]+([^ ;]+)' - m = re.search(query, line) - - if(m): - return m.group(1).strip("'").strip('"') - else: - return None - - def get(self,key): - try: - return self.index[key] - except: - return False - - -class EdgeR_table: - def __init__(self,table,gtf,columns=[3,7]): - self.index = {} - self.gtf = gtf - self.columns = columns - self.parse(table) - - def parse(self,filename): - i = 0 - with open(filename) as infile: - for line in infile: - if(i == 0): - self.parse_header(line) - else: - self.parse_line(line) - i += 1 - - def parse_header(self,line): - params = line.strip().split("\t") - if(params[1].lower().find("genes") == -1): - raise ValueError('False header in file - no "genes" in 2nd colum: '+line) - if(params[2].lower().find("logfc") == -1): - raise ValueError('False header in file - no "logfc" in 3rd colum: '+line) - if(params[3].lower().find("logcpm") == -1): - raise ValueError('False header in file - no "logcpm" in 4th colum: '+line) - if(params[4].lower().find("lr") == -1): - raise ValueError('False header in file - no "lr" in 5th colum: '+line) - if(params[5].lower().find("pvalue") == -1): - raise ValueError('False header in file - no "pvalue" in 6th colum: '+line) - if(params[6].lower().find("fdr") == -1): - raise ValueError('False header in file - no "fdr" in 7th colum: '+line) - - def parse_line(self,line): - line = line.strip().split("\t") - - if(len(line) == 7): - gene_id = line[1].strip('"').strip("'") - column_data = {} - for column in self.columns: - if(column in [6,7]): - column_data[column] = str(1.0 - float(line[column-1])) - else: - column_data[column] = line[column-1] - - locations = self.gtf.get(gene_id) - if(not locations): - print "Warning: no location found for gene "+gene_id - else: - for location in locations: - self.insert(location,column_data) - - def insert(self,location,data): - chrom = location[0] - start = location[1] - end = location[2] - - if(not self.index.has_key(chrom)): - self.index[chrom] = {} - - if(not self.index[chrom].has_key(start)): - self.index[chrom][start] = {} - - if(not self.index[chrom][start].has_key(end)): - self.index[chrom][start][end] = [] - - self.index[chrom][start][end].append(data) - - def export(self,filenames={3:"log_cpm.txt",7:"fdr.txt"}): - for column in self.columns: - fh = open(filenames[column],"w") - - buf = False - - for chrom in sorted(self.index.keys()): - for start in sorted(self.index[chrom].keys()): - for end in sorted(self.index[chrom][start].keys()): - fh.write(chrom+"\t"+start+"\t"+end+"\t"+self.index[chrom][start][end][0][column]+"\n") - - fh.close() - - os.system("sort -k1,1V -k2,2g -k3,3g '"+filenames[column]+"' > '"+filenames[column]+".sorted'") - - -def remove_overlap_in_bedgraph(bedgraph_file_dirty,bedgraph_file_clean): - fh = open(bedgraph_file_clean,"w") - buf = False - - with open(bedgraph_file_dirty,"r") as f: - for line in f: - cur = line.strip().split("\t") - cur[1] = int(cur[1]) - cur[2] = int(cur[2]) - - if(not buf): - buf = cur - else: - if(cur[0] == buf[0] and cur[1] <= buf[2] ): - if(buf[1] == cur[1]): #is subset - newscore = (float(buf[3])+float(cur[3]))/2 - buf[2] = cur[2] - buf[3] = newscore - else: - c1 = buf[1] - c2 = cur[1] - c3 = min(buf[2],cur[2]) - c4 = max(buf[2],cur[2]) - - fh.write(buf[0]+"\t"+str(c1)+"\t"+str(c2-1)+"\t"+str(buf[3])+"\n") - - newscore = (float(buf[3])+float(cur[3]))/2 - #fh.write(buf[0]+"\t"+str(c2+1)+"\t"+str(c3)+"\t"+str(newscore)+"\tp2\n") - #buf = [buf[0], c3+1 , c4 , cur[3]] - - buf = [buf[0], c2 , c4 , cur[3]] - - # find if buf is a subset -> if so, merge and send to buffer - # or find the overlapping region - - # if current is overlapping with buffer; merge: - ## [ ] < buf - ## [ ] < cur - ## - ## [ ] < buf - ## [ ] < cur - ## 111112222333333 << write 1 and 2 and keep 3 in buf - else: - fh.write(buf[0]+"\t"+str(buf[1])+"\t"+str(buf[2])+"\t"+str(buf[3])+"\n") - buf=cur - - fh.write(buf[0]+"\t"+str(buf[1])+"\t"+str(buf[2])+"\t"+str(buf[3])+"\n") - fh.close() - - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,epilog="For more info please visit:\n<https://github.com/yhoogstrate/fuma>") - - parser.add_argument("-t",help="CPM table to extract columns from",nargs=1,required=True) - parser.add_argument("-g",help="GTF file used to extract genomic location",nargs=1,required=True) - - parser.add_argument("-c3",help="Output (bedgraph) for column 3 (logFC)",nargs="?",required=False) - parser.add_argument("-c4",help="Output (bedgraph) for column 4 (logCPM)",nargs="?",required=False) - parser.add_argument("-c5",help="Output (bedgraph) for column 5 (LR)",nargs="?",required=False) - parser.add_argument("-c6",help="Output (bedgraph) for column 6 (PValue)",nargs="?",required=False) - parser.add_argument("-c7",help="Output (bedgraph) for column 7 (FDR)",nargs="?",required=False) - - args = parser.parse_args() - - #files = {3:"VCAP_logFC.hg19.bedgraph",7:"VCAP_fdr.hg19.bedgraph"} - files = {} - - if(args.c3): - files[3] = args.c3 - if(args.c4): - files[4] = args.c4 - if(args.c5): - files[5] = args.c5 - if(args.c6): - files[6] = args.c6 - if(args.c7): - files[7] = args.c7 - - print "Parsing GTF file" - g = GTF(args.g[0]) - - print "Parsing EdgeR table" - e = EdgeR_table(args.t[0],g,files.keys()) - - print "Exporting raw bedgraph(s)" - e.export(files) - - print "Removing overlapping entries in bedgraph(s)" - for key in files.keys(): - remove_overlap_in_bedgraph(files[key]+".sorted",files[key]) - os.system("rm '"+files[key]+".sorted'")
--- a/tool_dependencies.xml Tue Sep 01 04:25:37 2015 -0400 +++ b/tool_dependencies.xml Tue Sep 01 04:32:16 2015 -0400 @@ -7,7 +7,7 @@ <package name="design_matrix_creator" version="1.0.0"> <install version="1.0"> <actions> - <action type="shell_command">ls -als ; echo "###############" ; ls -als $REPOSITORY_INSTALL_DIR ; pwd ; mkdir $INSTALL_DIR/bin ; cp $REPOSITORY_INSTALL_DIR/design_matrix_creator $INSTALL_DIR/bin/</action> + <action type="shell_command">mkdir $INSTALL_DIR/bin ; cp $REPOSITORY_INSTALL_DIR/bin/design_matrix_creator $INSTALL_DIR/bin/</action> <action type="chmod"> <file mode="755">$INSTALL_DIR/bin/design_matrix_creator</file> </action> @@ -22,7 +22,7 @@ <package name="edger_dge_table_to_bedgraph" version="1.0.0"> <install version="1.0"> <actions> - <action type="shell_command">mkdir $INSTALL_DIR/bin ; cp $REPOSITORY_INSTALL_DIR/edger_dge_table_to_bedgraph $INSTALL_DIR/bin/</action> + <action type="shell_command">mkdir $INSTALL_DIR/bin ; cp $REPOSITORY_INSTALL_DIR/bin/edger_dge_table_to_bedgraph $INSTALL_DIR/bin/</action> <action type="chmod"> <file mode="755">$INSTALL_DIR/bin/edger_dge_table_to_bedgraph</file> </action>