Mercurial > repos > yhoogstrate > edger_with_design_matrix
changeset 92:86c3aaa205b5 draft
Added bedgraph exporting function for the DGE tables
author | yhoogstrate |
---|---|
date | Fri, 13 Mar 2015 05:45:09 -0400 |
parents | c81da57fff20 |
children | 31335aa52b2e |
files | edgeR_Convert_DGE_Table_to_Bedgraph.xml edger_dge_table_to_bedgraph tool_dependencies.xml |
diffstat | 3 files changed, 310 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/edgeR_Convert_DGE_Table_to_Bedgraph.xml Fri Mar 13 05:45:09 2015 -0400 @@ -0,0 +1,73 @@ +<?xml version="1.0" encoding="UTF-8"?> + <tool id="edger_dge_table_to_bedgraph" name="edgeR: convert 'differentially expressed genes'-table to bedgraph(s)" version="1.0.0"> + <description>EdgeR's "differentially expressed genes" table to bedgraph(s)</description> + + <requirements> + <requirement type="package" version="1.0.0">edger_dge_table_to_bedgraph</requirement> + </requirements> + + <command interpreter="python"> + edger_dge_table_to_bedgraph + -t $cpm_table + -g $geneset + + #if $logfc: + -c3 $logfc + #end if + + #if $logcpm: + -c4 $logcpm + #end if + + #if $lr: + -c5 $lr + #end if + + #if $pvalue: + -c6 $pvalue + #end if + + #if $fdr: + -c7 $fdr + #end if + </command> + + <inputs> + <param format="tabular" name="cpm_table" type="data" label="'differentially expressed genes'-table as result from EdgeR" help="must have 7 columns of which the 2nd are gene names matching the GTF file" /> + <param format="gtf" name="geneset" type="data" label="Geneset used for estimating expression levels prior to expression analysis" /> + + <param name="columns" type="select" label="Desired columns" multiple="true" display="checkboxes"> + <option value="c3" selected="true">logFC</option> + <option value="c4">logCPM</option> + <option value="c5">LR</option> + <option value="c6">PValue</option> + <option value="c7" selected="true">FDR</option> + </param> + </inputs> + + <outputs> + <data fromat="bedgraph" name="logfc" label="logFC from ${cpm_table.name}"> + <filter>"c3" in columns</filter> + </data> + + <data fromat="bedgraph" name="logcpm" label="logCPM from ${cpm_table.name}"> + <filter>"c4" in columns</filter> + </data> + + <data fromat="bedgraph" name="lr" label="LR from ${cpm_table.name}"> + <filter>"c5" in columns</filter> + </data> + + <data fromat="bedgraph" name="pvalue" label="PValue from ${cpm_table.name}"> + <filter>"c6" in columns</filter> + </data> + + <data fromat="bedgraph" name="fdr" label="FDR from ${cpm_table.name}"> + <filter>"c7" in columns</filter> + </data> + </outputs> + + <help> + P-values and FDRs are swapped from 1 to 0, and 0 to 1, because this way the most siginificant genes will obtain the highest values which is convenient for visualisation. + </help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/edger_dge_table_to_bedgraph Fri Mar 13 05:45:09 2015 -0400 @@ -0,0 +1,222 @@ +#!/usr/bin/env python + +import re,sys,os,os.path,argparse,textwrap,datetime + +class GTF: + def __init__(self,filename,features=["exon"],symbol="gene_id"): + self.features = features + if(re.match("^[a-zA-Z0-9_\-]+$",symbol)): + self.symbol = symbol + else: + raise ValueError('False symbol matching symbol: '+str(symbol)) + self.index = {} + self.parse(filename) + + def parse(self,filename): + with open(filename) as infile: + for line in infile: + self.parse_line(line) + + def parse_line(self,line): + line = line.strip().split("\t") + if(len(line) == 9): + if(line[2] in self.features): + gene_id = self.parse_column_9(line[8]) + if(gene_id): + if(not self.index.has_key(gene_id)): + self.index[gene_id] = [] + self.index[gene_id].append([line[0],line[3],line[4]]) + + def parse_column_9(self,line): + query = self.symbol+'[ =]+([^ ;]+)' + m = re.search(query, line) + + if(m): + return m.group(1).strip("'").strip('"') + else: + return None + + def get(self,key): + try: + return self.index[key] + except: + return False + + +class EdgeR_table: + def __init__(self,table,gtf,columns=[3,7]): + self.index = {} + self.gtf = gtf + self.columns = columns + self.parse(table) + + def parse(self,filename): + i = 0 + with open(filename) as infile: + for line in infile: + if(i == 0): + self.parse_header(line) + else: + self.parse_line(line) + i += 1 + + def parse_header(self,line): + params = line.strip().split("\t") + if(params[1].lower().find("genes") == -1): + raise ValueError('False header in file - no "genes" in 2nd colum: '+line) + if(params[2].lower().find("logfc") == -1): + raise ValueError('False header in file - no "logfc" in 3rd colum: '+line) + if(params[3].lower().find("logcpm") == -1): + raise ValueError('False header in file - no "logcpm" in 4th colum: '+line) + if(params[4].lower().find("lr") == -1): + raise ValueError('False header in file - no "lr" in 5th colum: '+line) + if(params[5].lower().find("pvalue") == -1): + raise ValueError('False header in file - no "pvalue" in 6th colum: '+line) + if(params[6].lower().find("fdr") == -1): + raise ValueError('False header in file - no "fdr" in 7th colum: '+line) + + def parse_line(self,line): + line = line.strip().split("\t") + + if(len(line) == 7): + gene_id = line[1].strip('"').strip("'") + column_data = {} + for column in self.columns: + if(column in [6,7]): + column_data[column] = str(1.0 - float(line[column-1])) + else: + column_data[column] = line[column-1] + + locations = self.gtf.get(gene_id) + if(not locations): + print "Warning: no location found for gene "+gene_id + else: + for location in locations: + self.insert(location,column_data) + + def insert(self,location,data): + chrom = location[0] + start = location[1] + end = location[2] + + if(not self.index.has_key(chrom)): + self.index[chrom] = {} + + if(not self.index[chrom].has_key(start)): + self.index[chrom][start] = {} + + if(not self.index[chrom][start].has_key(end)): + self.index[chrom][start][end] = [] + + self.index[chrom][start][end].append(data) + + def export(self,filenames={3:"log_cpm.txt",7:"fdr.txt"}): + for column in self.columns: + fh = open(filenames[column],"w") + + buf = False + + for chrom in sorted(self.index.keys()): + for start in sorted(self.index[chrom].keys()): + for end in sorted(self.index[chrom][start].keys()): + fh.write(chrom+"\t"+start+"\t"+end+"\t"+self.index[chrom][start][end][0][column]+"\n") + + fh.close() + + os.system("sort -k1,1V -k2,2g -k3,3g '"+filenames[column]+"' > '"+filenames[column]+".sorted'") + + +def remove_overlap_in_bedgraph(bedgraph_file_dirty,bedgraph_file_clean): + fh = open(bedgraph_file_clean,"w") + buf = False + + with open(bedgraph_file_dirty,"r") as f: + for line in f: + cur = line.strip().split("\t") + cur[1] = int(cur[1]) + cur[2] = int(cur[2]) + + if(not buf): + buf = cur + else: + if(cur[0] == buf[0] and cur[1] <= buf[2] ): + if(buf[1] == cur[1]): #is subset + newscore = (float(buf[3])+float(cur[3]))/2 + buf[2] = cur[2] + buf[3] = newscore + else: + c1 = buf[1] + c2 = cur[1] + c3 = min(buf[2],cur[2]) + c4 = max(buf[2],cur[2]) + + fh.write(buf[0]+"\t"+str(c1)+"\t"+str(c2-1)+"\t"+str(buf[3])+"\n") + + newscore = (float(buf[3])+float(cur[3]))/2 + #fh.write(buf[0]+"\t"+str(c2+1)+"\t"+str(c3)+"\t"+str(newscore)+"\tp2\n") + #buf = [buf[0], c3+1 , c4 , cur[3]] + + buf = [buf[0], c2 , c4 , cur[3]] + + # find if buf is a subset -> if so, merge and send to buffer + # or find the overlapping region + + # if current is overlapping with buffer; merge: + ## [ ] < buf + ## [ ] < cur + ## + ## [ ] < buf + ## [ ] < cur + ## 111112222333333 << write 1 and 2 and keep 3 in buf + else: + fh.write(buf[0]+"\t"+str(buf[1])+"\t"+str(buf[2])+"\t"+str(buf[3])+"\n") + buf=cur + + fh.write(buf[0]+"\t"+str(buf[1])+"\t"+str(buf[2])+"\t"+str(buf[3])+"\n") + fh.close() + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,epilog="For more info please visit:\n<https://github.com/yhoogstrate/fuma>") + + parser.add_argument("-t",help="CPM table to extract columns from",nargs=1,required=True) + parser.add_argument("-g",help="GTF file used to extract genomic location",nargs=1,required=True) + + parser.add_argument("-c3",help="Output (bedgraph) for column 3 (logFC)",nargs="?",required=False) + parser.add_argument("-c4",help="Output (bedgraph) for column 4 (logCPM)",nargs="?",required=False) + parser.add_argument("-c5",help="Output (bedgraph) for column 5 (LR)",nargs="?",required=False) + parser.add_argument("-c6",help="Output (bedgraph) for column 6 (PValue)",nargs="?",required=False) + parser.add_argument("-c7",help="Output (bedgraph) for column 7 (FDR)",nargs="?",required=False) + + args = parser.parse_args() + + #files = {3:"VCAP_logFC.hg19.bedgraph",7:"VCAP_fdr.hg19.bedgraph"} + files = {} + + if(args.c3): + files[3] = args.c3 + if(args.c4): + files[4] = args.c4 + if(args.c5): + files[5] = args.c5 + if(args.c6): + files[6] = args.c6 + if(args.c7): + files[7] = args.c7 + + print "Parsing GTF file" + g = GTF(args.g[0]) + + print "Parsing EdgeR table" + e = EdgeR_table(args.t[0],g,files.keys()) + + print "Exporting raw bedgraph(s)" + e.export(files) + + print "Removing overlapping entries in bedgraph(s)" + for key in files.keys(): + remove_overlap_in_bedgraph(files[key]+".sorted",files[key]) + os.system("rm '"+files[key]+".sorted'")
--- a/tool_dependencies.xml Mon Mar 09 08:35:13 2015 -0400 +++ b/tool_dependencies.xml Fri Mar 13 05:45:09 2015 -0400 @@ -42,4 +42,19 @@ </actions> </install> </package> + + <package name="edger_dge_table_to_bedgraph" version="1.0.0"> + <install version="1.0"> + <actions> + <action type="shell_command">mkdir $INSTALL_DIR/bin ; cp $REPOSITORY_INSTALL_DIR/edger_dge_table_to_bedgraph $INSTALL_DIR/bin/</action> + <action type="chmod"> + <file mode="755">$INSTALL_DIR/bin/edger_dge_table_to_bedgraph</file> + </action> + <action type="set_environment"> + <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable> + <environment_variable action="prepend_to" name="PATH">$REPOSITORY_INSTALL_DIR</environment_variable> + </action> + </actions> + </install> + </package> </tool_dependency>