Mercurial > repos > yhoogstrate > edger_with_design_matrix
comparison bin/edger_dge_table_to_bedgraph @ 107:049d8bc2214e draft
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools/raw/master/edger_with_design_matrix commit 2700e500a4fb135a20ede7d52221a9d31f1aaa5e-dirty
| author | yhoogstrate |
|---|---|
| date | Tue, 01 Sep 2015 04:32:16 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 106:56117c9db713 | 107:049d8bc2214e |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import re,sys,os,os.path,argparse,textwrap,datetime | |
| 4 | |
class GTF:
    """Minimal GTF parser that indexes feature locations per gene symbol.

    Only rows whose feature type (GTF column 3) is listed in ``features``
    are indexed. ``self.index`` maps each gene symbol to a list of
    ``[chrom, start, end]`` entries; coordinates are kept as strings,
    exactly as read from the file.
    """

    def __init__(self, filename, features=None, symbol="gene_id"):
        """Build the location index from ``filename``.

        :param filename: path to the GTF annotation file
        :param features: feature types to keep (default: ``["exon"]``);
                         a ``None`` sentinel replaces the original mutable
                         default argument
        :param symbol: attribute key in GTF column 9 used as gene identifier;
                       restricted to word characters and dashes because it is
                       embedded verbatim in a regular expression
        :raises ValueError: if ``symbol`` contains unexpected characters
        """
        self.features = ["exon"] if features is None else features
        if re.match(r"^[a-zA-Z0-9_\-]+$", symbol):
            self.symbol = symbol
        else:
            raise ValueError('False symbol matching symbol: ' + str(symbol))
        self.index = {}
        self.parse(filename)

    def parse(self, filename):
        """Read ``filename`` line by line and index each record."""
        with open(filename) as infile:
            for line in infile:
                self.parse_line(line)

    def parse_line(self, line):
        """Index one GTF record if it is one of the requested feature types."""
        columns = line.strip().split("\t")
        # A valid GTF record has exactly 9 tab-separated columns.
        if len(columns) == 9 and columns[2] in self.features:
            gene_id = self.parse_column_9(columns[8])
            if gene_id:
                # setdefault replaces the Python-2-only has_key() idiom.
                self.index.setdefault(gene_id, []).append(
                    [columns[0], columns[3], columns[4]])

    def parse_column_9(self, line):
        """Extract the configured attribute's value from GTF column 9.

        Returns the value with surrounding quotes stripped, or None when
        the attribute is absent.
        """
        m = re.search(self.symbol + r'[ =]+([^ ;]+)', line)
        if m:
            return m.group(1).strip("'").strip('"')
        return None

    def get(self, key):
        """Return the location list for ``key``, or False when unknown."""
        # Catch only KeyError; the original bare `except:` would also have
        # swallowed unrelated failures such as KeyboardInterrupt.
        try:
            return self.index[key]
        except KeyError:
            return False
| 44 | |
| 45 | |
class EdgeR_table:
    """Parse an edgeR differential-expression table and attach the values to
    genomic locations (via a GTF index) for export as bedgraph tracks.

    ``self.index`` is nested as ``index[chrom][start][end] -> [data, ...]``
    where each ``data`` is a dict mapping a 1-based table column number to
    its (string) value.
    """

    def __init__(self, table, gtf, columns=None):
        """Parse ``table`` immediately.

        :param table: path to the edgeR output table (7 tab-separated columns)
        :param gtf: object exposing ``get(gene_id)`` returning a list of
                    ``[chrom, start, end]`` or False (e.g. a GTF instance)
        :param columns: 1-based column numbers to extract (default: [3, 7]);
                        a ``None`` sentinel replaces the original mutable
                        default argument
        """
        self.index = {}
        self.gtf = gtf
        self.columns = [3, 7] if columns is None else columns
        self.parse(table)

    def parse(self, filename):
        """Read the table: first line is validated as header, rest as data."""
        with open(filename) as infile:
            for i, line in enumerate(infile):
                if i == 0:
                    self.parse_header(line)
                else:
                    self.parse_line(line)

    def parse_header(self, line):
        """Validate the edgeR header row.

        :raises ValueError: when any expected column label is missing.
        """
        params = line.strip().split("\t")
        # (column index, expected substring, ordinal used in the message)
        expected = [(1, "genes", "2nd"), (2, "logfc", "3rd"),
                    (3, "logcpm", "4th"), (4, "lr", "5th"),
                    (5, "pvalue", "6th"), (6, "fdr", "7th")]
        for idx, label, ordinal in expected:
            if params[idx].lower().find(label) == -1:
                raise ValueError('False header in file - no "' + label +
                                 '" in ' + ordinal + ' column: ' + line)

    def parse_line(self, line):
        """Store the requested column values for every location of the gene.

        Columns 6 and 7 (PValue, FDR) are stored as ``1 - value`` so that
        more significant genes get a larger bedgraph score.
        """
        fields = line.strip().split("\t")
        if len(fields) != 7:
            return
        gene_id = fields[1].strip('"').strip("'")
        column_data = {}
        for column in self.columns:
            if column in [6, 7]:
                column_data[column] = str(1.0 - float(fields[column - 1]))
            else:
                column_data[column] = fields[column - 1]

        locations = self.gtf.get(gene_id)
        if not locations:
            # print() with one argument behaves the same on Python 2 and 3.
            print("Warning: no location found for gene " + gene_id)
        else:
            for location in locations:
                self.insert(location, column_data)

    def insert(self, location, data):
        """Append ``data`` under ``index[chrom][start][end]``."""
        chrom, start, end = location[0], location[1], location[2]
        # Chained setdefault replaces three Python-2-only has_key() checks.
        self.index.setdefault(chrom, {}) \
                  .setdefault(start, {}) \
                  .setdefault(end, []) \
                  .append(data)

    def export(self, filenames={3: "log_cpm.txt", 7: "fdr.txt"}):
        """Write one bedgraph file per requested column, then create a
        coordinate-sorted copy at ``<name>.sorted`` with the external sort.

        Only the first data dict per location is exported; further entries
        from genes sharing an identical location are ignored.
        ``filenames`` is only read, so its mutable default is harmless,
        and it is kept for interface compatibility.
        """
        for column in self.columns:
            # `with` guarantees the handle is closed before sort reads it.
            with open(filenames[column], "w") as fh:
                for chrom in sorted(self.index.keys()):
                    for start in sorted(self.index[chrom].keys()):
                        for end in sorted(self.index[chrom][start].keys()):
                            fh.write(chrom + "\t" + start + "\t" + end + "\t" +
                                     self.index[chrom][start][end][0][column] + "\n")
            # NOTE(review): relies on GNU sort's -V version sort; filenames
            # are embedded in a shell command and must not contain quotes.
            os.system("sort -k1,1V -k2,2g -k3,3g '" + filenames[column] +
                      "' > '" + filenames[column] + ".sorted'")
| 127 | |
| 128 | |
def remove_overlap_in_bedgraph(bedgraph_file_dirty, bedgraph_file_clean):
    """Resolve overlaps between consecutive records of a sorted bedgraph.

    Expects ``bedgraph_file_dirty`` to be sorted by chromosome, start, end
    (the '.sorted' files produced by EdgeR_table.export). Consecutive
    overlapping records are handled as follows:

    * identical start: the intervals are merged and the scores averaged;
    * partial overlap: the non-overlapping prefix of the buffered record is
      written with its own score, and the remainder (up to the furthest end)
      is re-buffered carrying the newer record's score.

    Non-overlapping records are written through unchanged. The result is
    written to ``bedgraph_file_clean``.
    """
    # `with` ensures both handles are closed even if parsing raises.
    with open(bedgraph_file_clean, "w") as fh:
        buf = False  # pending record, not yet written: [chrom, start, end, score]

        with open(bedgraph_file_dirty, "r") as f:
            for line in f:
                cur = line.strip().split("\t")
                cur[1] = int(cur[1])
                cur[2] = int(cur[2])

                if not buf:
                    buf = cur
                elif cur[0] == buf[0] and cur[1] <= buf[2]:
                    if buf[1] == cur[1]:
                        # Same start: merge into one interval, average scores.
                        buf[3] = (float(buf[3]) + float(cur[3])) / 2
                        buf[2] = cur[2]
                    else:
                        # Partial overlap: emit [buf.start, cur.start - 1]
                        # with the buffered score; keep the remainder with
                        # the newer record's score.
                        split = cur[1]
                        furthest = max(buf[2], cur[2])
                        fh.write(buf[0] + "\t" + str(buf[1]) + "\t" +
                                 str(split - 1) + "\t" + str(buf[3]) + "\n")
                        buf = [buf[0], split, furthest, cur[3]]
                else:
                    # No overlap: flush the buffer and start a new one.
                    fh.write(buf[0] + "\t" + str(buf[1]) + "\t" +
                             str(buf[2]) + "\t" + str(buf[3]) + "\n")
                    buf = cur

        # Flush the final pending record; guard against an empty input file
        # (the original crashed indexing into `buf = False`).
        if buf:
            fh.write(buf[0] + "\t" + str(buf[1]) + "\t" +
                     str(buf[2]) + "\t" + str(buf[3]) + "\n")
| 177 | |
| 178 | |
| 179 | |
| 180 if __name__ == "__main__": | |
| 181 parser = argparse.ArgumentParser() | |
| 182 | |
| 183 parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,epilog="For more info please visit:\n<https://github.com/yhoogstrate/fuma>") | |
| 184 | |
| 185 parser.add_argument("-t",help="CPM table to extract columns from",nargs=1,required=True) | |
| 186 parser.add_argument("-g",help="GTF file used to extract genomic location",nargs=1,required=True) | |
| 187 | |
| 188 parser.add_argument("-c3",help="Output (bedgraph) for column 3 (logFC)",nargs="?",required=False) | |
| 189 parser.add_argument("-c4",help="Output (bedgraph) for column 4 (logCPM)",nargs="?",required=False) | |
| 190 parser.add_argument("-c5",help="Output (bedgraph) for column 5 (LR)",nargs="?",required=False) | |
| 191 parser.add_argument("-c6",help="Output (bedgraph) for column 6 (PValue)",nargs="?",required=False) | |
| 192 parser.add_argument("-c7",help="Output (bedgraph) for column 7 (FDR)",nargs="?",required=False) | |
| 193 | |
| 194 args = parser.parse_args() | |
| 195 | |
| 196 #files = {3:"VCAP_logFC.hg19.bedgraph",7:"VCAP_fdr.hg19.bedgraph"} | |
| 197 files = {} | |
| 198 | |
| 199 if(args.c3): | |
| 200 files[3] = args.c3 | |
| 201 if(args.c4): | |
| 202 files[4] = args.c4 | |
| 203 if(args.c5): | |
| 204 files[5] = args.c5 | |
| 205 if(args.c6): | |
| 206 files[6] = args.c6 | |
| 207 if(args.c7): | |
| 208 files[7] = args.c7 | |
| 209 | |
| 210 print "Parsing GTF file" | |
| 211 g = GTF(args.g[0]) | |
| 212 | |
| 213 print "Parsing EdgeR table" | |
| 214 e = EdgeR_table(args.t[0],g,files.keys()) | |
| 215 | |
| 216 print "Exporting raw bedgraph(s)" | |
| 217 e.export(files) | |
| 218 | |
| 219 print "Removing overlapping entries in bedgraph(s)" | |
| 220 for key in files.keys(): | |
| 221 remove_overlap_in_bedgraph(files[key]+".sorted",files[key]) | |
| 222 os.system("rm '"+files[key]+".sorted'") |
