comparison rgGSEA/rgGSEA.py @ 2:71fa159646c9 draft

Uploaded
author fubar
date Sun, 09 Jun 2013 02:01:13 -0400
parents
children 89e89b70a867
comparison
equal deleted inserted replaced
1:f24f30eae819 2:71fa159646c9
1 """
2 April 2013
3 eeesh GSEA does NOT respect the mode flag!
4
5 Now realise that the creation of the input rank file for gsea needs to take the lowest p value for duplicate
6 feature names. To make Ish's life easier, remove duplicate gene ids from any gene set to stop GSEA from
7 barfing.
8
9 October 14 2012
10 Amazingly long time to figure out that GSEA fails with useless error message if any filename contains a dash "-"
11 eesh.
12
13 Added history .gmt source - requires passing a faked name to gsea
14 Wrapper for GSEA http://www.broadinstitute.org/gsea/index.jsp
15 Started Feb 22
16 Copyright 2012 Ross Lazarus
17 All rights reserved
18 Licensed under the LGPL
19
20 called eg as
21
22 #!/bin/sh
23 GALAXY_LIB="/data/extended/galaxy/lib"
24 if [ "$GALAXY_LIB" != "None" ]; then
25 if [ -n "$PYTHONPATH" ]; then
26 PYTHONPATH="$GALAXY_LIB:$PYTHONPATH"
27 else
28 PYTHONPATH="$GALAXY_LIB"
29 fi
30 export PYTHONPATH
31 fi
32
33 cd /data/extended/galaxy/database/job_working_directory/027/27311
34 python /data/extended/galaxy/tools/rgenetics/rgGSEA.py --input_tab "/data/extended/galaxy/database/files/033/dataset_33806.dat" --adjpvalcol "5" --signcol "2"
35 --idcol "1" --outhtml "/data/extended/galaxy/database/files/034/dataset_34455.dat" --input_name "actaearly-Controlearly-actalate-Controllate_topTable.xls"
36 --setMax "500" --setMin "15" --nPerm "1000" --plotTop "20"
37 --gsea_jar "/data/extended/galaxy/tool-data/shared/jars/gsea2-2.0.12.jar"
38 --output_dir "/data/extended/galaxy/database/job_working_directory/027/27311/dataset_34455_files" --mode "Max_probe"
39 --title " actaearly-Controlearly-actalate-Controllate_interpro_GSEA" --builtin_gmt "/data/genomes/gsea/3.1/IPR_DOMAIN.gmt"
40
41
42 """
43 import optparse
44 import tempfile
45 import os
46 import sys
47 import subprocess
48 import time
49 import shutil
50 import glob
51 import math
52 import re
53
54 KEEPSELECTION = False # detailed records for selection of multiple probes
55
56 def timenow():
57 """return current time as a string
58 """
59 return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
60
61
62
63 def fix_subdir(adir,destdir):
64 """ Galaxy wants everything in the same files_dir
65 if os.path.exists(adir):
66 for (d,dirs,files) in os.path.walk(adir):
67 for f in files:
68 sauce = os.path.join(d,f)
69 shutil.copy(sauce,destdir)
70 """
71
72 def fixAffycrap(apath=''):
73 """class='richTable'>RUNNING ES</th><th class='richTable'>CORE ENRICHMENT</th><tr><td class='lessen'>1</td>
74 <td><a href='https://www.affymetrix.com/LinkServlet?probeset=LBR'>LBR</a></td><td></td><td></td><td>1113</td>
75 <td>0.194</td><td>-0.1065</td><td>No</td></tr><tr><td class='lessen'>2</td><td>
76 <a href='https://www.affymetrix.com/LinkServlet?probeset=GGPS1'>GGPS1</a></td><td></td><td></td><td>4309</td><td>0.014</td><td>-0.4328</td>
77 <td>No</td></tr>
78 """
79 html = []
80 try:
81 html = open(apath,'r').readlines()
82 except:
83 return html
84 for i,row in enumerate(html):
85 row = re.sub('https\:\/\/www.affymetrix.com\/LinkServlet\?probeset=',"http://www.genecards.org/index.php?path=/Search/keyword/",row)
86 html[i] = row
87 return html
88
89 cleanup = False
90 if os.path.exists(adir):
91 flist = os.listdir(adir) # get all files created
92 for f in flist:
93 apath = os.path.join(adir,f)
94 dest = os.path.join(destdir,f)
95 if not os.path.isdir(apath):
96 if os.path.splitext(f)[1].lower() == '.html':
97 html = fixAffycrap(apath)
98 fixed = open(apath,'w')
99 fixed.write('\n'.join(html))
100 fixed.write('\n')
101 fixed.close()
102 if not os.path.isfile(dest):
103 shutil.copy(apath,dest)
104 else:
105 fix_subdir(apath,destdir)
106 if cleanup:
107 try:
108 shutil.rmtree(path=adir,ignore_errors=True)
109 except:
110 pass
111
112
113
114 def getFileString(fpath, outpath):
115 """
116 format a nice file size string
117 """
118 size = ''
119 fp = os.path.join(outpath, fpath)
120 s = fpath
121 if os.path.isfile(fp):
122 n = float(os.path.getsize(fp))
123 if n > 2**20:
124 size = ' (%1.1f MB)' % (n/2**20)
125 elif n > 2**10:
126 size = ' (%1.1f KB)' % (n/2**10)
127 elif n > 0:
128 size = ' (%d B)' % (int(n))
129 s = '%s %s' % (fpath, size)
130 return s
131
132 class gsea_wrapper:
133 """
134 GSEA java desktop client has a CL interface. CL can be gleaned by clicking the 'command line' button after setting up an analysis
135 We don't want gsea to do the analysis but it can read .rnk files containing rows of identifiers and an evidence weight such as the signed t statistic from limma for differential expression
136 (vgalaxy)rlazarus@iaas1:~/public_html/idle_illumina_analysis$ cat gseaHumanREFSEQ.sh
137 #!/bin/bash
138 for RNK in `ls *.rnk`
139 do
140 DIRNAME=${RNK%.*}
141 echo $DIRNAME
142 qsub -cwd -b y java -Xmx4096m -cp /data/app/bin/gsea2-2.07.jar xtools.gsea.GseaPreranked -gmx ../msigdb.v3.0.symbols.gmt -collapse true -mode Max_probe -norm meandiv
143 -nperm 1000 -rnk $RNK -scoring_scheme weighted -rpt_label $RNK -chip ../RefSeq_human.chip -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp
144 -set_max 500 -set_min 15 -zip_report false -out gseaout/${DIRNAME} -gui false
145 done
146 """
147
148 def __init__(self,myName=None,opts=None):
149 """ setup cl for gsea
150 """
151 self.idcol = 0
152 self.signcol = 0
153 self.adjpvalcol = 0
154 self.progname=myName
155 self.opts = opts
156 remove_duplicates=True
157 if not os.path.isdir(opts.output_dir):
158 try:
159 os.makedirs(opts.output_dir)
160 except:
161 print >> sys.stderr,'##Error: GSEA wrapper unable to create or find output directory %s. Stopping' % (opts.output_dir)
162 sys.exit(1)
163 fakeGMT = re.sub('[^a-zA-Z0-9_]+', '', opts.input_name) # gives a more useful title for the GSEA report
164 fakeGMT = os.path.join(opts.output_dir,fakeGMT)
165 fakeRanks = '%s.rnk' % fakeGMT
166 if not fakeGMT.endswith('.gmt'):
167 fakeGMT = '%s.gmt' % fakeGMT
168 if opts.builtin_gmt and opts.history_gmt:
169 newfile = open(fakeGMT,'w')
170 subprocess.call(['cat',opts.builtin_gmt,opts.history_gmt],stdout=newfile)
171 newfile.close()
172 elif opts.history_gmt:
173 subprocess.call(['cp',opts.history_gmt,fakeGMT])
174 else:
175 subprocess.call(['cp',opts.builtin_gmt,fakeGMT])
176 # remove dupes from each gene set
177 gmt = open(fakeGMT,'r').readlines()
178 gmt = [x for x in gmt if len(x.split('\t')) > 3]
179 ugmt = []
180 for i,row in enumerate(gmt):
181 rows = row.rstrip().split('\t')
182 gmtname = rows[0]
183 gmtcomment = rows[1]
184 glist = list(set(rows[2:]))
185 newgmt = [gmtname,gmtcomment]
186 newgmt += glist
187 ugmt.append('\t'.join(newgmt))
188 gmt = open(fakeGMT,'w')
189 gmt.write('\n'.join(ugmt))
190 gmt.write('\n')
191 gmt.close()
192 if opts.input_ranks:
193 infname = opts.input_ranks
194 rdat = open(opts.input_ranks,'r').readlines() # suck in and remove blank ids that cause gsea to barf rml april 10 2012
195 rdat = [x.rstrip().split('\t') for x in rdat[1:]] # ignore head
196 dat = [[x[0],x[1],x[1]] for x in rdat]
197 # fake same structure as input tabular file
198 try:
199 pvals = [float(x[1]) for x in dat]
200 signs = [float(x[1]) for x in dat]
201 except:
202 print >> sys.stderr, '## error converting floating point - cannot process this input'
203 sys.exit(99)
204 else: # read tabular
205 self.idcol = int(opts.idcol) - 1
206 self.signcol = int(opts.signcol) - 1
207 self.adjpvalcol = int(opts.adjpvalcol) - 1
208 maxcol = max(self.idcol,self.signcol,self.adjpvalcol)
209 infname = opts.input_tab
210 indat = open(opts.input_tab,'r').readlines()
211 dat = [x.rstrip().split('\t') for x in indat[1:]]
212 dat = [x for x in dat if len(x) > maxcol]
213 dat = [[x[self.idcol],x[self.adjpvalcol],x[self.signcol]] for x in dat] # reduce to rank form
214 pvals = [float(x[1]) for x in dat]
215 outofrange = [x for x in pvals if ((x < 0.0) or (x > 1.0))]
216 assert len(outofrange) == 0, '## p values outside 0-1 encountered - was that the right column for adjusted p value?'
217 signs = [float(x[2]) for x in dat]
218 outofrange = [i for i,x in enumerate(signs) if (not x) and (dat[i][self.signcol] <> '0')]
219 bad = [dat[x][2] for x in outofrange]
220 assert len(outofrange) == 0, '## null numeric values encountered for sign - was that the right column? %s' % bad
221 ids = [x[0] for x in dat]
222 res = []
223 self.comments = []
224 useme = []
225 for i,row in enumerate(dat):
226 if row[1].upper() != 'NA' and row[2].upper() != 'NA' and row[0] != '' :
227 useme.append(i)
228 lost = len(dat) - len(useme)
229 if lost <> 0:
230 newdat = [dat[x] for x in useme]
231 del dat
232 dat = newdat
233 print >> sys.stdout, '## %d lost - NA values or null id' % lost
234 if remove_duplicates:
235 uids = list(set(ids)) # complex procedure to get min pval for each unique id
236 if len(uids) <> len(ids): # dupes - deal with mode
237 print >> sys.stdout,'## Dealing with %d uids in %d ids' % (len(uids),len(ids))
238 ures = {}
239 for i,id in enumerate(ids):
240 p = pvals[i]
241 ures.setdefault(id,[])
242 ures[id].append((p,signs[i]))
243 for id in uids:
244 tlist = ures[id]
245 tp = [x[0] for x in tlist]
246 ts = [x[1] for x in tlist]
247 if len(tp) == 1:
248 p = tp[0]
249 sign = ts[0]
250 else:
251 if opts.mode == "Max_probe":
252 p = min(tp)
253 sign = ts[tp.index(p)]
254 else: # guess median - too bad if even count
255 tp.sort()
256 ltp = len(tp)
257 ind = ltp/2 # yes, this is wrong for evens but what if sign wobbles?
258 if ltp % 2 == 1: # odd
259 ind += 1 # take the median
260 p = tp[ind]
261 sign = ts[ind]
262 if KEEPSELECTION:
263 self.comments.append('## for id=%s, got tp=%s, ts=%s, chose p=%f, sign =%f'\
264 % (id,str(tp),str(ts),p,sign))
265 if opts.input_ranks: # must be a rank file
266 res.append((id,'%f' % p))
267 else:
268 if p == 0.0:
269 p = 1e-99
270 try:
271 lp = -math.log10(p) # large positive if low p value
272 except ValueError:
273 lp = 0.0
274 if sign < 0:
275 lp = -lp # if negative, swap p to negative
276 res.append((id,'%f' % lp))
277 else: # no duplicates
278 for i,row in enumerate(dat):
279 (id,p,sign) = (row[0],float(row[1]),float(row[2]))
280 if opts.input_ranks: # must be a rank file
281 res.append((id,'%f' % p))
282 else:
283 if p == 0.0:
284 p = 1e-99
285 try:
286 lp = -math.log10(p) # large positive if low p value
287 except ValueError:
288 lp = 0.0
289 if sign < 0:
290 lp = -lp # if negative, swap p to negative
291 res.append((id,'%f' % lp))
292 else:
293 for i,row in enumerate(dat):
294 (id,p,sign) = (row[0],float(row[1]),float(row[2]))
295 if opts.input_ranks: # must be a rank file
296 res.append((id,'%f' % p))
297 else:
298 if p == 0.0:
299 p = 1e-99
300 try:
301 lp = -math.log10(p) # large positive if low p value
302 except ValueError:
303 lp = 0.0
304 if sign < 0:
305 lp = -lp # if negative, swap p to negative
306 res.append((id,'%f' % lp))
307 len1 = len(ids)
308 ranks = res # [x for x in res if len(x[0]) > 0 and len(x[1].strip()) > 0 and x[0].upper() <> 'NA']
309 # rd = dict(zip([x[0] for x in res],res)) # this does something mysterious to dupes - probably keeps last one
310 # ranks = rd.values()
311 len2 = len(ranks)
312 delta = len1 - len2
313 if delta <> 0:
314 print >> sys.stdout,'NOTE: %d of %d rank input file %s rows deleted - dup, null or NA IDs, pvals or logFCs' % (delta,len1,infname)
315 ranks = [(float(x[1]),x) for x in ranks] # decorate
316 ranks.sort()
317 ranks.reverse()
318 ranks = [x[1] for x in ranks] # undecorate
319 if opts.chip == '': # if mouse - need HUGO
320 ranks = [[x[0].upper(),x[1]] for x in ranks]
321 print >> sys.stdout, '## Fixed any lower case - now have',','.join([x[0] for x in ranks[:5]])
322 ranks = ['\t'.join(x) for x in ranks]
323 if len(ranks) < 2:
324 print >> sys.stderr,'Input %s has 1 or less rows with two tab delimited fields - please check the tool documentation' % infname
325 sys.exit(1)
326 rclean = open(fakeRanks,'w')
327 rclean.write('contig\tscore\n')
328 rclean.write('\n'.join(ranks))
329 rclean.write('\n')
330 rclean.close()
331 cl = []
332 a = cl.append
333 a('java -Xmx6G -cp')
334 a(opts.gsea_jar)
335 a('xtools.gsea.GseaPreranked')
336 a('-gmx %s' % fakeGMT) # ensure .gmt extension as required by GSEA - gene sets to use
337 a('-gui false') # use preranked file mode and no gui
338 a('-make_sets true -rnd_seed timestamp') # more things from the GUI command line display
339 a('-norm meandiv -zip_report false -scoring_scheme weighted') # ? need to set these?
340 a('-rnk %s' % fakeRanks) # input ranks file symbol (the chip file is the crosswalk for ids in first column)
341 a('-out %s' % opts.output_dir)
342 a('-set_max %s' % opts.setMax)
343 a('-set_min %s' % opts.setMin)
344 a('-mode %s' % opts.mode)
345 if opts.chip > '':
346 #a('-chip %s -collapse true -include_only_symbols true' % opts.chip)
347 a('-chip %s -collapse true' % opts.chip)
348 else:
349 a("-collapse false") # needed if no chip
350 a('-nperm %s' % opts.nPerm)
351 a('-rpt_label %s' % opts.title)
352 a('-plot_top_x %s' % opts.plotTop)
353 self.cl = cl
354 self.comments.append('## GSEA command line:')
355 self.comments.append(' '.join(self.cl))
356 self.fakeRanks = fakeRanks
357 self.fakeGMT = fakeGMT
358
359 def grepIds(self):
360 """
361 """
362 found = []
363 allids = open(self.opts.input_ranks,'r').readlines()
364 allids = [x.strip().split() for x in allids]
365 allids = [x[0] for x in allids] # list of ids
366 gmtpath = os.path.split(self.opts.fakeGMT)[0] # get path to all chip
367
368 def run(self):
369 """
370
371 """
372 tlog = os.path.join(self.opts.output_dir,"gsea_runner.log")
373 sto = open(tlog,'w')
374 x = subprocess.Popen(' '.join(self.cl),shell=True,stdout=sto,stderr=sto,cwd=self.opts.output_dir)
375 retval = x.wait()
376 sto.close()
377 d = glob.glob(os.path.join(self.opts.output_dir,'%s*' % self.opts.title))
378 if len(d) > 0:
379 fix_subdir(d[0],self.opts.output_dir)
380 htmlfname = os.path.join(self.opts.output_dir,'index.html')
381 try:
382 html = open(htmlfname,'r').readlines()
383 html = [x.strip() for x in html if len(x.strip()) > 0]
384 if len(self.comments) > 0:
385 s = ['<pre>']
386 s += self.comments
387 s.append('</pre>')
388 try:
389 i = html.index('<div id="footer">')
390 except:
391 i = len(html) - 7 # fudge
392 html = html[:i] + s + html[i:]
393 except:
394 html = []
395 htmlhead = '<html><head></head><body>'
396 html.append('## Galaxy GSEA wrapper failure')
397 html.append('## Unable to find index.html in %s - listdir=%s' % (d,' '.join(os.listdir(self.opts.output_dir))))
398 html.append('## Command line was %s' % (' '.join(self.cl)))
399 html.append('## commonly caused by mismatched ID/chip selection')
400 glog = open(os.path.join(self.opts.output_dir,'gsea_runner.log'),'r').readlines()
401 html.append('## gsea_runner.log=%s' % '\n'.join(glog))
402 #tryme = self.grepIds()
403 retval = 1
404 print >> sys.stderr,'\n'.join(html)
405 html = ['%s<br/>' % x for x in html]
406 html.insert(0,htmlhead)
407 html.append('</body></html>')
408 htmlf = file(self.opts.outhtml,'w')
409 htmlf.write('\n'.join(html))
410 htmlf.write('\n')
411 htmlf.close()
412 os.unlink(self.fakeRanks)
413 os.unlink(self.fakeGMT)
414 if opts.outtab_neg:
415 tabs = glob.glob(os.path.join(opts.output_dir,"gsea_report_for_*.xls"))
416 if len(tabs) > 0:
417 for tabi,t in enumerate(tabs):
418 tkind = os.path.basename(t).split('_')[4].lower()
419 if tkind == 'neg':
420 outtab = opts.outtab_neg
421 elif tkind == 'pos':
422 outtab = opts.outtab_pos
423 else:
424 print >> sys.stderr, '## tab file matched %s which is not "neg" or "pos" in 4th segment %s' % (t,tkind)
425 sys.exit()
426 content = open(t).readlines()
427 tabf = open(outtab,'w')
428 tabf.write(''.join(content))
429 tabf.close()
430 else:
431 print >> sys.stdout, 'Odd, maketab = %s but no matches - tabs = %s' % (makeTab,tabs)
432 return retval
433
434
435 if __name__ == "__main__":
436 """
437 called as:
438 <command interpreter="python">rgGSEA.py --input_ranks "$input1" --outhtml "$html_file"
439 --setMax "$setMax" --setMin "$setMin" --nPerm "$nPerm" --plotTop "$plotTop" --gsea_jar "$GALAXY_DATA_INDEX_DIR/shared/jars/gsea2-2.07.jar"
440 --output_dir "$html_file.files_path" --use_gmt ""${use_gmt.fields.path}"" --chip "${use_chip.fields.path}"
441 </command>
442 """
443 op = optparse.OptionParser()
444 a = op.add_option
445 a('--input_ranks',default=None)
446 a('--input_tab',default=None)
447 a('--input_name',default=None)
448 a('--use_gmt',default=None)
449 a('--history_gmt',default=None)
450 a('--builtin_gmt',default=None)
451 a('--history_gmt_name',default=None)
452 a('--setMax',default="500")
453 a('--setMin',default="15")
454 a('--nPerm',default="1000")
455 a('--title',default="GSEA report")
456 a('--chip',default='')
457 a('--plotTop',default='20')
458 a('--outhtml',default=None)
459 a('--makeTab',default=None)
460 a('--output_dir',default=None)
461 a('--outtab_neg',default=None)
462 a('--outtab_pos',default=None)
463 a('--adjpvalcol',default=None)
464 a('--signcol',default=None)
465 a('--idcol',default=None)
466 a('--mode',default='Max_probe')
467 a('-j','--gsea_jar',default='/usr/local/bin/gsea2-2.07.jar')
468 opts, args = op.parse_args()
469 assert os.path.isfile(opts.gsea_jar),'## GSEA runner unable to find supplied gsea java desktop executable file %s' % opts.gsea_jar
470 if opts.input_ranks:
471 inpf = opts.input_ranks
472 else:
473 inpf = opts.input_tab
474 assert opts.idcol <> None, '## GSEA runner needs an id column if a tabular file provided'
475 assert opts.signcol <> None, '## GSEA runner needs a sign column if a tabular file provided'
476 assert opts.adjpvalcol <> None, '## GSEA runner needs an adjusted p value column if a tabular file provided'
477 assert os.path.isfile(inpf),'## GSEA runner unable to open supplied input file %s' % inpf
478 if opts.chip > '':
479 assert os.path.isfile(opts.chip),'## GSEA runner unable to open supplied chip file %s' % opts.chip
480 some = None
481 if opts.history_gmt <> None:
482 some = 1
483 assert os.path.isfile(opts.history_gmt),'## GSEA runner unable to open supplied history gene set matrix (.gmt) file %s' % opts.history_gmt
484 if opts.builtin_gmt <> None:
485 some = 1
486 assert os.path.isfile(opts.builtin_gmt),'## GSEA runner unable to open supplied history gene set matrix (.gmt) file %s' % opts.builtin_gmt
487 assert some, '## GSEA runner needs a gene set matrix file - none chosen?'
488 opts.title = re.sub('[^a-zA-Z0-9_]+', '', opts.title)
489 myName=os.path.split(sys.argv[0])[-1]
490 gse = gsea_wrapper(myName, opts=opts)
491 retcode = gse.run()
492 if retcode <> 0:
493 sys.exit(retcode) # indicate failure to job runner
494
495