# HG changeset patch
# User fubar
# Date 1371020431 14400
# Node ID 34cbb9e749dab792812b22a356840a00506e1fcc
# Parent 82e0af5661602a9a6308e96904ead0d24c139844
Uploaded
diff -r 82e0af566160 -r 34cbb9e749da rgedgeR/rgGSEA.py
--- a/rgedgeR/rgGSEA.py Wed Jun 12 02:58:43 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,494 +0,0 @@
-"""
-April 2013
-eeesh GSEA does NOT respect the mode flag!
-
-Now realise that the creation of the input rank file for gsea needs to take the lowest p value for duplicate
-feature names. To make Ish's life easier, remove duplicate gene ids from any gene set to stop GSEA from
-barfing.
-
-October 14 2012
-Amazingly long time to figure out that GSEA fails with useless error message if any filename contains a dash "-"
-eesh.
-
-Added history .gmt source - requires passing a faked name to gsea
-Wrapper for GSEA http://www.broadinstitute.org/gsea/index.jsp
-Started Feb 22
-Copyright 2012 Ross Lazarus
-All rights reserved
-Licensed under the LGPL
-
-called eg as
-
-#!/bin/sh
-GALAXY_LIB="/data/extended/galaxy/lib"
-if [ "$GALAXY_LIB" != "None" ]; then
- if [ -n "$PYTHONPATH" ]; then
- PYTHONPATH="$GALAXY_LIB:$PYTHONPATH"
- else
- PYTHONPATH="$GALAXY_LIB"
- fi
- export PYTHONPATH
-fi
-
-cd /data/extended/galaxy/database/job_working_directory/027/27311
-python /data/extended/galaxy/tools/rgenetics/rgGSEA.py --input_tab "/data/extended/galaxy/database/files/033/dataset_33806.dat" --adjpvalcol "5" --signcol "2"
---idcol "1" --outhtml "/data/extended/galaxy/database/files/034/dataset_34455.dat" --input_name "actaearly-Controlearly-actalate-Controllate_topTable.xls"
---setMax "500" --setMin "15" --nPerm "1000" --plotTop "20"
---gsea_jar "/data/extended/galaxy/tool-data/shared/jars/gsea2-2.0.12.jar"
---output_dir "/data/extended/galaxy/database/job_working_directory/027/27311/dataset_34455_files" --mode "Max_probe"
- --title " actaearly-Controlearly-actalate-Controllate_interpro_GSEA" --builtin_gmt "/data/genomes/gsea/3.1/IPR_DOMAIN.gmt"
-
-
-"""
-import optparse
-import tempfile
-import os
-import sys
-import subprocess
-import time
-import shutil
-import glob
-import math
-import re
-
-KEEPSELECTION = False # detailed records for selection of multiple probes
-
-def timenow():
- """return current time as a string
- """
- return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
-
-
-
-def fix_subdir(adir,destdir):
- """ Galaxy wants everything in the same files_dir
- if os.path.exists(adir):
- for (d,dirs,files) in os.path.walk(adir):
- for f in files:
- sauce = os.path.join(d,f)
- shutil.copy(sauce,destdir)
- """
-
- def fixAffycrap(apath=''):
- """class='richTable'>RUNNING ES
CORE ENRICHMENT | 1 |
- LBR | | | 1113 |
- 0.194 | -0.1065 | No |
2 |
- GGPS1 | | | 4309 | 0.014 | -0.4328 |
- No |
- """
- html = []
- try:
- html = open(apath,'r').readlines()
- except:
- return html
- for i,row in enumerate(html):
- row = re.sub('https\:\/\/www.affymetrix.com\/LinkServlet\?probeset=',"http://www.genecards.org/index.php?path=/Search/keyword/",row)
- html[i] = row
- return html
-
- cleanup = False
- if os.path.exists(adir):
- flist = os.listdir(adir) # get all files created
- for f in flist:
- apath = os.path.join(adir,f)
- dest = os.path.join(destdir,f)
- if not os.path.isdir(apath):
- if os.path.splitext(f)[1].lower() == '.html':
- html = fixAffycrap(apath)
- fixed = open(apath,'w')
- fixed.write('\n'.join(html))
- fixed.write('\n')
- fixed.close()
- if not os.path.isfile(dest):
- shutil.copy(apath,dest)
- else:
- fix_subdir(apath,destdir)
- if cleanup:
- try:
- shutil.rmtree(path=adir,ignore_errors=True)
- except:
- pass
-
-
-
-def getFileString(fpath, outpath):
- """
- format a nice file size string
- """
- size = ''
- fp = os.path.join(outpath, fpath)
- s = fpath
- if os.path.isfile(fp):
- n = float(os.path.getsize(fp))
- if n > 2**20:
- size = ' (%1.1f MB)' % (n/2**20)
- elif n > 2**10:
- size = ' (%1.1f KB)' % (n/2**10)
- elif n > 0:
- size = ' (%d B)' % (int(n))
- s = '%s %s' % (fpath, size)
- return s
-
-class gsea_wrapper:
- """
- GSEA java desktop client has a CL interface. CL can be gleaned by clicking the 'command line' button after setting up an analysis
- We don't want gsea to do the analysis but it can read .rnk files containing rows of identifiers and an evidence weight such as the signed t statistic from limma for differential expression
-(vgalaxy)rlazarus@iaas1:~/public_html/idle_illumina_analysis$ cat gseaHumanREFSEQ.sh
-#!/bin/bash
-for RNK in `ls *.rnk`
-do
-DIRNAME=${RNK%.*}
-echo $DIRNAME
-qsub -cwd -b y java -Xmx4096m -cp /data/app/bin/gsea2-2.07.jar xtools.gsea.GseaPreranked -gmx ../msigdb.v3.0.symbols.gmt -collapse true -mode Max_probe -norm meandiv
--nperm 1000 -rnk $RNK -scoring_scheme weighted -rpt_label $RNK -chip ../RefSeq_human.chip -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp
--set_max 500 -set_min 15 -zip_report false -out gseaout/${DIRNAME} -gui false
-done
- """
-
- def __init__(self,myName=None,opts=None):
- """ setup cl for gsea
- """
- self.idcol = 0
- self.signcol = 0
- self.adjpvalcol = 0
- self.progname=myName
- self.opts = opts
- remove_duplicates=True
- if not os.path.isdir(opts.output_dir):
- try:
- os.makedirs(opts.output_dir)
- except:
- print >> sys.stderr,'##Error: GSEA wrapper unable to create or find output directory %s. Stopping' % (opts.output_dir)
- sys.exit(1)
- fakeGMT = re.sub('[^a-zA-Z0-9_]+', '', opts.input_name) # gives a more useful title for the GSEA report
- fakeGMT = os.path.join(opts.output_dir,fakeGMT)
- fakeGMT = os.path.abspath(fakeGMT)
- fakeRanks = '%s.rnk' % fakeGMT
- if not fakeGMT.endswith('.gmt'):
- fakeGMT = '%s.gmt' % fakeGMT
- if opts.builtin_gmt and opts.history_gmt:
- newfile = open(fakeGMT,'w')
- subprocess.call(['cat',opts.builtin_gmt,opts.history_gmt],stdout=newfile)
- newfile.close()
- elif opts.history_gmt:
- subprocess.call(['cp',opts.history_gmt,fakeGMT])
- else:
- subprocess.call(['cp',opts.builtin_gmt,fakeGMT])
- # remove dupes from each gene set
- gmt = open(fakeGMT,'r').readlines()
- gmt = [x for x in gmt if len(x.split('\t')) > 3]
- ugmt = []
- for i,row in enumerate(gmt):
- rows = row.rstrip().split('\t')
- gmtname = rows[0]
- gmtcomment = rows[1]
- glist = list(set(rows[2:]))
- newgmt = [gmtname,gmtcomment]
- newgmt += glist
- ugmt.append('\t'.join(newgmt))
- gmt = open(fakeGMT,'w')
- gmt.write('\n'.join(ugmt))
- gmt.write('\n')
- gmt.close()
- if opts.input_ranks:
- infname = opts.input_ranks
- rdat = open(opts.input_ranks,'r').readlines() # suck in and remove blank ids that cause gsea to barf rml april 10 2012
- rdat = [x.rstrip().split('\t') for x in rdat[1:]] # ignore head
- dat = [[x[0],x[1],x[1]] for x in rdat]
- # fake same structure as input tabular file
- try:
- pvals = [float(x[1]) for x in dat]
- signs = [float(x[1]) for x in dat]
- except:
- print >> sys.stderr, '## error converting floating point - cannot process this input'
- sys.exit(99)
- else: # read tabular
- self.idcol = int(opts.idcol) - 1
- self.signcol = int(opts.signcol) - 1
- self.adjpvalcol = int(opts.adjpvalcol) - 1
- maxcol = max(self.idcol,self.signcol,self.adjpvalcol)
- infname = opts.input_tab
- indat = open(opts.input_tab,'r').readlines()
- dat = [x.rstrip().split('\t') for x in indat[1:]]
- dat = [x for x in dat if len(x) > maxcol]
- dat = [[x[self.idcol],x[self.adjpvalcol],x[self.signcol]] for x in dat] # reduce to rank form
- pvals = [float(x[1]) for x in dat]
- outofrange = [x for x in pvals if ((x < 0.0) or (x > 1.0))]
- assert len(outofrange) == 0, '## p values outside 0-1 encountered - was that the right column for adjusted p value?'
- signs = [float(x[2]) for x in dat]
- outofrange = [i for i,x in enumerate(signs) if (not x) and (dat[i][self.signcol] <> '0')]
- bad = [dat[x][2] for x in outofrange]
- assert len(outofrange) == 0, '## null numeric values encountered for sign - was that the right column? %s' % bad
- ids = [x[0] for x in dat]
- res = []
- self.comments = []
- useme = []
- for i,row in enumerate(dat):
- if row[1].upper() != 'NA' and row[2].upper() != 'NA' and row[0] != '' :
- useme.append(i)
- lost = len(dat) - len(useme)
- if lost <> 0:
- newdat = [dat[x] for x in useme]
- del dat
- dat = newdat
- print >> sys.stdout, '## %d lost - NA values or null id' % lost
- if remove_duplicates:
- uids = list(set(ids)) # complex procedure to get min pval for each unique id
- if len(uids) <> len(ids): # dupes - deal with mode
- print >> sys.stdout,'## Dealing with %d uids in %d ids' % (len(uids),len(ids))
- ures = {}
- for i,id in enumerate(ids):
- p = pvals[i]
- ures.setdefault(id,[])
- ures[id].append((p,signs[i]))
- for id in uids:
- tlist = ures[id]
- tp = [x[0] for x in tlist]
- ts = [x[1] for x in tlist]
- if len(tp) == 1:
- p = tp[0]
- sign = ts[0]
- else:
- if opts.mode == "Max_probe":
- p = min(tp)
- sign = ts[tp.index(p)]
- else: # guess median - too bad if even count
- tp.sort()
- ltp = len(tp)
- ind = ltp/2 # yes, this is wrong for evens but what if sign wobbles?
- if ltp % 2 == 1: # odd
- ind += 1 # take the median
- p = tp[ind]
- sign = ts[ind]
- if KEEPSELECTION:
- self.comments.append('## for id=%s, got tp=%s, ts=%s, chose p=%f, sign =%f'\
- % (id,str(tp),str(ts),p,sign))
- if opts.input_ranks: # must be a rank file
- res.append((id,'%f' % p))
- else:
- if p == 0.0:
- p = 1e-99
- try:
- lp = -math.log10(p) # large positive if low p value
- except ValueError:
- lp = 0.0
- if sign < 0:
- lp = -lp # if negative, swap p to negative
- res.append((id,'%f' % lp))
- else: # no duplicates
- for i,row in enumerate(dat):
- (id,p,sign) = (row[0],float(row[1]),float(row[2]))
- if opts.input_ranks: # must be a rank file
- res.append((id,'%f' % p))
- else:
- if p == 0.0:
- p = 1e-99
- try:
- lp = -math.log10(p) # large positive if low p value
- except ValueError:
- lp = 0.0
- if sign < 0:
- lp = -lp # if negative, swap p to negative
- res.append((id,'%f' % lp))
- else:
- for i,row in enumerate(dat):
- (id,p,sign) = (row[0],float(row[1]),float(row[2]))
- if opts.input_ranks: # must be a rank file
- res.append((id,'%f' % p))
- else:
- if p == 0.0:
- p = 1e-99
- try:
- lp = -math.log10(p) # large positive if low p value
- except ValueError:
- lp = 0.0
- if sign < 0:
- lp = -lp # if negative, swap p to negative
- res.append((id,'%f' % lp))
- len1 = len(ids)
- len2 = len(ranks)
- delta = len1 - len2
- if delta <> 0:
- print >> sys.stdout,'NOTE: %d of %d rank input file %s rows deleted - dup, null or NA IDs, pvals or logFCs' % (delta,len1,infname)
- ranks = [(float(x[1]),x) for x in ranks] # decorate
- ranks.sort()
- ranks.reverse()
- ranks = [x[1] for x in ranks] # undecorate
- if opts.chip == '': # if mouse - need HUGO
- ranks = [[x[0].upper(),x[1]] for x in ranks]
- print >> sys.stdout, '## Fixed any lower case - now have',','.join([x[0] for x in ranks[:5]])
- ranks = ['\t'.join(x) for x in ranks]
- if len(ranks) < 2:
- print >> sys.stderr,'Input %s has 1 or less rows with two tab delimited fields - please check the tool documentation' % infname
- sys.exit(1)
- print '### opening %s and writing %s' % (fakeRanks,str(ranks[:10]))
- rclean = open(fakeRanks,'w')
- rclean.write('contig\tscore\n')
- rclean.write('\n'.join(ranks))
- rclean.write('\n')
- rclean.close()
- cl = []
- a = cl.append
- a('java -Xmx6G -cp')
- a(opts.gsea_jar)
- a('xtools.gsea.GseaPreranked')
- a('-gmx %s' % fakeGMT) # ensure .gmt extension as required by GSEA - gene sets to use
- a('-gui false') # use preranked file mode and no gui
- a('-make_sets true -rnd_seed timestamp') # more things from the GUI command line display
- a('-norm meandiv -zip_report false -scoring_scheme weighted') # ? need to set these?
- a('-rnk %s' % fakeRanks) # input ranks file symbol (the chip file is the crosswalk for ids in first column)
- a('-out %s' % opts.output_dir)
- a('-set_max %s' % opts.setMax)
- a('-set_min %s' % opts.setMin)
- a('-mode %s' % opts.mode)
- if opts.chip > '':
- #a('-chip %s -collapse true -include_only_symbols true' % opts.chip)
- a('-chip %s -collapse true' % opts.chip)
- else:
- a("-collapse false") # needed if no chip
- a('-nperm %s' % opts.nPerm)
- a('-rpt_label %s' % opts.title)
- a('-plot_top_x %s' % opts.plotTop)
- self.cl = cl
- self.comments.append('## GSEA command line:')
- self.comments.append(' '.join(self.cl))
- self.fakeRanks = fakeRanks
- self.fakeGMT = fakeGMT
-
- def grepIds(self):
- """
- """
- found = []
- allids = open(self.opts.input_ranks,'r').readlines()
- allids = [x.strip().split() for x in allids]
- allids = [x[0] for x in allids] # list of ids
- gmtpath = os.path.split(self.opts.fakeGMT)[0] # get path to all chip
-
- def run(self):
- """
-
- """
- tlog = os.path.join(self.opts.output_dir,"gsea_runner.log")
- sto = open(tlog,'w')
- x = subprocess.Popen(' '.join(self.cl),shell=True,stdout=sto,stderr=sto,cwd=self.opts.output_dir)
- retval = x.wait()
- sto.close()
- d = glob.glob(os.path.join(self.opts.output_dir,'%s*' % self.opts.title))
- if len(d) > 0:
- fix_subdir(d[0],self.opts.output_dir)
- htmlfname = os.path.join(self.opts.output_dir,'index.html')
- try:
- html = open(htmlfname,'r').readlines()
- html = [x.strip() for x in html if len(x.strip()) > 0]
- if len(self.comments) > 0:
- s = ['']
- s += self.comments
- s.append('
')
- try:
- i = html.index('