# HG changeset patch
# User fubar
# Date 1370755820 14400
# Node ID aef0ecc0068312b0b69ab9191e65c40bf8e87323
initial commit
diff -r 000000000000 -r aef0ecc00683 rgGSEA.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rgGSEA.py Sun Jun 09 01:30:20 2013 -0400
@@ -0,0 +1,495 @@
+"""
+April 2013
+eeesh GSEA does NOT respect the mode flag!
+
+Now realise that creating the input rank file for GSEA needs to take the lowest p value for duplicate
+feature names. To make Ish's life easier, duplicate gene ids are removed from every gene set to stop GSEA
+from barfing.
+
+October 14 2012
+It took an amazingly long time to figure out that GSEA fails with a useless error message if any filename
+contains a dash "-". eesh.
+
+Added history .gmt source - requires passing a faked name to gsea
+Wrapper for GSEA http://www.broadinstitute.org/gsea/index.jsp
+Started Feb 22
+Copyright 2012 Ross Lazarus
+All rights reserved
+Licensed under the LGPL
+
+called eg as
+
+#!/bin/sh
+GALAXY_LIB="/data/extended/galaxy/lib"
+if [ "$GALAXY_LIB" != "None" ]; then
+ if [ -n "$PYTHONPATH" ]; then
+ PYTHONPATH="$GALAXY_LIB:$PYTHONPATH"
+ else
+ PYTHONPATH="$GALAXY_LIB"
+ fi
+ export PYTHONPATH
+fi
+
+cd /data/extended/galaxy/database/job_working_directory/027/27311
+python /data/extended/galaxy/tools/rgenetics/rgGSEA.py --input_tab "/data/extended/galaxy/database/files/033/dataset_33806.dat" --adjpvalcol "5" --signcol "2"
+--idcol "1" --outhtml "/data/extended/galaxy/database/files/034/dataset_34455.dat" --input_name "actaearly-Controlearly-actalate-Controllate_topTable.xls"
+--setMax "500" --setMin "15" --nPerm "1000" --plotTop "20"
+--gsea_jar "/data/extended/galaxy/tool-data/shared/jars/gsea2-2.0.12.jar"
+--output_dir "/data/extended/galaxy/database/job_working_directory/027/27311/dataset_34455_files" --mode "Max_probe"
+ --title " actaearly-Controlearly-actalate-Controllate_interpro_GSEA" --builtin_gmt "/data/genomes/gsea/3.1/IPR_DOMAIN.gmt"
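+
+For tabular input, the rank file this wrapper writes scores each identifier as a signed -log10(adjusted p value);
+when an identifier appears more than once, Max_probe mode keeps the smallest p value and any other mode keeps the
+median (a ready-made history .rnk file keeps its own scores). A minimal standalone sketch of that scoring,
+assuming a plain list of (adjusted p, sign) pairs per identifier (illustrative names only, not part of this tool):
+
+    import math
+
+    def rank_score(pairs, mode="Max_probe"):
+        # choose one (adjusted p, sign) pair for this identifier
+        if mode == "Max_probe":
+            p, sign = min(pairs)                      # smallest p wins
+        else:
+            p, sign = sorted(pairs)[len(pairs) // 2]  # median p, keeping its sign
+        p = max(p, 1e-99)                             # avoid log10(0)
+        lp = -math.log10(p)                           # small p -> large positive score
+        return lp if sign >= 0 else -lp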
+
+
+"""
+import optparse
+import tempfile
+import os
+import sys
+import subprocess
+import time
+import shutil
+import glob
+import math
+import re
+
+KEEPSELECTION = False # detailed records for selection of multiple probes
+
+def timenow():
+ """return current time as a string
+ """
+ return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+
+
+
+def fix_subdir(adir,destdir):
+ """ Galaxy wants everything in the same files_dir
+ if os.path.exists(adir):
+ for (d,dirs,files) in os.path.walk(adir):
+ for f in files:
+ sauce = os.path.join(d,f)
+ shutil.copy(sauce,destdir)
+ """
+
+ def fixAffycrap(apath=''):
+ """class='richTable'>RUNNING ES
CORE ENRICHMENT | 1 |
+ LBR | | | 1113 |
+ 0.194 | -0.1065 | No |
2 |
+ GGPS1 | | | 4309 | 0.014 | -0.4328 |
+ No |
+ """
+ html = []
+ try:
+ html = open(apath,'r').readlines()
+ except:
+ return html
+ for i,row in enumerate(html):
+ row = re.sub('https\:\/\/www.affymetrix.com\/LinkServlet\?probeset=',"http://www.genecards.org/index.php?path=/Search/keyword/",row)
+ html[i] = row
+ return html
+
+ cleanup = False
+ if os.path.exists(adir):
+ flist = os.listdir(adir) # get all files created
+ for f in flist:
+ apath = os.path.join(adir,f)
+ dest = os.path.join(destdir,f)
+ if not os.path.isdir(apath):
+ if os.path.splitext(f)[1].lower() == '.html':
+ html = fixAffycrap(apath)
+ fixed = open(apath,'w')
+ fixed.write('\n'.join(html))
+ fixed.write('\n')
+ fixed.close()
+ if not os.path.isfile(dest):
+ shutil.copy(apath,dest)
+ else:
+ fix_subdir(apath,destdir)
+ if cleanup:
+ try:
+ shutil.rmtree(path=adir,ignore_errors=True)
+ except:
+ pass
+
+
+
+def getFileString(fpath, outpath):
+ """
+ format a nice file size string
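+    eg getFileString('index.html', outpath) -> 'index.html (1.2 MB)' for a hypothetical 1.2 MB file;
+    the size suffix is only added when the file exists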
+ """
+ size = ''
+ fp = os.path.join(outpath, fpath)
+ s = fpath
+ if os.path.isfile(fp):
+ n = float(os.path.getsize(fp))
+ if n > 2**20:
+ size = ' (%1.1f MB)' % (n/2**20)
+ elif n > 2**10:
+ size = ' (%1.1f KB)' % (n/2**10)
+ elif n > 0:
+ size = ' (%d B)' % (int(n))
+ s = '%s %s' % (fpath, size)
+ return s
+
+class gsea_wrapper:
+ """
+ GSEA java desktop client has a CL interface. CL can be gleaned by clicking the 'command line' button after setting up an analysis
+ We don't want gsea to do the analysis but it can read .rnk files containing rows of identifiers and an evidence weight such as the signed t statistic from limma for differential expression
+(vgalaxy)rlazarus@iaas1:~/public_html/idle_illumina_analysis$ cat gseaHumanREFSEQ.sh
+#!/bin/bash
+for RNK in `ls *.rnk`
+do
+DIRNAME=${RNK%.*}
+echo $DIRNAME
+qsub -cwd -b y java -Xmx4096m -cp /data/app/bin/gsea2-2.07.jar xtools.gsea.GseaPreranked -gmx ../msigdb.v3.0.symbols.gmt -collapse true -mode Max_probe -norm meandiv
+-nperm 1000 -rnk $RNK -scoring_scheme weighted -rpt_label $RNK -chip ../RefSeq_human.chip -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp
+-set_max 500 -set_min 15 -zip_report false -out gseaout/${DIRNAME} -gui false
+done
+ """
+
+ def __init__(self,myName=None,opts=None):
+ """ setup cl for gsea
+ """
+ self.idcol = 0
+ self.signcol = 0
+ self.adjpvalcol = 0
+ self.progname=myName
+ self.opts = opts
+ remove_duplicates=True
+ if not os.path.isdir(opts.output_dir):
+ try:
+ os.makedirs(opts.output_dir)
+ except:
+ print >> sys.stderr,'##Error: GSEA wrapper unable to create or find output directory %s. Stopping' % (opts.output_dir)
+ sys.exit(1)
+ fakeGMT = re.sub('[^a-zA-Z0-9_]+', '', opts.input_name) # gives a more useful title for the GSEA report
+ fakeGMT = os.path.join(opts.output_dir,fakeGMT)
+ fakeRanks = '%s.rnk' % fakeGMT
+ if not fakeGMT.endswith('.gmt'):
+ fakeGMT = '%s.gmt' % fakeGMT
+ if opts.builtin_gmt and opts.history_gmt:
+ newfile = open(fakeGMT,'w')
+ subprocess.call(['cat',opts.builtin_gmt,opts.history_gmt],stdout=newfile)
+ newfile.close()
+ elif opts.history_gmt:
+ subprocess.call(['cp',opts.history_gmt,fakeGMT])
+ else:
+ subprocess.call(['cp',opts.builtin_gmt,fakeGMT])
+ # remove dupes from each gene set
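+        # a .gmt file holds one gene set per line: set name, a description, then tab separated gene ids;
+        # sets with fewer than two ids are dropped and repeated ids within a set are collapsed,
+        # because duplicate ids in a set make GSEA barf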
+ gmt = open(fakeGMT,'r').readlines()
+ gmt = [x for x in gmt if len(x.split('\t')) > 3]
+ ugmt = []
+ for i,row in enumerate(gmt):
+ rows = row.rstrip().split('\t')
+ gmtname = rows[0]
+ gmtcomment = rows[1]
+ glist = list(set(rows[2:]))
+ newgmt = [gmtname,gmtcomment]
+ newgmt += glist
+ ugmt.append('\t'.join(newgmt))
+ gmt = open(fakeGMT,'w')
+ gmt.write('\n'.join(ugmt))
+ gmt.write('\n')
+ gmt.close()
+ if opts.input_ranks:
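+            # a history .rnk file is expected to be a header row followed by 'identifier<TAB>score' rows,
+            # eg a hypothetical 'NM_012345<TAB>4.2'; the single score column is reused below for both the
+            # p value and sign slots of the internal (id, p, sign) rows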
+ infname = opts.input_ranks
+ rdat = open(opts.input_ranks,'r').readlines() # suck in and remove blank ids that cause gsea to barf rml april 10 2012
+ rdat = [x.rstrip().split('\t') for x in rdat[1:]] # ignore head
+ dat = [[x[0],x[1],x[1]] for x in rdat]
+ # fake same structure as input tabular file
+ try:
+ pvals = [float(x[1]) for x in dat]
+ signs = [float(x[1]) for x in dat]
+ except:
+ print >> sys.stderr, '## error converting floating point - cannot process this input'
+ sys.exit(99)
+ else: # read tabular
+ self.idcol = int(opts.idcol) - 1
+ self.signcol = int(opts.signcol) - 1
+ self.adjpvalcol = int(opts.adjpvalcol) - 1
+ maxcol = max(self.idcol,self.signcol,self.adjpvalcol)
+ infname = opts.input_tab
+ indat = open(opts.input_tab,'r').readlines()
+ dat = [x.rstrip().split('\t') for x in indat[1:]]
+ dat = [x for x in dat if len(x) > maxcol]
+ dat = [[x[self.idcol],x[self.adjpvalcol],x[self.signcol]] for x in dat] # reduce to rank form
+ pvals = [float(x[1]) for x in dat]
+ outofrange = [x for x in pvals if ((x < 0.0) or (x > 1.0))]
+ assert len(outofrange) == 0, '## p values outside 0-1 encountered - was that the right column for adjusted p value?'
+ signs = [float(x[2]) for x in dat]
+            outofrange = [i for i,x in enumerate(signs) if (not x) and (dat[i][2] <> '0')] # dat rows are now [id,adjp,sign]
+ bad = [dat[x][2] for x in outofrange]
+ assert len(outofrange) == 0, '## null numeric values encountered for sign - was that the right column? %s' % bad
+ ids = [x[0] for x in dat]
+ res = []
+ self.comments = []
+ useme = []
+ for i,row in enumerate(dat):
+ if row[1].upper() != 'NA' and row[2].upper() != 'NA' and row[0] != '' :
+ useme.append(i)
+ lost = len(dat) - len(useme)
+ if lost <> 0:
+ newdat = [dat[x] for x in useme]
+ del dat
+ dat = newdat
+ print >> sys.stdout, '## %d lost - NA values or null id' % lost
+ if remove_duplicates:
+ uids = list(set(ids)) # complex procedure to get min pval for each unique id
+ if len(uids) <> len(ids): # dupes - deal with mode
+ print >> sys.stdout,'## Dealing with %d uids in %d ids' % (len(uids),len(ids))
+ ures = {}
+ for i,id in enumerate(ids):
+ p = pvals[i]
+ ures.setdefault(id,[])
+ ures[id].append((p,signs[i]))
+ for id in uids:
+ tlist = ures[id]
+ tp = [x[0] for x in tlist]
+ ts = [x[1] for x in tlist]
+ if len(tp) == 1:
+ p = tp[0]
+ sign = ts[0]
+ else:
+ if opts.mode == "Max_probe":
+ p = min(tp)
+ sign = ts[tp.index(p)]
+                    else: # otherwise take the median p value - too bad if sign wobbles across an even count
+                        pairs = sorted(zip(tp,ts)) # sort p values keeping each paired with its sign
+                        ind = len(pairs)/2 # middle element for odd counts, upper middle for even
+                        (p,sign) = pairs[ind]
+ if KEEPSELECTION:
+ self.comments.append('## for id=%s, got tp=%s, ts=%s, chose p=%f, sign =%f'\
+ % (id,str(tp),str(ts),p,sign))
+ if opts.input_ranks: # must be a rank file
+ res.append((id,'%f' % p))
+ else:
+ if p == 0.0:
+ p = 1e-99
+ try:
+ lp = -math.log10(p) # large positive if low p value
+ except ValueError:
+ lp = 0.0
+ if sign < 0:
+ lp = -lp # if negative, swap p to negative
+ res.append((id,'%f' % lp))
+ else: # no duplicates
+ for i,row in enumerate(dat):
+ (id,p,sign) = (row[0],float(row[1]),float(row[2]))
+ if opts.input_ranks: # must be a rank file
+ res.append((id,'%f' % p))
+ else:
+ if p == 0.0:
+ p = 1e-99
+ try:
+ lp = -math.log10(p) # large positive if low p value
+ except ValueError:
+ lp = 0.0
+ if sign < 0:
+ lp = -lp # if negative, swap p to negative
+ res.append((id,'%f' % lp))
+ else:
+ for i,row in enumerate(dat):
+ (id,p,sign) = (row[0],float(row[1]),float(row[2]))
+ if opts.input_ranks: # must be a rank file
+ res.append((id,'%f' % p))
+ else:
+ if p == 0.0:
+ p = 1e-99
+ try:
+ lp = -math.log10(p) # large positive if low p value
+ except ValueError:
+ lp = 0.0
+ if sign < 0:
+ lp = -lp # if negative, swap p to negative
+ res.append((id,'%f' % lp))
+ len1 = len(ids)
+ ranks = res # [x for x in res if len(x[0]) > 0 and len(x[1].strip()) > 0 and x[0].upper() <> 'NA']
+ # rd = dict(zip([x[0] for x in res],res)) # this does something mysterious to dupes - probably keeps last one
+ # ranks = rd.values()
+ len2 = len(ranks)
+ delta = len1 - len2
+ if delta <> 0:
+ print >> sys.stdout,'NOTE: %d of %d rank input file %s rows deleted - dup, null or NA IDs, pvals or logFCs' % (delta,len1,infname)
+ ranks = [(float(x[1]),x) for x in ranks] # decorate
+ ranks.sort()
+ ranks.reverse()
+ ranks = [x[1] for x in ranks] # undecorate
+ if opts.chip == '': # if mouse - need HUGO
+ ranks = [[x[0].upper(),x[1]] for x in ranks]
+ print >> sys.stdout, '## Fixed any lower case - now have',','.join([x[0] for x in ranks[:5]])
+ ranks = ['\t'.join(x) for x in ranks]
+ if len(ranks) < 2:
+            print >> sys.stderr,'Input %s has fewer than 2 rows with two tab delimited fields - please check the tool documentation' % infname
+ sys.exit(1)
+ rclean = open(fakeRanks,'w')
+ rclean.write('contig\tscore\n')
+ rclean.write('\n'.join(ranks))
+ rclean.write('\n')
+ rclean.close()
+ cl = []
+ a = cl.append
+ a('java -Xmx6G -cp')
+ a(opts.gsea_jar)
+ a('xtools.gsea.GseaPreranked')
+ a('-gmx %s' % fakeGMT) # ensure .gmt extension as required by GSEA - gene sets to use
+ a('-gui false') # use preranked file mode and no gui
+ a('-make_sets true -rnd_seed timestamp') # more things from the GUI command line display
+ a('-norm meandiv -zip_report false -scoring_scheme weighted') # ? need to set these?
+ a('-rnk %s' % fakeRanks) # input ranks file symbol (the chip file is the crosswalk for ids in first column)
+ a('-out %s' % opts.output_dir)
+ a('-set_max %s' % opts.setMax)
+ a('-set_min %s' % opts.setMin)
+ a('-mode %s' % opts.mode)
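+        # -chip supplies the crosswalk from the rank file ids to gene symbols and needs -collapse true;
+        # without a chip file the ids must already be symbols, so collapsing is switched off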
+ if opts.chip > '':
+ #a('-chip %s -collapse true -include_only_symbols true' % opts.chip)
+ a('-chip %s -collapse true' % opts.chip)
+ else:
+ a("-collapse false") # needed if no chip
+ a('-nperm %s' % opts.nPerm)
+ a('-rpt_label %s' % opts.title)
+ a('-plot_top_x %s' % opts.plotTop)
+ self.cl = cl
+ self.comments.append('## GSEA command line:')
+ self.comments.append(' '.join(self.cl))
+ self.fakeRanks = fakeRanks
+ self.fakeGMT = fakeGMT
+
+ def grepIds(self):
+ """
+ """
+ found = []
+ allids = open(self.opts.input_ranks,'r').readlines()
+ allids = [x.strip().split() for x in allids]
+ allids = [x[0] for x in allids] # list of ids
+ gmtpath = os.path.split(self.opts.fakeGMT)[0] # get path to all chip
+
+ def run(self):
+ """
+
+ """
+ tlog = os.path.join(self.opts.output_dir,"gsea_runner.log")
+ sto = open(tlog,'w')
+ x = subprocess.Popen(' '.join(self.cl),shell=True,stdout=sto,stderr=sto,cwd=self.opts.output_dir)
+ retval = x.wait()
+ sto.close()
+ d = glob.glob(os.path.join(self.opts.output_dir,'%s*' % self.opts.title))
+ if len(d) > 0:
+ fix_subdir(d[0],self.opts.output_dir)
+ htmlfname = os.path.join(self.opts.output_dir,'index.html')
+ try:
+ html = open(htmlfname,'r').readlines()
+ html = [x.strip() for x in html if len(x.strip()) > 0]
+ if len(self.comments) > 0:
+ s = ['']
+ s += self.comments
+                    s.append('')
+ try:
+ i = html.index('