
"""
April 2013
eeesh GSEA does NOT respect the mode flag!

Now realise that the creation of the input rank file for gsea needs to take the lowest p value for duplicate 
feature names. To make Ish's life easier, remove duplicate gene ids from any gene set to stop GSEA from 
barfing.
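Illustrative example (not from a real run): if the id ABC1 appears twice with adjusted p values 0.01 and 0.0004,
the .rnk row for ABC1 is built from 0.0004 under Max_probe mode, and a gene set line like
MY_SET<TAB>comment<TAB>ABC1<TAB>ABC1<TAB>DEF2 is rewritten so ABC1 is listed only once.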

October 14 2012
Took an amazingly long time to figure out that GSEA fails with a useless error message if any filename contains a dash "-".
eesh.

Added history .gmt source - requires passing a faked name to gsea 
Wrapper for GSEA http://www.broadinstitute.org/gsea/index.jsp
Started Feb 22 
Copyright 2012 Ross Lazarus
All rights reserved
Licensed under the LGPL

called eg as

#!/bin/sh
GALAXY_LIB="/data/extended/galaxy/lib"
if [ "$GALAXY_LIB" != "None" ]; then
    if [ -n "$PYTHONPATH" ]; then
        PYTHONPATH="$GALAXY_LIB:$PYTHONPATH"
    else
        PYTHONPATH="$GALAXY_LIB"
    fi
    export PYTHONPATH
fi

cd /data/extended/galaxy/database/job_working_directory/027/27311
python /data/extended/galaxy/tools/rgenetics/rgGSEA.py --input_tab "/data/extended/galaxy/database/files/033/dataset_33806.dat"  --adjpvalcol "5" --signcol "2"
--idcol "1" --outhtml "/data/extended/galaxy/database/files/034/dataset_34455.dat" --input_name "actaearly-Controlearly-actalate-Controllate_topTable.xls"
--setMax "500" --setMin "15" --nPerm "1000" --plotTop "20"
--gsea_jar "/data/extended/galaxy/tool-data/shared/jars/gsea2-2.0.12.jar"
--output_dir "/data/extended/galaxy/database/job_working_directory/027/27311/dataset_34455_files" --mode "Max_probe"
 --title " actaearly-Controlearly-actalate-Controllate_interpro_GSEA" --builtin_gmt "/data/genomes/gsea/3.1/IPR_DOMAIN.gmt"


"""
import optparse
import tempfile
import os
import sys
import subprocess
import time
import shutil
import glob
import math
import re

KEEPSELECTION = False # detailed records for selection of multiple probes

def timenow():
    """return current time as a string
    """
    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))



def fix_subdir(adir,destdir):
    """ Galaxy wants everything in the same files_dir
    if os.path.exists(adir):
        for (d,dirs,files) in os.path.walk(adir):
            for f in files:
                sauce = os.path.join(d,f) 
                shutil.copy(sauce,destdir)   
    """

    def fixAffycrap(apath=''):
        """class='richTable'>RUNNING ES</th><th class='richTable'>CORE ENRICHMENT</th><tr><td class='lessen'>1</td>
        <td><a href='https://www.affymetrix.com/LinkServlet?probeset=LBR'>LBR</a></td><td></td><td></td><td>1113</td>
        <td>0.194</td><td>-0.1065</td><td>No</td></tr><tr><td class='lessen'>2</td><td>
        <a href='https://www.affymetrix.com/LinkServlet?probeset=GGPS1'>GGPS1</a></td><td></td><td></td><td>4309</td><td>0.014</td><td>-0.4328</td>
        <td>No</td></tr>
        """
        html = []
        try:
            html = open(apath,'r').readlines()       
        except:
             return html
        for i,row in enumerate(html):
            row = re.sub('https\:\/\/www.affymetrix.com\/LinkServlet\?probeset=',"http://www.genecards.org/index.php?path=/Search/keyword/",row)
            html[i] = row
        return html

    cleanup = False
    if os.path.exists(adir):
        flist = os.listdir(adir) # get all files created
        for f in flist:
           apath = os.path.join(adir,f)
           dest = os.path.join(destdir,f)
           if not os.path.isdir(apath):
               if os.path.splitext(f)[1].lower() == '.html':
                   html = fixAffycrap(apath)
                   fixed = open(apath,'w')
                   fixed.write(''.join(html)) # lines from readlines() keep their newlines
                   fixed.close()
               if not os.path.isfile(dest):
                   shutil.copy(apath,dest)
           else:
               fix_subdir(apath,destdir)
        if cleanup:
            try:
                shutil.rmtree(path=adir,ignore_errors=True)
            except:
                pass



def getFileString(fpath, outpath):
    """
    format a nice file size string
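    eg (illustrative only) getFileString('index.html','/tmp/job') might return 'index.html  (12.3 KB)'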
    """
    size = ''
    fp = os.path.join(outpath, fpath)
    s = fpath
    if os.path.isfile(fp):
        n = float(os.path.getsize(fp))
        if n > 2**20:
            size = ' (%1.1f MB)' % (n/2**20)
        elif n > 2**10:
            size = ' (%1.1f KB)' % (n/2**10)
        elif n > 0:
            size = ' (%d B)' % (int(n))
        s = '%s %s' % (fpath, size) 
    return s

class gsea_wrapper:
    """
    GSEA java desktop client has a command line (CL) interface. The CL for an analysis can be gleaned by clicking the 'command line' button after setting it up in the GUI.
    We don't want GSEA to redo the differential expression analysis - it can read .rnk files containing rows of identifiers and an evidence weight such as the signed t statistic from limma, eg:
(vgalaxy)rlazarus@iaas1:~/public_html/idle_illumina_analysis$ cat gseaHumanREFSEQ.sh 
#!/bin/bash
for RNK in `ls *.rnk`
do
DIRNAME=${RNK%.*}
echo $DIRNAME
qsub -cwd -b y java -Xmx4096m -cp /data/app/bin/gsea2-2.07.jar xtools.gsea.GseaPreranked -gmx ../msigdb.v3.0.symbols.gmt -collapse true -mode Max_probe -norm meandiv 
-nperm 1000 -rnk $RNK -scoring_scheme weighted -rpt_label $RNK -chip ../RefSeq_human.chip -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp 
-set_max 500 -set_min 15 -zip_report false -out gseaout/${DIRNAME} -gui false
done
    """
    
    def __init__(self,myName=None,opts=None):
        """ setup cl for gsea
        """
        self.idcol = 0
        self.signcol = 0
        self.adjpvalcol = 0
        self.progname=myName
        self.opts = opts
        remove_duplicates=True
        if not os.path.isdir(opts.output_dir):
            try:
                os.makedirs(opts.output_dir)
            except:
                print >> sys.stderr,'##Error: GSEA wrapper unable to create or find output directory %s. Stopping' % (opts.output_dir)
                sys.exit(1)
        fakeGMT = re.sub('[^a-zA-Z0-9_]+', '', opts.input_name or opts.title) # gives a more useful title for the GSEA report; fall back to the report title if no input_name was supplied
        fakeGMT = os.path.join(opts.output_dir,fakeGMT) 
        fakeRanks = '%s.rnk' % fakeGMT
        if not fakeGMT.endswith('.gmt'):
            fakeGMT = '%s.gmt' % fakeGMT
        if opts.builtin_gmt and opts.history_gmt:
            newfile = open(fakeGMT,'w')
            subprocess.call(['cat',opts.builtin_gmt,opts.history_gmt],stdout=newfile)
            newfile.close()
        elif opts.history_gmt:
            subprocess.call(['cp',opts.history_gmt,fakeGMT])
        else:
            subprocess.call(['cp',opts.builtin_gmt,fakeGMT])
        # remove dupes from each gene set
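        # a .gmt line is tab separated: gene set name, description/comment, then one or more gene ids
        # illustrative example: MY_SET<TAB>some comment<TAB>ABC1<TAB>DEF2<TAB>GHI3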
        gmt = open(fakeGMT,'r').readlines()
        gmt = [x for x in gmt if len(x.split('\t')) > 3]
        ugmt = []
        for i,row in enumerate(gmt):
            rows = row.rstrip().split('\t')
            gmtname = rows[0]
            gmtcomment = rows[1]
            glist = list(set(rows[2:]))
            newgmt = [gmtname,gmtcomment]
            newgmt += glist
            ugmt.append('\t'.join(newgmt))
        gmt = open(fakeGMT,'w')
        gmt.write('\n'.join(ugmt))
        gmt.write('\n')
        gmt.close()
        if opts.input_ranks:
            infname = opts.input_ranks
            rdat = open(opts.input_ranks,'r').readlines() # suck in and remove blank ids that cause gsea to barf rml april 10 2012
            rdat = [x.rstrip().split('\t') for x in rdat[1:]] # ignore header row
            rdat = [x for x in rdat if len(x) > 1 and x[0].strip() != ''] # drop short rows and blank ids
            dat = [[x[0],x[1],x[1]] for x in rdat]
            # fake the same [id,pval,sign] structure as the tabular input path
            try:
                pvals = [float(x[1]) for x in dat]
                signs = [float(x[1]) for x in dat]
            except ValueError:
                print >> sys.stderr, '## error converting floating point - cannot process this input'
                sys.exit(99)
        else: # read tabular
            self.idcol = int(opts.idcol) - 1
            self.signcol = int(opts.signcol) - 1
            self.adjpvalcol = int(opts.adjpvalcol) - 1
            maxcol = max(self.idcol,self.signcol,self.adjpvalcol)
            infname = opts.input_tab
            indat = open(opts.input_tab,'r').readlines()
            dat = [x.rstrip().split('\t') for x in indat[1:]]
            dat = [x for x in dat if len(x) > maxcol]
            dat = [[x[self.idcol],x[self.adjpvalcol],x[self.signcol]] for x in dat] # reduce to rank form
            # drop null ids and NA values here so the float conversions below cannot fail on them
            clean = [x for x in dat if x[0] != '' and x[1].upper() != 'NA' and x[2].upper() != 'NA']
            if len(clean) != len(dat):
                print >> sys.stdout, '## %d rows lost - NA values or null id' % (len(dat) - len(clean))
            dat = clean
            pvals = [float(x[1]) for x in dat]
            outofrange = [x for x in pvals if ((x < 0.0) or (x > 1.0))]
            assert len(outofrange) == 0, '## p values outside 0-1 encountered - was that the right column for adjusted p value?'
            signs = [float(x[2]) for x in dat]
            outofrange = [i for i,x in enumerate(signs) if (not x) and (dat[i][2] != '0')] # sign is column 2 of the reduced rows
            bad = [dat[x][2] for x in outofrange]
            assert len(outofrange) == 0, '## null numeric values encountered for sign - was that the right column? %s' % bad
        ids = [x[0] for x in dat]
        res = []
        self.comments = []
        # belt and braces: drop any remaining rows with NA values or a null id,
        # keeping ids, pvals and signs in step with dat
        useme = [i for i,row in enumerate(dat)
                 if row[1].upper() != 'NA' and row[2].upper() != 'NA' and row[0] != '']
        lost = len(dat) - len(useme)
        if lost != 0:
            dat = [dat[i] for i in useme]
            pvals = [pvals[i] for i in useme]
            signs = [signs[i] for i in useme]
            ids = [x[0] for x in dat]
            print >> sys.stdout, '## %d lost - NA values or null id' % lost
        if remove_duplicates:
            uids = list(set(ids)) # complex procedure to get min pval for each unique id
            if len(uids) != len(ids): # dupes - deal with mode
                print >> sys.stdout,'## Dealing with %d uids in %d ids' % (len(uids),len(ids))
                ures = {}
                for i,id in enumerate(ids):
                    p = pvals[i]
                    ures.setdefault(id,[])
                    ures[id].append((p,signs[i]))
                for id in uids:
                    tlist = ures[id]
                    tp = [x[0] for x in tlist]
                    ts = [x[1] for x in tlist]
                    if len(tp) == 1:
                        p = tp[0]
                        sign = ts[0]
                    else:
                        if  opts.mode == "Max_probe":
                            p = min(tp)
                            sign = ts[tp.index(p)]
                        else: # any other mode - take the median; sort the (p,sign) pairs together so each sign stays matched to its p
                            pairs = sorted(zip(tp,ts))
                            ind = len(pairs)//2 # median index (upper middle value for an even count)
                            p,sign = pairs[ind]
                        if KEEPSELECTION:
                              self.comments.append('## for id=%s, got tp=%s, ts=%s, chose p=%f, sign =%f'\
                                   % (id,str(tp),str(ts),p,sign))
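                    # for tabular input the .rnk score is a signed -log10 of the adjusted p value,
                    # eg (illustrative) adjusted p = 0.001 with a negative sign becomes -3.000000;
                    # for rank file input the supplied score is passed through unchanged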
                    if opts.input_ranks: # must be a rank file
                        res.append((id,'%f' % p))
                    else:
                        if p == 0.0:
                            p = 1e-99
                        try:
                            lp = -math.log10(p) # large positive if low p value
                        except ValueError:
                            lp = 0.0
                        if sign < 0:
                            lp = -lp # if negative, swap p to negative
                        res.append((id,'%f' % lp))
            else: # no duplicates
                for i,row in enumerate(dat):
                    (id,p,sign) = (row[0],float(row[1]),float(row[2]))
                    if opts.input_ranks: # must be a rank file
                        res.append((id,'%f' % p))
                    else:
                        if p == 0.0:
                            p = 1e-99
                        try:
                            lp = -math.log10(p) # large positive if low p value
                        except ValueError:
                            lp = 0.0
                        if sign < 0:
                            lp = -lp # if negative, swap p to negative
                        res.append((id,'%f' % lp))
        else:
            for i,row in enumerate(dat):
                (id,p,sign) = (row[0],float(row[1]),float(row[2]))
                if opts.input_ranks: # must be a rank file
                    res.append((id,'%f' % p))
                else:
                    if p == 0.0:
                        p = 1e-99
                    try:
                        lp = -math.log10(p) # large positive if low p value
                    except ValueError:
                        lp = 0.0
                    if sign < 0:
                        lp = -lp # if negative, swap p to negative
                    res.append((id,'%f' % lp))
        len1 = len(ids)
        ranks = res # [x for x in res if len(x[0]) > 0 and len(x[1].strip()) > 0 and x[0].upper() <> 'NA']
        # rd = dict(zip([x[0] for x in res],res)) # this does something mysterious to dupes - probably keeps last one
        # ranks = rd.values()
        len2 = len(ranks)
        delta = len1 - len2
        if delta != 0:
            print >> sys.stdout,'NOTE: %d of %d rank input file %s rows deleted - dup, null or NA IDs, pvals or logFCs' % (delta,len1,infname)
        ranks = [(float(x[1]),x) for x in ranks] # decorate
        ranks.sort()
        ranks.reverse()
        ranks = [x[1] for x in ranks] # undecorate
        if opts.chip == '': # if mouse - need HUGO
             ranks = [[x[0].upper(),x[1]] for x in ranks]
             print >> sys.stdout, '## Fixed any lower case - now have',','.join([x[0] for x in ranks[:5]])
        ranks = ['\t'.join(x) for x in ranks]
        if len(ranks) < 2:
             print >> sys.stderr,'Input %s has fewer than 2 rows with two tab delimited fields - please check the tool documentation' % infname
             sys.exit(1)
        rclean = open(fakeRanks,'w')
        rclean.write('contig\tscore\n')
        rclean.write('\n'.join(ranks)) 
        rclean.write('\n')
        rclean.close()
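        # fakeRanks now holds tab separated (id, score) rows for GSEA, illustrative example:
        # contig<TAB>score
        # ABC1<TAB>3.401000
        # DEF2<TAB>-1.253000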
        cl = []
        a = cl.append  
        a('java -Xmx6G -cp')
        a(opts.gsea_jar)
        a('xtools.gsea.GseaPreranked')
        a('-gmx %s' % fakeGMT) # ensure .gmt extension as required by GSEA - gene sets to use
        a('-gui false')    # use preranked file mode and no gui
        a('-make_sets true -rnd_seed timestamp') # more things from the GUI command line display
        a('-norm meandiv -zip_report false -scoring_scheme weighted')            # ? need to set these? 
        a('-rnk %s' % fakeRanks) # input ranks file symbol (the chip file is the crosswalk for ids in first column)
        a('-out %s' % opts.output_dir) 
        a('-set_max %s' % opts.setMax)
        a('-set_min %s' % opts.setMin)
        a('-mode %s' % opts.mode)
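        # the optional .chip file maps ids in the .rnk file to the symbols used in the gene sets so GSEA can collapse them;
        # without a chip, -collapse false is required and the .rnk ids must already match those used in the .gmt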
        if opts.chip > '':
           #a('-chip %s -collapse true -include_only_symbols true' % opts.chip)   
           a('-chip %s -collapse true' % opts.chip)   
        else:
           a("-collapse false") # needed if no chip 
        a('-nperm %s' % opts.nPerm)
        a('-rpt_label %s' % opts.title)
        a('-plot_top_x %s' % opts.plotTop)
        self.cl = cl
        self.comments.append('## GSEA command line:')
        self.comments.append(' '.join(self.cl))
        self.fakeRanks = fakeRanks
        self.fakeGMT = fakeGMT
     
    def grepIds(self):
        """ unfinished helper - meant to check which input ids actually appear in the gene set matrix passed to GSEA
        """
        found = []
        allids = open(self.opts.input_ranks,'r').readlines()
        allids = [x.strip().split() for x in allids]
        allids = [x[0] for x in allids] # list of ids
        gmtpath = os.path.split(self.fakeGMT)[0] # directory containing the .gmt actually passed to GSEA
        return found

    def run(self):
        """
        
        """
        tlog = os.path.join(self.opts.output_dir,"gsea_runner.log")
        sto = open(tlog,'w')
        x = subprocess.Popen(' '.join(self.cl),shell=True,stdout=sto,stderr=sto,cwd=self.opts.output_dir)
        retval = x.wait()
        sto.close()
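        # GseaPreranked writes its report into a subdirectory of -out named after -rpt_label
        # (typically something like <title>.GseaPreranked.<timestamp>) containing index.html and the per gene set reports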
        d = glob.glob(os.path.join(self.opts.output_dir,'%s*' % self.opts.title))
        if len(d) > 0:
            fix_subdir(d[0],self.opts.output_dir)       
        htmlfname = os.path.join(self.opts.output_dir,'index.html')     
        try:
            html = open(htmlfname,'r').readlines()
            html = [x.strip() for x in html if len(x.strip()) > 0]
            if len(self.comments) > 0:
                s = ['<pre>']
                s += self.comments
                s.append('</pre>')
                try:
                    i = html.index('<div id="footer">')
                except:
                    i = len(html) - 7 # fudge
                html = html[:i] + s + html[i:]
        except:
            html = []
            htmlhead = '<html><head></head><body>'
            html.append('## Galaxy GSEA wrapper failure')
            html.append('## Unable to find index.html in %s - listdir=%s' % (d,' '.join(os.listdir(self.opts.output_dir))))
            html.append('## Command line was %s' % (' '.join(self.cl)))
            html.append('## commonly caused by mismatched ID/chip selection')
            glog = open(os.path.join(self.opts.output_dir,'gsea_runner.log'),'r').readlines()
            html.append('## gsea_runner.log=%s' % '\n'.join(glog))
            #tryme = self.grepIds()
            retval = 1
            print >> sys.stderr,'\n'.join(html)
            html = ['%s<br/>' % x for x in html]
            html.insert(0,htmlhead)
            html.append('</body></html>')
        htmlf = open(self.opts.outhtml,'w')
        htmlf.write('\n'.join(html))
        htmlf.write('\n')
        htmlf.close()
        os.unlink(self.fakeRanks)
        os.unlink(self.fakeGMT)
        if self.opts.outtab_neg:
            tabs = glob.glob(os.path.join(self.opts.output_dir,"gsea_report_for_*.xls"))
            if len(tabs) > 0:
                for t in tabs:
                    tkind = os.path.basename(t).split('_')[4].lower() # eg gsea_report_for_na_pos_<timestamp>.xls
                    if tkind == 'neg':
                        outtab = self.opts.outtab_neg
                    elif tkind == 'pos':
                        outtab = self.opts.outtab_pos
                    else:
                        print >> sys.stderr, '## tab file %s has "%s" rather than "neg" or "pos" where the report direction was expected' % (t,tkind)
                        sys.exit(1)
                    content = open(t).readlines()
                    tabf = open(outtab,'w')
                    tabf.write(''.join(content))
                    tabf.close()
            else:
                print >> sys.stdout, '## Odd, outtab_neg = %s but no gsea_report_for_*.xls files found in %s' % (self.opts.outtab_neg,self.opts.output_dir)
        return retval
        

if __name__ == "__main__":
    """ 
    called as:
   <command interpreter="python">rgGSEA.py --input_ranks "$input1"  --outhtml "$html_file"
       --setMax "$setMax" --setMin "$setMin" --nPerm "$nPerm" --plotTop "$plotTop" --gsea_jar "$GALAXY_DATA_INDEX_DIR/shared/jars/gsea2-2.07.jar" 
       --output_dir "$html_file.files_path" --use_gmt ""${use_gmt.fields.path}"" --chip "${use_chip.fields.path}"
  </command>
    """
    op = optparse.OptionParser()
    a = op.add_option
    a('--input_ranks',default=None)
    a('--input_tab',default=None)
    a('--input_name',default=None)
    a('--use_gmt',default=None)
    a('--history_gmt',default=None)
    a('--builtin_gmt',default=None)
    a('--history_gmt_name',default=None)
    a('--setMax',default="500")
    a('--setMin',default="15")
    a('--nPerm',default="1000") 
    a('--title',default="GSEA report") 
    a('--chip',default='')
    a('--plotTop',default='20')
    a('--outhtml',default=None)
    a('--makeTab',default=None)
    a('--output_dir',default=None)
    a('--outtab_neg',default=None)
    a('--outtab_pos',default=None)
    a('--adjpvalcol',default=None)
    a('--signcol',default=None)
    a('--idcol',default=None)
    a('--mode',default='Max_probe')
    a('-j','--gsea_jar',default='/usr/local/bin/gsea2-2.07.jar')
    opts, args = op.parse_args() 
    assert os.path.isfile(opts.gsea_jar),'## GSEA runner unable to find supplied gsea java desktop executable file %s' % opts.gsea_jar
    if opts.input_ranks:
        inpf = opts.input_ranks
    else:
        inpf = opts.input_tab
        assert opts.idcol is not None, '## GSEA runner needs an id column if a tabular file provided'
        assert opts.signcol is not None, '## GSEA runner needs a sign column if a tabular file provided'
        assert opts.adjpvalcol is not None, '## GSEA runner needs an adjusted p value column if a tabular file provided'
    assert os.path.isfile(inpf),'## GSEA runner unable to open supplied input file %s' % inpf
    if opts.chip > '':
        assert os.path.isfile(opts.chip),'## GSEA runner unable to open supplied chip file %s' % opts.chip
    some = None
    if opts.history_gmt is not None:
        some = 1
        assert os.path.isfile(opts.history_gmt),'## GSEA runner unable to open supplied history gene set matrix (.gmt) file %s' % opts.history_gmt
    if opts.builtin_gmt is not None:
        some = 1
        assert os.path.isfile(opts.builtin_gmt),'## GSEA runner unable to open supplied built-in gene set matrix (.gmt) file %s' % opts.builtin_gmt
    assert some, '## GSEA runner needs a gene set matrix file - none chosen?'
    opts.title = re.sub('[^a-zA-Z0-9_]+', '', opts.title)
    myName=os.path.split(sys.argv[0])[-1]
    gse = gsea_wrapper(myName, opts=opts)
    retcode = gse.run()
    if retcode != 0:
        sys.exit(retcode) # indicate failure to job runner