Mercurial > repos > kellrott > ontologizer

Binary file ontologizer/._ontologizer.xml has changed
Binary file ontologizer/._ontologizer_wrap.py has changed
--- a/ontologizer/ontologizer.xml	Thu Nov 22 01:29:57 2012 -0500
+++ b/ontologizer/ontologizer.xml	Fri Dec 21 16:27:23 2012 -0500
@@ -1,6 +1,9 @@
 <tool id="ontologizer" name="Ontologizer" version="1.0.0">
 	<description>Ontologizer</description>
 	<command interpreter="python">ontologizer_wrap.py
+--association $association
+--go $go
+-w ./
 #if str($mtc) != 'None':
 --mtc $mtc
 #end if
@@ -8,23 +11,26 @@
 #if str($filter) != ''
 --filter $filter
 #end if
---go $go
 #if $ignore
 --ignore
 #end if
+#if $infiles.mode == "list":
+--studyset ${infiles.studyset}
+--population ${infiles.population}
 --out $out
---association $association
 --annotation $annotation
 --dot-out $dot
---studyset $studyset
---population $population
+#end if
+#if $infiles.mode == "matrix":
+--matrix ${infiles.input_matrix}
+--matrix-eval "${infiles.eval_text}"
+--matrix-type ${infiles.matrix_type}
+--out $outmatrix
+#end if
 	</command>
 	<inputs>
 		<param name="go" type="data" label="Gene Ontology File" help="Gene Ontology OBO file"/>
 		<param name="association" type="data" label="Gene Association File" help="File containing associations from genes to GO terms"/>
-		<param name="studyset" type="data" label="Study Set" help="File of the study set or a directory containing study set files"/>
-		<param name="population" type="data" label="Background Set file" help="File containing genes within the population"/>
-
 		<param name="mtc" type="select" label="Multiple Test Correction" help="Specifies the Multiple Test Correction method to use.">
 			<option value="None"/>
 			<option value="Benjamini-Hochberg"/>
@@ -45,11 +51,50 @@
 		<param name="ignore" type="boolean" help="Ignore genes to which no association exist within the calculation"/>
 		<param name="resamplingsteps" type="integer" value="1000" help="Specifies the number of steps used in resampling based MTCs"/>
 		<param name="dot_cutoff" type="float" value="0.1" help="For dot-file cutoff range between 0 and 0.5 specifies the maximum level on which a term is considered as significantly enriched (0.0 - 0.5)"/>
+
+		<conditional name="infiles">
+  			<param name="mode" type="select" label="Input Mode">
+	        	<option value="list">Gene Lists</option>
+    		    <option value="matrix">Gene Matrix</option>
+	      	</param>
+		  	<when value="list">
+				<param name="studyset" type="data" label="Study Set" help="File of the study set or a directory containing study set files"/>
+				<param name="population" type="data" label="Background Set file" help="File containing genes within the population"/>
+			</when>
+			<when value="matrix">
+				<param name="input_matrix" type="data" label="Input Matrix" help="Matrix of gene values"/>
+				<param name="eval_text" type="text" size="90" label="Evaluation Statement" help="evaluation statement to select positive set" value="value > 0.0">
+					<sanitizer>
+        				<valid initial="string.printable">
+							<remove value="&quot;"/>
+						</valid>
+        				<mapping initial="none">
+         					<add source="&quot;" target="\&quot;"/>
+       					</mapping>
+      				</sanitizer>
+				</param>
+				<param name="matrix_type" type="select" label="Output Matrix Type">
+					<option value="p">P-Value</option>
+					<option value="p.adjusted">P-Value Adjusted</option>
+					<option value="p.min">P-Value min</option>
+				</param>
+			</when>
+		</conditional>
 	</inputs>
 	<outputs>
-		<data name="out" format="tabular" label="Study GO Stats" help="Result Data info"/>
-		<data name="annotation" format="tabular" label="Annotation" help="An additional file per study set which contains the annotations"/>
-		<data name="dot" format="dot" label="DOT file" help="For every studyset analysis write out an additional .dot file (GraphViz) containing the GOTerm graph with significant nodes. The optional argument in range between 0 and 0.5 specifies the maximum level on which a term is considered as significantly enriched. By appending a GO Term identifier (separated by a comma) the output is restriced to the subgraph originating at this GO Term."/>
+		<data name="out" format="tabular" label="Study GO Stats" help="Result Data info">
+			<filter>infiles['mode'] == "list"</filter>
+		</data>
+		<data name="annotation" format="tabular" label="Annotation" help="An additional file per study set which contains the annotations">
+			<filter>infiles['mode'] == "list"</filter>
+		</data>
+		<data name="dot" format="dot" label="DOT file" help="For every studyset analysis write out an additional .dot file (GraphViz) containing the GOTerm graph with significant nodes. The optional argument in range between 0 and 0.5 specifies the maximum level on which a term is considered as significantly enriched. By appending a GO Term identifier (separated by a comma) the output is restricted to the subgraph originating at this GO Term.">
+			<filter>infiles['mode'] == "list"</filter>
+		</data>
+
+		<data name="outmatrix" format="tabular">
+			<filter>infiles['mode'] == "matrix"</filter>
+		</data>
 	</outputs>
 	<help>
 Ontologizer is a Java application called the Ontologizer that can be used to analyze
--- a/ontologizer/ontologizer_wrap.py	Thu Nov 22 01:29:57 2012 -0500
+++ b/ontologizer/ontologizer_wrap.py	Fri Dec 21 16:27:23 2012 -0500
@@ -2,12 +2,14 @@

 import sys
 import os
+import re
 import urllib
 import tempfile
+import csv
 import shutil
 from glob import glob
 import subprocess
-
+import array
 from optparse import OptionParser

 basedir = os.path.dirname(os.path.realpath( __file__))
@@ -18,7 +20,6 @@
 humanGoaURL = "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD"


-
 if __name__ == "__main__":
     parser = OptionParser()
     parser.add_option("-m", "--mtc", dest="mtc", help="Specifies the MTC method to use. Possible values are: 'Bonferroni' (default), 'None', 'Westfall-Young-Single-Step'", default=None)
@@ -35,6 +36,10 @@
     parser.add_option("-r", "--resamplingsteps", dest="resamplingsteps", help="Specifies the number of steps used in resampling based MTCs", default=None)
     parser.add_option("-s", "--studyset", dest="studyset", help="File of the study set or a directory containing study set files. Required", default=None)
     parser.add_option("-b", "--basedir", dest="basedir", help="Where to store Ontologizer.jar. If not defined, assumed to be ./", default=None)
+    parser.add_option("-e", "--matrix", dest="matrix", help="Input Matrix (alternate to providing gene sets)", default=None)
+    parser.add_option("-k", "--matrix-eval", dest="matrix_eval", help="Method to determine positive genes in an input matrix", default="value > 0.0")
+    parser.add_option("-w", "--workdir", dest="workdir", help="Working directory", default=None)
+    parser.add_option("--matrix-type", dest="matrix_type", help="Matrix mode output type(p, p.adjusted, p.min)", default="p")

     options, args = parser.parse_args()

@@ -47,46 +52,143 @@
     if not os.path.exists(jarPath):
         sys.stdout.write("Downloading Ontologizer.jar\n")
         urllib.urlretrieve( jarURL, jarPath)
-
-    cmdline = ["java", "-Xmx1024M", "-jar", jarPath]
-
-    cmdline.extend( ["--go", options.go] )
-    cmdline.extend( ["--association", options.association] )
-    cmdline.extend( ["--studyset", options.studyset] )
-    cmdline.extend( ["--population", options.population] )
-
-    tmpdir = tempfile.mkdtemp( prefix="ontologizer")
-    cmdline.extend( ["-o", tmpdir] )
-
-    if options.annotation is not None:
-        cmdline.extend( ["-n"] )

-    if options.dot_output is not None and options.dot is not None:
-        cmdline.extend( ["--dot", options.dot] )
-
-    if options.calculation is not None and options.calculation != 'None':
-        cmdline.extend(["--calculation", options.calculation])
+    tmpdir = tempfile.mkdtemp( prefix="ontologizer", dir=options.workdir)

-    #sys.stdout.write("Running %s\n" % (" ".join(cmdline)))
-    sys.stdout.write("Running %s\n" % str(cmdline))
-
-    proc = subprocess.Popen(cmdline, stderr=subprocess.PIPE)
-    stdout, stderr = proc.communicate()
-
-    if proc.poll() != 0:
-        sys.stderr.write(stderr)
+    if options.studyset is not None and options.population is not None:
+        cmdline = ["java", "-Xmx1024M", "-jar", jarPath]
+
+        cmdline.extend( ["--go", options.go] )
+        cmdline.extend( ["--association", options.association] )
+        cmdline.extend( ["--studyset", options.studyset] )
+        cmdline.extend( ["--population", options.population] )
+
+        cmdline.extend( ["-o", tmpdir] )
+
+        if options.annotation is not None:
+            cmdline.extend( ["-n"] )

+        if options.dot_output is not None and options.dot is not None:
+            cmdline.extend( ["--dot", options.dot] )
+
+        if options.calculation is not None and options.calculation != 'None':
+            cmdline.extend(["--calculation", options.calculation])
+
+        #sys.stdout.write("Running %s\n" % (" ".join(cmdline)))
+        sys.stdout.write("Running %s\n" % str(cmdline))
+
+        proc = subprocess.Popen(cmdline, stderr=subprocess.PIPE)
+        stdout, stderr = proc.communicate()
+
+        if proc.poll() != 0:
+            sys.stderr.write(stderr)
+
+        tmp = glob(os.path.join(tmpdir, "table-*.txt"))
+        shutil.move(tmp[0], options.out)
+
+        if options.annotation is not None:
+            tmp=glob(os.path.join(tmpdir, "anno-*.txt"))
+            shutil.move(tmp[0], options.annotation)
+
+        if options.dot_output is not None:
+            tmp=glob(os.path.join(tmpdir, "view-*.dot"))
+            shutil.move(tmp[0], options.dot_output)
+    elif options.matrix is not None:
+        col_set = None
+        col_rev = None
+        row_set = None
+        handle = open(options.matrix)
+        reader = csv.reader(handle, delimiter="\t")
+        for row in reader:
+            if col_set is None:
+                col_set = {}
+                col_rev = {}
+                for i, a in enumerate(row):
+                    if i != 0:
+                        col_set[a] = i
+                        col_rev[i] = a
+                row_set = []
+            else:
+                row_set.append(row[0])
+        handle.close()
+
+        population_file = os.path.join(tmpdir, "pop_set")
+        study_dir = os.path.join(tmpdir, "study_set")
+
+        handle = open(population_file, "w")
+        for r in row_set:
+            handle.write("%s\n" % (r))
+        handle.close()

-
-    tmp = glob(os.path.join(tmpdir, "table-*.txt"))
-    shutil.move(tmp[0], options.out)
-
-    if options.annotation is not None:
-        tmp=glob(os.path.join(tmpdir, "anno-*.txt"))
-        shutil.move(tmp[0], options.annotation)
+        cmdline = ["java", "-Xmx1024M", "-jar", jarPath]
+
+        cmdline.extend( ["--go", options.go] )
+        cmdline.extend( ["--association", options.association] )
+        cmdline.extend( ["--population", population_file] )
+
+        cmdline.extend( ["--studyset", study_dir] )
+
+        cmdline.extend( ["-o", tmpdir] )
+
+        os.mkdir(study_dir)
+        for col in col_set:
+            ihandle = open(options.matrix)
+            ohandle = open( os.path.join(study_dir, str(col_set[col])), "w")
+            head = True
+            reader = csv.reader(ihandle, delimiter="\t")
+            for row in reader:
+                if head:
+                    head = False
+                else:
+                    value = row[col_set[col]]
+                    try:
+                        value = float(value)
+                    except ValueError:
+                        pass
+
+                    if eval(options.matrix_eval, {"__builtins__":None}, { 'float' : float, 'int' : int, 'value' : value } ):
+                        ohandle.write("%s\n" % (row[0]))
+            ihandle.close()
+            ohandle.close()

-    if options.dot_output is not None:
-        tmp=glob(os.path.join(tmpdir, "view-*.dot"))
-        shutil.move(tmp[0], options.dot_output)
-
+        print cmdline
+        proc = subprocess.Popen(cmdline, stderr=subprocess.PIPE)
+        stdout, stderr = proc.communicate()
+
+        if proc.poll() != 0:
+            sys.stderr.write(stderr)
+
+        go_values = {}
+
+        col_select_map = {
+            "p" : 9,
+            "p.adjusted" : 10,
+            "p.min" : 11
+        }
+
+        column_select = col_select_map[ options.matrix_type]
+        for a in glob(os.path.join(tmpdir,"table-*.txt")):
+            res = re.search("table-(\d+)-", a)
+            if res:
+                cur_col = int(res.group(1))
+                head = True
+                ihandle = open(a)
+                reader = csv.reader(ihandle, delimiter="\t")
+                for row in reader:
+                    if head:
+                        head = False
+                    else:
+                        if row[0] not in go_values:
+                            go_values[row[0]] = array.array("f", [float('nan')] * (len(col_set) ))
+                        go_values[row[0]][cur_col-1] = float(row[column_select])
+                ihandle.close()
+        ohandle = open(options.out, "w")
+        row = ["#go"] + [""] * (len(col_set))
+        for c in col_set:
+            row[col_set[c]] = c
+        ohandle.write( "%s\n" % ("\t".join(row)))
+        for go in go_values:
+            ohandle.write("%s\t%s\n" % (go, "\t".join((str(f) for f in go_values[go]))) )
+        ohandle.close()
+
     shutil.rmtree(tmpdir)