Mercurial > repos > kellrott > ontologizer
changeset 1:593c09e9c660 draft default tip
Uploaded
| author | kellrott |
|---|---|
| date | Fri, 21 Dec 2012 16:27:23 -0500 |
| parents | b23bb96ea991 |
| children | |
| files | ontologizer/._ontologizer.xml ontologizer/._ontologizer_wrap.py ontologizer/ontologizer.xml ontologizer/ontologizer_wrap.py |
| diffstat | 4 files changed, 196 insertions(+), 49 deletions(-) [+] |
line wrap: on
line diff
--- a/ontologizer/ontologizer.xml Thu Nov 22 01:29:57 2012 -0500 +++ b/ontologizer/ontologizer.xml Fri Dec 21 16:27:23 2012 -0500 @@ -1,6 +1,9 @@ <tool id="ontologizer" name="Ontologizer" version="1.0.0"> <description>Ontologizer</description> <command interpreter="python">ontologizer_wrap.py +--association $association +--go $go +-w ./ #if str($mtc) != 'None': --mtc $mtc #end if @@ -8,23 +11,26 @@ #if str($filter) != '' --filter $filter #end if ---go $go #if $ignore --ignore #end if +#if $infiles.mode == "list": +--studyset ${infiles.studyset} +--population ${infiles.population} --out $out ---association $association --annotation $annotation --dot-out $dot ---studyset $studyset ---population $population +#end if +#if $infiles.mode == "matrix": +--matrix ${infiles.input_matrix} +--matrix-eval "${infiles.eval_text}" +--matrix-type ${infiles.matrix_type} +--out $outmatrix +#end if </command> <inputs> <param name="go" type="data" label="Gene Ontology File" help="Gene Ontology OBO file"/> <param name="association" type="data" label="Gene Association File" help="File containing associations from genes to GO terms"/> - <param name="studyset" type="data" label="Study Set" help="File of the study set or a directory containing study set files"/> - <param name="population" type="data" label="Background Set file" help="File containing genes within the population"/> - <param name="mtc" type="select" label="Multiple Test Correction" help="Specifies the Multiple Test Correction method to use."> <option value="None"/> <option value="Benjamini-Hochberg"/> @@ -45,11 +51,50 @@ <param name="ignore" type="boolean" help="Ignore genes to which no association exist within the calculation"/> <param name="resamplingsteps" type="integer" value="1000" help="Specifies the number of steps used in resampling based MTCs"/> <param name="dot_cutoff" type="float" value="0.1" help="For dot-file cutoff range between 0 and 0.5 specifies the maximum level on which a term is considered as significantly enriched (0.0 - 0.5)"/> + + <conditional name="infiles"> + <param name="mode" type="select" label="Input Mode"> + <option value="list">Gene Lists</option> + <option value="matrix">Gene Matrix</option> + </param> + <when value="list"> + <param name="studyset" type="data" label="Study Set" help="File of the study set or a directory containing study set files"/> + <param name="population" type="data" label="Background Set file" help="File containing genes within the population"/> + </when> + <when value="matrix"> + <param name="input_matrix" type="data" label="Input Matrix" help="Matrix of gene values"/> + <param name="eval_text" type="text" size="90" label="Evaluation Statement" help="evalation statement to select positive set" value="value > 0.0"> + <sanitizer> + <valid initial="string.printable"> + <remove value="""/> + </valid> + <mapping initial="none"> + <add source=""" target="\""/> + </mapping> + </sanitizer> + </param> + <param name="matrix_type" type="select" label="Output Matrix Type"> + <option value="p">P-Value</option> + <option value="p.adjusted">P-Value Adjusted</option> + <option value="p.min">P-Value min</option> + </param> + </when> + </conditional> </inputs> <outputs> - <data name="out" format="tabular" label="Study GO Stats" help="Result Data info"/> - <data name="annotation" format="tabular" label="Annotation" help="An additional file per study set which contains the annotations"/> - <data name="dot" format="dot" label="DOT file" help="For every studyset analysis write out an additional .dot file (GraphViz) containing the GOTerm graph with significant nodes. The optional argument in range between 0 and 0.5 specifies the maximum level on which a term is considered as significantly enriched. By appending a GO Term identifier (separated by a comma) the output is restriced to the subgraph originating at this GO Term."/> + <data name="out" format="tabular" label="Study GO Stats" help="Result Data info"> + <filter>infiles['mode'] == "list"</filter> + </data> + <data name="annotation" format="tabular" label="Annotation" help="An additional file per study set which contains the annotations"> + <filter>infiles['mode'] == "list"</filter> + </data> + <data name="dot" format="dot" label="DOT file" help="For every studyset analysis write out an additional .dot file (GraphViz) containing the GOTerm graph with significant nodes. The optional argument in range between 0 and 0.5 specifies the maximum level on which a term is considered as significantly enriched. By appending a GO Term identifier (separated by a comma) the output is restriced to the subgraph originating at this GO Term."> + <filter>infiles['mode'] == "list"</filter> + </data> + + <data name="outmatrix" format="tabular"> + <filter>infiles['mode'] == "matrix"</filter> + </data> </outputs> <help> Ontologizer is a Java application called the Ontologizer that can be used to analyze
--- a/ontologizer/ontologizer_wrap.py Thu Nov 22 01:29:57 2012 -0500 +++ b/ontologizer/ontologizer_wrap.py Fri Dec 21 16:27:23 2012 -0500 @@ -2,12 +2,14 @@ import sys import os +import re import urllib import tempfile +import csv import shutil from glob import glob import subprocess - +import array from optparse import OptionParser basedir = os.path.dirname(os.path.realpath( __file__)) @@ -18,7 +20,6 @@ humanGoaURL = "http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/gene_association.goa_human.gz?rev=HEAD" - if __name__ == "__main__": parser = OptionParser() parser.add_option("-m", "--mtc", dest="mtc", help="Specifies the MTC method to use. Possible values are: 'Bonferroni' (default), 'None', 'Westfall-Young-Single-Step'", default=None) @@ -35,6 +36,10 @@ parser.add_option("-r", "--resamplingsteps", dest="resamplingsteps", help="Specifies the number of steps used in resampling based MTCs", default=None) parser.add_option("-s", "--studyset", dest="studyset", help="File of the study set or a directory containing study set files. Required", default=None) parser.add_option("-b", "--basedir", dest="basedir", help="Where to store Ontologizer.jar. If not defined, assumed to be ./", default=None) + parser.add_option("-e", "--matrix", dest="matrix", help="Input Matrix (alternate to providing gene sets", default=None) + parser.add_option("-k", "--matrix-eval", dest="matrix_eval", help="Method to determine positive genes in an input matrix", default="value > 0.0") + parser.add_option("-w", "--workdir", dest="workdir", help="Working directory", default=None) + parser.add_option("--matrix-type", dest="matrix_type", help="Matrix mode output type(p, p.adjusted, p.min)", default="p") options, args = parser.parse_args() @@ -47,46 +52,143 @@ if not os.path.exists(jarPath): sys.stdout.write("Downloading Ontologizer.jar\n") urllib.urlretrieve( jarURL, jarPath) - - cmdline = ["java", "-Xmx1024M", "-jar", jarPath] - - cmdline.extend( ["--go", options.go] ) - cmdline.extend( ["--association", options.association] ) - cmdline.extend( ["--studyset", options.studyset] ) - cmdline.extend( ["--population", options.population] ) - - tmpdir = tempfile.mkdtemp( prefix="ontologizer") - cmdline.extend( ["-o", tmpdir] ) - - if options.annotation is not None: - cmdline.extend( ["-n"] ) - if options.dot_output is not None and options.dot is not None: - cmdline.extend( ["--dot", options.dot] ) - - if options.calculation is not None and options.calculation != 'None': - cmdline.extend(["--calculation", options.calculation]) + tmpdir = tempfile.mkdtemp( prefix="ontologizer", dir=options.workdir) - #sys.stdout.write("Running %s\n" % (" ".join(cmdline))) - sys.stdout.write("Running %s\n" % str(cmdline)) - - proc = subprocess.Popen(cmdline, stderr=subprocess.PIPE) - stdout, stderr = proc.communicate() - - if proc.poll() != 0: - sys.stderr.write(stderr) + if options.studyset is not None and options.population is not None: + cmdline = ["java", "-Xmx1024M", "-jar", jarPath] + + cmdline.extend( ["--go", options.go] ) + cmdline.extend( ["--association", options.association] ) + cmdline.extend( ["--studyset", options.studyset] ) + cmdline.extend( ["--population", options.population] ) + + cmdline.extend( ["-o", tmpdir] ) + + if options.annotation is not None: + cmdline.extend( ["-n"] ) + if options.dot_output is not None and options.dot is not None: + cmdline.extend( ["--dot", options.dot] ) + + if options.calculation is not None and options.calculation != 'None': + cmdline.extend(["--calculation", options.calculation]) + + #sys.stdout.write("Running %s\n" % (" ".join(cmdline))) + sys.stdout.write("Running %s\n" % str(cmdline)) + + proc = subprocess.Popen(cmdline, stderr=subprocess.PIPE) + stdout, stderr = proc.communicate() + + if proc.poll() != 0: + sys.stderr.write(stderr) + + tmp = glob(os.path.join(tmpdir, "table-*.txt")) + shutil.move(tmp[0], options.out) + + if options.annotation is not None: + tmp=glob(os.path.join(tmpdir, "anno-*.txt")) + shutil.move(tmp[0], options.annotation) + + if options.dot_output is not None: + tmp=glob(os.path.join(tmpdir, "view-*.dot")) + shutil.move(tmp[0], options.dot_output) + elif options.matrix is not None: + col_set = None + col_rev = None + row_set = None + handle = open(options.matrix) + reader = csv.reader(handle, delimiter="\t") + for row in reader: + if col_set is None: + col_set = {} + col_rev = {} + for i, a in enumerate(row): + if i != 0: + col_set[a] = i + col_rev[i] = a + row_set = [] + else: + row_set.append(row[0]) + handle.close() + + population_file = os.path.join(tmpdir, "pop_set") + study_dir = os.path.join(tmpdir, "study_set") + + handle = open(population_file, "w") + for r in row_set: + handle.write("%s\n" % (r)) + handle.close() - - tmp = glob(os.path.join(tmpdir, "table-*.txt")) - shutil.move(tmp[0], options.out) - - if options.annotation is not None: - tmp=glob(os.path.join(tmpdir, "anno-*.txt")) - shutil.move(tmp[0], options.annotation) + cmdline = ["java", "-Xmx1024M", "-jar", jarPath] + + cmdline.extend( ["--go", options.go] ) + cmdline.extend( ["--association", options.association] ) + cmdline.extend( ["--population", population_file] ) + + cmdline.extend( ["--studyset", study_dir] ) + + cmdline.extend( ["-o", tmpdir] ) + + os.mkdir(study_dir) + for col in col_set: + ihandle = open(options.matrix) + ohandle = open( os.path.join(study_dir, str(col_set[col])), "w") + head = True + reader = csv.reader(ihandle, delimiter="\t") + for row in reader: + if head: + head = False + else: + value = row[col_set[col]] + try: + value = float(value) + except ValueError: + pass + + if eval(options.matrix_eval, {"__builtins__":None}, { 'float' : float, 'int' : int, 'value' : value } ): + ohandle.write("%s\n" % (row[0])) + ihandle.close() + ohandle.close() - if options.dot_output is not None: - tmp=glob(os.path.join(tmpdir, "view-*.dot")) - shutil.move(tmp[0], options.dot_output) - + print cmdline + proc = subprocess.Popen(cmdline, stderr=subprocess.PIPE) + stdout, stderr = proc.communicate() + + if proc.poll() != 0: + sys.stderr.write(stderr) + + go_values = {} + + col_select_map = { + "p" : 9, + "p.adjusted" : 10, + "p.min" : 11 + } + + column_select = col_select_map[ options.matrix_type] + for a in glob(os.path.join(tmpdir,"table-*.txt")): + res = re.search("table-(\d+)-", a) + if res: + cur_col = int(res.group(1)) + head = True + ihandle = open(a) + reader = csv.reader(ihandle, delimiter="\t") + for row in reader: + if head: + head = False + else: + if row[0] not in go_values: + go_values[row[0]] = array.array("f", [float('nan')] * (len(col_set) )) + go_values[row[0]][cur_col-1] = float(row[column_select]) + ihandle.close() + ohandle = open(options.out, "w") + row = ["#go"] + [""] * (len(col_set)) + for c in col_set: + row[col_set[c]] = c + ohandle.write( "%s\n" % ("\t".join(row))) + for go in go_values: + ohandle.write("%s\t%s\n" % (go, "\t".join((str(f) for f in go_values[go]))) ) + ohandle.close() + shutil.rmtree(tmpdir)
