changeset 1:5ab17fe9e056

Uploaded
author greg
date Tue, 19 Jul 2011 11:33:10 -0400
parents 4f07f3a33605
children 2da9b965ac4f
files blast2go-7b53cc52e7ed/.hg_archival.txt blast2go-7b53cc52e7ed/tools/ncbi_blast_plus/blast2go.py blast2go-7b53cc52e7ed/tools/ncbi_blast_plus/blast2go.txt blast2go-7b53cc52e7ed/tools/ncbi_blast_plus/blast2go.xml filtering.py filtering.xml
diffstat 6 files changed, 369 insertions(+), 193 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blast2go-7b53cc52e7ed/.hg_archival.txt	Tue Jul 19 11:33:10 2011 -0400
@@ -0,0 +1,5 @@
+repo: 4bfd64cf18ab5d0fe74e14afdb6634d8a5f9abb2
+node: 7b53cc52e7eda18a49312bfab66a962d4b6ada71
+branch: default
+latesttag: null
+latesttagdistance: 2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blast2go-7b53cc52e7ed/tools/ncbi_blast_plus/blast2go.py	Tue Jul 19 11:33:10 2011 -0400
@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+"""Galaxy wrapper for Blast2GO for pipelines, b2g4pipe v2.3.5.
+
+This script takes exactly three command line arguments:
+ * Input BLAST XML filename
+ * Blast2GO properties filename (settings file)
+ * Output tabular filename
+
+Sadly b2g4pipe v2.3.5 cannot cope with current style large BLAST XML
+files (e.g. from BLAST 2.2.25+), so we have to reformat these to
+avoid it crashing with a Java heap space OutOfMemoryError.
+
+As part of this reformatting, we check for BLASTP or BLASTX output
+(otherwise raise an error), and print the query count.
+
+It then calls the Java command line tool, and moves the output file to
+the location Galaxy is expecting, and removes the temporary XML file.
+"""
+import sys
+import os
+import subprocess
+
+#You may need to edit this to match your local setup,
+blast2go_jar = "/opt/b2g4pipe/blast2go.jar"
+
+
+def stop_err(msg, error_level=1):
+   """Print error message to stderr and quit with given error level."""
+   sys.stderr.write("%s\n" % msg)
+   sys.exit(error_level)
+
+if len(sys.argv) != 4:
+   stop_err("Require three arguments: XML filename, properties filename, output tabular filename")
+
+xml_file, prop_file, tabular_file = sys.argv[1:]
+
+#We should have write access here:
+tmp_xml_file = tabular_file + ".tmp.xml"
+
+if not os.path.isfile(xml_file):
+   stop_err("Input BLAST XML file not found: %s" % xml_file)
+
+if not os.path.isfile(prop_file):
+   stop_err("Blast2GO configuration file not found: %s" % prop_file)
+
+def prepare_xml(original_xml, mangled_xml):
+    """Reformat BLAST XML to suit Blast2GO.
+
+    Blast2GO can't cope with 1000s of <Iteration> tags within a
+    single <BlastResult> tag, so instead split this into one
+    full XML record per iteration (i.e. per query). This gives
+    a concatenated XML file mimicking old versions of BLAST.
+
+    This also checks for BLASTP or BLASTX output, and outputs
+    the number of queries. Galaxy will show this as "info".
+    """
+    in_handle = open(original_xml)
+    footer = "  </BlastOutput_iterations>\n</BlastOutput>\n"
+    header = ""
+    while True:
+        line = in_handle.readline()
+        if not line:
+            #No hits?
+            stop_err("Problem with XML file?")
+        if line.strip() == "<Iteration>":
+            break
+        header += line
+
+    if "<BlastOutput_program>blastx</BlastOutput_program>" in header:
+        print "BLASTX output identified"
+    elif "<BlastOutput_program>blastp</BlastOutput_program>" in header:
+        print "BLASTP output identified"
+    else:
+        in_handle.close()
+        stop_err("Expect BLASTP or BLASTX output")
+
+    out_handle = open(mangled_xml, "w")
+    out_handle.write(header)
+    out_handle.write(line)
+    count = 1
+    while True:
+        line = in_handle.readline()
+        if not line:
+            break
+        elif line.strip() == "<Iteration>":
+           #Insert footer/header
+           out_handle.write(footer)
+           out_handle.write(header)
+           count += 1
+        out_handle.write(line)
+
+    out_handle.close()
+    in_handle.close()
+    print "Input has %i queries" % count
+
+
+def run(cmd):
+    #Avoid using shell=True when we call subprocess to ensure if the Python
+    #script is killed, so too is the child process.
+    try:
+        child = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    except Exception, err:
+        stop_err("Error invoking command:\n%s\n\n%s\n" % (" ".join(cmd), err))
+    #Use .communicate as can get deadlocks with .wait(),
+    stdout, stderr = child.communicate()
+    return_code = child.returncode
+    if return_code:
+        cmd_str = " ".join(cmd)
+        if stderr and stdout:
+            stop_err("Return code %i from command:\n%s\n\n%s\n\n%s" % (return_code, cmd_str, stdout, stderr))
+        else:
+            stop_err("Return code %i from command:\n%s\n%s" % (return_code, cmd_str, stderr))
+    #For early diagnostics,
+    else:
+       print stdout
+       print stderr
+
+if not os.path.isfile(blast2go_jar):
+   stop_err("Blast2GO JAR file not found: %s" % blast2go_jar)
+
+prepare_xml(xml_file, tmp_xml_file)
+#print "XML file prepared for Blast2GO"
+
+#We will have write access wherever the output should be,
+#so we'll ask Blast2GO to use that as the stem for its output
+#(it will append .annot to the filename)
+cmd = ["java", "-jar", blast2go_jar,
+       "-in", tmp_xml_file,
+       "-prop", prop_file,
+       "-out", tabular_file, #Used as base name for output files
+       "-a", # Generate *.annot tabular file
+       #"-img", # Generate images, feature not in v2.3.5
+       ]
+#print " ".join(cmd)
+run(cmd)
+
+#Remove the temp XML file
+os.remove(tmp_xml_file)
+
+out_file = tabular_file + ".annot"
+if not os.path.isfile(out_file):
+   stop_err("ERROR - No output annotation file from Blast2GO")
+
+#Move the output file where Galaxy expects it to be:
+os.rename(out_file, tabular_file)
+
+print "Done"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blast2go-7b53cc52e7ed/tools/ncbi_blast_plus/blast2go.txt	Tue Jul 19 11:33:10 2011 -0400
@@ -0,0 +1,127 @@
+Galaxy wrapper for Blast2GO for pipelines, b2g4pipe
+===================================================
+
+This wrapper is copyright 2011 by Peter Cock, The James Hutton Institute
+(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
+See the licence text below.
+
+This is a wrapper for the command line Java tool b2g4pipe v2.3.5,
+Blast2GO for pipelines. See:
+
+S. Götz et al.
+High-throughput functional annotation and data mining with the Blast2GO suite.
+Nucleic Acids Res. 36(10):3420–3435, 2008.
+http://dx.doi.org/10.1093/nar/gkn176
+
+A. Conesa and S. Götz.
+Blast2GO: A Comprehensive Suite for Functional Analysis in Plant Genomics.
+Int. J. Plant Genomics. 619832, 2008.
+http://dx.doi.org/10.1155/2008/619832
+
+A. Conesa et al.
+Blast2GO: A universal tool for annotation, visualization and analysis in functional genomics research.
+Bioinformatics 21:3674-3676, 2005.
+http://dx.doi.org/10.1093/bioinformatics/bti610
+
+http://www.blast2go.org/
+
+
+
+Installation
+============
+
+You can change the path by editing the definition near the start of the Python
+script blast2go.py, but by default it expects the underlying tool to be here:
+
+/opt/b2g4pipe/blast2go.jar
+
+To install the wrapper copy or move the following files under the Galaxy tools
+folder, e.g. in the tools/ncbi_blast_plus folder:
+
+* blast2go.xml (the Galaxy tool definition)
+* blast2go.py (the Python wrapper script)
+* blast2go.txt (this README file)
+
+You will also need to modify the tool_conf.xml file to tell Galaxy to offer the
+tool. We suggest putting it next to the NCBI BLAST+ wrappers. Just add the line:
+
+<tool file="ncbi_blast_plus/blast2go.xml" />
+
+As part of setting up b2g4pipe you will need to setup one or more Blast2GO
+property files which tell the tool which database to use etc. The example
+b2gPipe.properties provided with b2g4pipe v2.3.5 is out of date; the
+latest server IP address and database name are given on the Blast2GO website.
+These files can be anywhere accessible to the Galaxy Unix user; we put them
+under /opt/b2g4pipe with the JAR file etc.
+
+You must tell Galaxy about these Blast2GO property files so that they can be
+offered to the user. Create the file tool-data/blast2go.loc under the Galaxy
+folder. This must be plain text, tab separated, with three columns:
+
+(1) ID for the setup, e.g. Spain_2010_May
+(2) Description for the setup, e.g. Database in Spain (May 2010)
+(3) Properties filename for the setup, e.g. /opt/b2g4pipe/Spain_2010_May.properties
+
+Avoid including "Blast2GO" in the description (column 2) as this will be
+included in the automatically assigned output dataset name. The blast2go.loc
+file allows you to customise the database setup. If for example you have a local
+Blast2GO server running (which we recommend for speed), and you want this to be
+the default setting, include it as the first line in your blast2go.loc file.
+
+Consult the Blast2GO documentation for details about the property files and
+setting up a local MySQL Blast2GO database.
+
+
+History
+=======
+
+v0.0.1 - Initial public release
+v0.0.2 - Documentation clarifications, e.g. concatenated BLAST XML is allowed.
+       - Fixed error handler in wrapper script (for when b2g4pipe fails).
+       - Reformats the XML to use old NCBI-style concatenated BLAST XML since
+         b2g4pipe crashes with a heap space error on large files using
+         current NCBI output.
+
+
+Developers
+==========
+
+This script and related tools are being developed on the following hg branch:
+http://bitbucket.org/peterjc/galaxy-central/src/tools
+
+For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball I use
+the following command from the Galaxy root folder:
+
+$ tar -czf blast2go.tar.gz tools/ncbi_blast_plus/blast2go.xml tools/ncbi_blast_plus/blast2go.py tools/ncbi_blast_plus/blast2go.txt
+
+Check this worked:
+
+$ tar -tzf blast2go.tar.gz
+tools/ncbi_blast_plus/blast2go.xml
+tools/ncbi_blast_plus/blast2go.py
+tools/ncbi_blast_plus/blast2go.txt
+
+
+Licence (MIT/BSD style)
+=======================
+
+Permission to use, copy, modify, and distribute this software and its
+documentation with or without modifications and for any purpose and
+without fee is hereby granted, provided that any copyright notices
+appear in all copies and that both those copyright notices and this
+permission notice appear in supporting documentation, and that the
+names of the contributors or copyright holders not be used in
+advertising or publicity pertaining to distribution of the software
+without specific prior permission.
+
+THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
+WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+OR PERFORMANCE OF THIS SOFTWARE.
+
+NOTE: This is the licence for the Galaxy Wrapper only. Blast2GO and
+associated data files are available and licenced separately.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blast2go-7b53cc52e7ed/tools/ncbi_blast_plus/blast2go.xml	Tue Jul 19 11:33:10 2011 -0400
@@ -0,0 +1,90 @@
+<tool id="blast2go" name="Blast2GO" version="0.0.2">
+    <description>Maps BLAST results to GO annotation terms</description>
+    <command interpreter="python">
+        blast2go.py $xml ${prop.fields.path} $tab
+    </command>
+    <inputs>
+        <param name="xml" type="data" format="blastxml" label="BLAST XML results" description="You must have run BLAST against a protein database such as the NCBI non-redundant (NR) database. Use BLASTX for nucleotide queries, BLASTP for protein queries." /> 
+        <param name="prop" type="select" label="Blast2GO settings" description="One or more configurations can be setup, such as using the Blast2GO team's server in Spain, or a local database.">
+             <options from_file="blast2go.loc">
+                 <column name="value" index="0"/>
+                 <column name="name" index="1"/>
+                 <column name="path" index="2"/>
+             </options>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="tab" format="tabular" label="Blast2GO ${prop.fields.name}" />
+    </outputs>
+    <requirements>
+    </requirements>
+    <tests>
+    </tests>
+    <help>
+.. class:: warningmark
+
+**Note**. Blast2GO may take a substantial amount of time, especially if
+running against the public server in Spain. For large input datasets it
+is advisable to allow overnight processing, or consider subdividing.
+
+-----
+
+**What it does**
+
+This runs b2g4Pipe, the command line (no GUI) version of Blast2GO designed
+for use in pipelines.
+
+It takes as input BLAST XML results against a protein database, typically
+the NCBI non-redundant (NR) database. This tool will accept concatenated
+BLAST XML files (although they are technically invalid XML), which is very
+useful if you have sub-divided your protein FASTA files and run BLAST on
+them in batches.
+
+The BLAST matches are used to assign Gene Ontology (GO) annotation terms
+to each query sequence.
+
+The output from this tool is a tabular file containing three columns, with
+the order taken from query order in the original BLAST XML file:
+
+====== ====================================
+Column Description
+------ ------------------------------------
+     1 ID and description of query sequence
+     2 GO term
+     3 GO description
+====== ====================================
+
+Note that if no GO terms are assigned to a sequence (e.g. if it had no
+BLAST matches), then it will not be present in the output file.
+
+
+**Advanced Settings**
+
+Blast2GO has a properties setting file which includes which database
+server to connect to (e.g. the public server in Valencia, Spain, or a
+local server), as well as more advanced options such as thresholds and
+evidence code weights. To change these settings, your Galaxy administrator
+must create a new properties file, and add it to the drop down menu above.
+
+
+**References**
+
+S. Götz et al.
+High-throughput functional annotation and data mining with the Blast2GO suite.
+Nucleic Acids Res. 36(10):3420–3435, 2008.
+http://dx.doi.org/10.1093/nar/gkn176
+
+A. Conesa and S. Götz.
+Blast2GO: A Comprehensive Suite for Functional Analysis in Plant Genomics.
+Int. J. Plant Genomics. 619832, 2008.
+http://dx.doi.org/10.1155/2008/619832
+
+A. Conesa et al.
+Blast2GO: A universal tool for annotation, visualization and analysis in functional genomics research.
+Bioinformatics 21:3674-3676, 2005.
+http://dx.doi.org/10.1093/bioinformatics/bti610
+
+http://www.blast2go.org/
+
+    </help>
+</tool>
--- a/filtering.py	Fri Jul 15 09:41:23 2011 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,128 +0,0 @@
-#!/usr/bin/env python
-# This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties.
-# The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
-
-from __future__ import division
-import sys, re, os.path
-from galaxy import eggs
-
-# Older py compatibility
-try:
-    set()
-except:
-    from sets import Set as set
-
-assert sys.version_info[:2] >= ( 2, 4 )
-
-def get_operands( filter_condition ):
-    # Note that the order of all_operators is important
-    items_to_strip = ['+', '-', '**', '*', '//', '/', '%', '<<', '>>', '&', '|', '^', '~', '<=', '<', '>=', '>', '==', '!=', '<>', ' and ', ' or ', ' not ', ' is ', ' is not ', ' in ', ' not in ']
-    for item in items_to_strip:
-        if filter_condition.find( item ) >= 0:
-            filter_condition = filter_condition.replace( item, ' ' )
-    operands = set( filter_condition.split( ' ' ) )
-    return operands
-
-def stop_err( msg ):
-    sys.stderr.write( msg )
-    sys.exit()
-
-in_fname = sys.argv[1]
-out_fname = sys.argv[2]
-cond_text = sys.argv[3]
-try:
-    in_columns = int( sys.argv[4] )
-    assert sys.argv[5]  #check to see that the column types varaible isn't null
-    in_column_types = sys.argv[5].split( ',' )
-except:
-    stop_err( "Data does not appear to be tabular.  This tool can only be used with tab-delimited data." )
-
-# Unescape if input has been escaped
-mapped_str = {
-    '__lt__': '<',
-    '__le__': '<=',
-    '__eq__': '==',
-    '__ne__': '!=',
-    '__gt__': '>',
-    '__ge__': '>=',
-    '__sq__': '\'',
-    '__dq__': '"',
-}
-for key, value in mapped_str.items():
-    cond_text = cond_text.replace( key, value )
-    
-# Attempt to determine if the condition includes executable stuff and, if so, exit
-secured = dir()
-operands = get_operands(cond_text)
-for operand in operands:
-    try:
-        check = int( operand )
-    except:
-        if operand in secured:
-            stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
-
-# Prepare the column variable names and wrappers for column data types
-cols, type_casts = [], []
-for col in range( 1, in_columns + 1 ):
-    col_name = "c%d" % col
-    cols.append( col_name )
-    col_type = in_column_types[ col - 1 ]
-    type_cast = "%s(%s)" % ( col_type, col_name )
-    type_casts.append( type_cast )
- 
-col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
-type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
-assign = "%s = line.split( '\\t' )" % col_str
-wrap = "%s = %s" % ( col_str, type_cast_str )
-skipped_lines = 0
-first_invalid_line = 0
-invalid_line = None
-lines_kept = 0
-total_lines = 0
-out = open( out_fname, 'wt' )
-    
-# Read and filter input file, skipping invalid lines
-code = '''
-for i, line in enumerate( file( in_fname ) ):
-    total_lines += 1
-    line = line.rstrip( '\\r\\n' )
-    if not line or line.startswith( '#' ):
-        skipped_lines += 1
-        if not invalid_line:
-            first_invalid_line = i + 1
-            invalid_line = line
-        continue
-    try:
-        %s
-        %s
-        if %s:
-            lines_kept += 1
-            print >> out, line
-    except:
-        skipped_lines += 1
-        if not invalid_line:
-            first_invalid_line = i + 1
-            invalid_line = line
-''' % ( assign, wrap, cond_text )
-
-valid_filter = True
-try:
-    exec code
-except Exception, e:
-    out.close()
-    if str( e ).startswith( 'invalid syntax' ):
-        valid_filter = False
-        stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % cond_text )
-    else:
-        stop_err( str( e ) )
-
-if valid_filter:
-    out.close()
-    valid_lines = total_lines - skipped_lines
-    print 'Filtering with %s, ' % cond_text
-    if valid_lines > 0:
-        print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
-    else:
-        print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
-    if skipped_lines > 0:
-        print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
--- a/filtering.xml	Fri Jul 15 09:41:23 2011 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,65 +0,0 @@
-<tool id="Filter1" name="Filter" version="1.0.1">
-  <description>data on any column using simple expressions</description>
-  <command interpreter="python">
-    filtering.py $input $out_file1 "$cond" ${input.metadata.columns} "${input.metadata.column_types}"
-  </command>
-  <inputs>
-    <param format="tabular" name="input" type="data" label="Filter" help="Query missing? See TIP below."/>
-    <param name="cond" size="40" type="text" value="c1=='chr22'" label="With following condition" help="Double equal signs, ==, must be used as shown above. To filter for an arbitrary string, use the Select tool.">
-      <validator type="empty_field" message="Enter a valid filtering condition, see syntax and examples below."/>
-    </param>
-  </inputs>
-  <outputs>
-    <data format="input" name="out_file1" metadata_source="input"/>
-  </outputs>
-  <tests>
-    <test>
-      <param name="input" value="1.bed"/>
-      <param name="cond" value="c1=='chr22'"/>
-      <output name="out_file1" file="filter1_test1.bed"/>
-    </test>
-    <test>
-      <param name="input" value="7.bed"/>
-      <param name="cond" value="c1=='chr1' and c3-c2>=2000 and c6=='+'"/>
-      <output name="out_file1" file="filter1_test2.bed"/>
-    </test>
-  </tests>
-  <help>
-
-.. class:: warningmark
-
-Double equal signs, ==, must be used as *"equal to"* (e.g., **c1 == 'chr22'**)
-
-.. class:: infomark
-
-**TIP:** Attempting to apply a filtering condition may throw exceptions if the data type (e.g., string, integer) in every line of the columns being filtered is not appropriate for the condition (e.g., attempting certain numerical calculations on strings).  If an exception is thrown when applying the condition to a line, that line is skipped as invalid for the filter condition.  The number of invalid skipped lines is documented in the resulting history item as a "Condition/data issue".
-
-.. class:: infomark
-
-**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
-
------
-
-**Syntax**
-
-The filter tool allows you to restrict the dataset using simple conditional statements.
-
-- Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file
-- Make sure that multi-character operators contain no white space ( e.g., **&lt;=** is valid while **&lt; =** is not valid )
-- When using 'equal-to' operator **double equal sign '==' must be used** ( e.g., **c1=='chr1'** )
-- Non-numerical values must be included in single or double quotes ( e.g., **c6=='+'** )
-- Filtering condition can include logical operators, but **make sure operators are all lower case** ( e.g., **(c1!='chrX' and c1!='chrY') or not c6=='+'** )
-
------
-
-**Example**
-
-- **c1=='chr1'** selects lines in which the first column is chr1
-- **c3-c2&lt;100*c4** selects lines where subtracting column 3 from column 2 is less than the value of column 4 times 100
-- **len(c2.split(',')) &lt; 4** will select lines where the second column has less than four comma separated elements
-- **c2>=1** selects lines in which the value of column 2 is greater than or equal to 1
-- Numbers should not contain commas - **c2&lt;=44,554,350** will not work, but **c2&lt;=44554350** will
-- Some words in the data can be used, but must be single or double quoted ( e.g., **c3=='exon'** )
-
-</help>
-</tool>