changeset 0:fb0d744e32fa draft

Imported from capsule None
author devteam
date Mon, 19 May 2014 10:59:45 -0400
parents
children b2c29f4b6b6c
files megablast_wrapper.py megablast_wrapper.xml test-data/megablast_wrapper_test1.fa test-data/megablast_wrapper_test1.out tool-data/blastdb.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml
diffstat 7 files changed, 323 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/megablast_wrapper.py	Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+"""
+run megablast for metagenomics data
+
+usage: %prog [options]
+   -d, --db_build=d: The database to use
+   -i, --input=i: Input FASTQ candidate file
+   -w, --word_size=w: Size of best perfect match
+   -c, --identity_cutoff=c: Report hits at or above this identity
+   -e, --eval_cutoff=e: Expectation value cutoff
+   -f, --filter_query=f: Filter out low complexity regions
+   -x, --index_dir=x: Data index directory
+   -o, --output=o: Output file
+   
+usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file
+"""
+
+# This version (April 26, 2012) replaces megablast with blast+ blastn
+# There is now no need to augment NCBI-formatted databases and these can be
+# directly downloaded from NCBI ftp site
+
+import os, subprocess, sys, tempfile
+from bx.cookbook import doc_optparse
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    query_filename = options.input.strip() # -query
+    output_filename = options.output.strip() # -out
+    mega_word_size = options.word_size        # -word_size
+    mega_iden_cutoff = options.identity_cutoff      # -perc_identity
+    mega_evalue_cutoff = options.eval_cutoff      # -evalue
+    mega_temp_output = tempfile.NamedTemporaryFile().name
+    GALAXY_DATA_INDEX_DIR = options.index_dir
+    DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR
+
+    # megablast parameters
+    try:
+        int( mega_word_size )    
+    except:
+        stop_err( 'Invalid value for word size' )
+    try:
+        float( mega_iden_cutoff )
+    except:
+        stop_err( 'Invalid value for identity cut-off' )
+    try:
+        float( mega_evalue_cutoff )
+    except:
+        stop_err( 'Invalid value for Expectation value' )
+
+    if not os.path.exists( os.path.split( options.db_build )[0] ):
+        stop_err( 'Cannot locate the target database directory. Please check your location file.' )
+
+    try:
+        threads = int( os.environ['GALAXY_SLOTS'])
+    except Exception:
+        threads = 8
+    # arguments for megablast
+    megablast_command = "blastn -task megablast -db %s -query %s -out %s -outfmt '6 qseqid sgi slen ppos length mismatch gaps qstart qend sstart send evalue bitscore' -num_threads %d -word_size %s -perc_identity %s -evalue %s -dust %s > /dev/null" \
+        % ( options.db_build, query_filename, mega_temp_output, threads, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query ) 
+
+    print megablast_command
+
+    tmp = tempfile.NamedTemporaryFile().name
+    try:
+        tmp_stderr = open( tmp, 'wb' )
+        proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() )
+        returncode = proc.wait()
+        tmp_stderr.close()
+        # get stderr, allowing for case where it's very large
+        tmp_stderr = open( tmp, 'rb' )
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:
+                stderr += tmp_stderr.read( buffsize )
+                if not stderr or len( stderr ) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+        if returncode != 0:
+            raise Exception, stderr
+        if os.path.exists( tmp ):
+            os.unlink( tmp )
+    except Exception, e:
+        if os.path.exists( mega_temp_output ):
+            os.unlink( mega_temp_output )
+        if os.path.exists( tmp ):
+            os.unlink( tmp )
+        stop_err( 'Cannot execute megaablast. ' + str( e ) )
+
+    output = open( output_filename, 'w' )
+    invalid_lines = 0
+    for i, line in enumerate( file( mega_temp_output ) ):
+        line = line.rstrip( '\r\n' )
+        fields = line.split()
+        try:
+            # convert the last column (bit-score as this is causing problem in filter tool) to float
+            fields[-1] = float( fields[-1] )
+            new_line = "%s\t%0.1f" % ( '\t'.join( fields[:-1] ), fields[-1] )
+        except:
+            new_line = line
+            invalid_lines += 1
+        output.write( "%s\n" % new_line )
+    output.close()
+
+    if os.path.exists( mega_temp_output ):
+        os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of
+
+    if invalid_lines:
+        print "Unable to parse %d lines. Keep the default format." % invalid_lines
+
+    # megablast generates a file called error.log, if empty, delete it, if not, show the contents
+    if os.path.exists( './error.log' ):
+        for i, line in enumerate( file( './error.log' ) ):
+            line = line.rstrip( '\r\n' )
+            print line
+        os.remove( './error.log' )
+
+if __name__ == "__main__" : __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/megablast_wrapper.xml	Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,88 @@
+<tool id="megablast_wrapper" name="Megablast" version="1.2.0">
+    <description> compare short reads against htgs, nt, and wgs databases</description>
+    <requirements>
+        <requirement type="package" version="2.2.26+">blast+</requirement>
+        <requirement type="package" version="0.7.1">bx-python</requirement>
+    </requirements>
+    <command interpreter="python">
+      megablast_wrapper.py
+        --db_build="${source_select.fields.path}"
+        --input=$input_query
+        --word_size=$word_size
+        --identity_cutoff=$iden_cutoff
+        --eval_cutoff=$evalue_cutoff 
+        --filter_query=$filter_query
+        --index_dir=${GALAXY_DATA_INDEX_DIR}
+        --output=$output1
+    </command>
+    <inputs>
+        <param name="input_query" type="data" format="fasta" label="Compare these sequences"/> 
+        <param name="source_select" type="select" display="radio" label="against target database">
+            <options from_data_table="blastdb" />
+        </param>
+        <param name="word_size" type="select" label="using word size" help="Size of best perfect match (-word_size)">
+            <option value="28">28</option>
+            <option value="16">16</option>
+        </param>
+        <param name="iden_cutoff" type="float" size="15" value="90.0" label="report hits above this identity (-perc_identity)" help="no cutoff if 0" />
+        <param name="evalue_cutoff" type="float" size="15" value="0.001" label="set expectation value cutoff (-evalue)" />
+        <param name="filter_query" type="select" label="Filter out low complexity regions? (-dust)">
+            <option value="yes">Yes</option>
+            <option value="no">No</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="output1" format="tabular"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_query" value="megablast_wrapper_test1.fa" ftype="fasta"/>
+            <!-- source_select needs to match the entry in the blastdb.loc file, which includes the last update date if appropriate --> 
+            <param name="source_select" value="phiX" />
+            <param name="word_size" value="28" />
+            <param name="iden_cutoff" value="99.0" />
+            <param name="evalue_cutoff" value="10.0" />
+            <param name="filter_query" value="yes" />
+            <output name="output1" file="megablast_wrapper_test1.out"/> 
+        </test>
+    </tests>
+    <help>
+    
+.. class:: warningmark
+
+**Note**. Database searches may take substantial amount of time. For large input datasets it is advisable to allow overnight processing.  
+
+-----
+
+**What it does**
+
+This tool runs **megablast** function of BLAST+ blastn tool - a high performance nucleotide local aligner developed by Webb Miller and colleagues.
+
+-----
+
+**Output format**
+
+Output of this tool contains 13 columns delimited by Tabs:
+
+1. Id of your sequence 
+2. GI of the database hit 
+3. Length of the database hit
+4. % identity
+5. Alignment length
+6. # mismatches
+7. # gaps
+8. Start position in your sequence
+9. End position in your sequence
+10. Start position in database hit
+11. End position in database hit
+12. E-value
+13. Bit score
+
+-------
+
+**Reference**
+
+Zhang et al. A Greedy Algorithm for Aligning DNA Sequences. 2000. JCB: 203-214.
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/megablast_wrapper_test1.fa	Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,40 @@
+>0_0.666667
+GAGTTTTATCGCTTCCATGACGCAGAAGTT
+>1_0.600000
+AACACTTTCGGATATTTCTGATGAGTCGAA
+>2_0.400000
+AGCAGGAATTACTACTGCTTGTTTACGAAT
+>3_0.566667
+TAAATCGAAGTGGACTGCTGGCGGAAAATG
+>4_0.766667
+TCCTTGCGCAGCTCGAGAAGCTCTTACTTT
+>5_0.533333
+GCGACCTTTCGCCATCAACTAACGATTCTG
+>6_0.533333
+TTGGATGAGGAGAAGTGGCTTAATATGCTT
+>7_0.400000
+GGCACGTTCGTCAAGGACTGGTTTAGATAT
+>8_0.500000
+CATGGTAGAGATTCTCTTGTTGACATTTTA
+>9_0.400000
+AAAGAGCGTGGATTACTATCTGAGTCCGAT
+>10_0.500000
+ATAGGTAAGAAATCATGAGTCAAGTTACTG
+>11_0.533333
+AACAATCCGTACGTTTCCAGACCGCTTTGG
+>12_0.700000
+TTCAGGCTTCTGCCGTTTTGGATTTAACCG
+>13_0.533333
+AAGATGATTTCGATTTTCTGACGAGTAACA
+>14_0.666667
+CTGACCGCTCTCGTGCTCGTCGCTGCGTTG
+>15_0.666667
+AGGCTTGCGTTTATGGTACGCTGGACTTTG
+>16_0.566667
+TTCCTGCTCCTGTTGAGTTTATTGCTGCCG
+>17_0.533333
+TCATTGCTTATTATGTTCATCCCGTCAACA
+>18_0.566667
+TCATCATGGAAGGCGCTGAATTTACGGAAA
+>19_0.566667
+ACATTATTAATGGCGTCGAGCGTCCGGTTA
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/megablast_wrapper_test1.out	Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,20 @@
+0_0.666667	phiX174	5386	100.00	30	0	0	1	30	1	30	1e-13	60.0
+1_0.600000	phiX174	5386	100.00	30	0	0	1	30	31	60	1e-13	60.0
+2_0.400000	phiX174	5386	100.00	30	0	0	1	30	76	105	1e-13	60.0
+3_0.566667	phiX174	5386	100.00	30	0	0	1	30	106	135	1e-13	60.0
+4_0.766667	phiX174	5386	100.00	30	0	0	1	30	151	180	1e-13	60.0
+5_0.533333	phiX174	5386	100.00	30	0	0	1	30	181	210	1e-13	60.0
+6_0.533333	phiX174	5386	100.00	30	0	0	1	30	226	255	1e-13	60.0
+7_0.400000	phiX174	5386	100.00	30	0	0	1	30	256	285	1e-13	60.0
+8_0.500000	phiX174	5386	100.00	30	0	0	1	30	301	330	1e-13	60.0
+9_0.400000	phiX174	5386	100.00	30	0	0	1	30	331	360	1e-13	60.0
+10_0.500000	phiX174	5386	100.00	30	0	0	1	30	376	405	1e-13	60.0
+11_0.533333	phiX174	5386	100.00	30	0	0	1	30	406	435	1e-13	60.0
+12_0.700000	phiX174	5386	100.00	30	0	0	1	30	451	480	1e-13	60.0
+13_0.533333	phiX174	5386	100.00	30	0	0	1	30	481	510	1e-13	60.0
+14_0.666667	phiX174	5386	100.00	30	0	0	1	30	526	555	1e-13	60.0
+15_0.666667	phiX174	5386	100.00	30	0	0	1	30	556	585	1e-13	60.0
+16_0.566667	phiX174	5386	100.00	30	0	0	1	30	601	630	1e-13	60.0
+17_0.533333	phiX174	5386	100.00	30	0	0	1	30	631	660	1e-13	60.0
+18_0.566667	phiX174	5386	100.00	30	0	0	1	30	676	705	1e-13	60.0
+19_0.566667	phiX174	5386	100.00	30	0	0	1	30	706	735	1e-13	60.0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/blastdb.loc.sample	Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,31 @@
+#This is a sample file distributed with Galaxy that is used to define a
+#list of nucleotide BLAST databases, using three columns tab separated
+#(longer whitespace are TAB characters):
+#
+#<unique_id>      <database_caption>     <base_name_path>
+#
+#The captions typically contain spaces and might end with the release date.
+#It is important that the actual database name does not have a space in it.
+#
+#So, for example, if your database is nt and the path to your base name 
+#is /galaxy/data/blastdb/nt/DDmmmYYYY/nt, then the blastdb.loc entry 
+#could look like this:
+#
+#02dec2009     nt 02-Dec-2009    /galaxy/data/blastdb/nt/02dec2009/nt
+#
+#A /galaxy/data/blastdb/nt/02dec2009 directory would contain all of 
+#the "nt" blast indexes from ftp://ftp.ncbi.nlm.nih.gov/blast/db/nt* (e.g.):
+#
+#-rw-r--r--  1 wychung galaxy  23437408 2008-04-09 11:26 nt.chunk.00.nhr
+#-rw-r--r--  1 wychung galaxy   3689920 2008-04-09 11:26 nt.chunk.00.nin
+#-rw-r--r--  1 wychung galaxy 251215198 2008-04-09 11:26 nt.chunk.00.nsq
+#...etc...
+#
+#The blastdb.loc file should include one entry per line for each database. 
+#
+#See also blastdb_p.loc, used for protein BLAST database.
+#
+#Note that for backwards compatibility with workflows, the <unique_id> of
+#an entry must be the path that was in the original loc file.
+#The metadata <unique_id> is the value stored in workflows for "database".
+#
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,8 @@
+<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
+<tables>
+    <!-- Locations of nucleotide (mega)blast databases -->
+    <table name="blastdb" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/blastdb.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,9 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="blast+" version="2.2.26+">
+        <repository changeset_revision="6fa5f871bf0b" name="package_blast_plus_2_2_26" owner="devteam" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="bx-python" version="0.7.1">
+        <repository changeset_revision="35e2457234ef" name="package_bx_python_0_7" owner="devteam" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>