# HG changeset patch
# User devteam
# Date 1400511585 14400
# Node ID fb0d744e32fa20f8f6e6180932b39744dfe5eab1
Imported from capsule None
diff -r 000000000000 -r fb0d744e32fa megablast_wrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/megablast_wrapper.py Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+"""
+run megablast for metagenomics data
+
+usage: %prog [options]
+ -d, --db_build=d: The database to use
+ -i, --input=i: Input FASTQ candidate file
+ -w, --word_size=w: Size of best perfect match
+ -c, --identity_cutoff=c: Report hits at or above this identity
+ -e, --eval_cutoff=e: Expectation value cutoff
+ -f, --filter_query=f: Filter out low complexity regions
+ -x, --index_dir=x: Data index directory
+ -o, --output=o: Output file
+
+usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file
+"""
+
+# This version (April 26, 2012) replaces megablast with blast+ blastn
+# There is now no need to augment NCBI-formatted databases and these can be
+# directly downloaded from NCBI ftp site
+
+import os, subprocess, sys, tempfile
+from bx.cookbook import doc_optparse
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+ sys.stderr.write( "%s\n" % msg )
+ sys.exit()
+
+def __main__():
+ #Parse Command Line
+ options, args = doc_optparse.parse( __doc__ )
+ query_filename = options.input.strip() # -query
+ output_filename = options.output.strip() # -out
+ mega_word_size = options.word_size # -word_size
+ mega_iden_cutoff = options.identity_cutoff # -perc_identity
+ mega_evalue_cutoff = options.eval_cutoff # -evalue
+ mega_temp_output = tempfile.NamedTemporaryFile().name
+ GALAXY_DATA_INDEX_DIR = options.index_dir
+ DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR
+
+ # megablast parameters
+ try:
+ int( mega_word_size )
+ except:
+ stop_err( 'Invalid value for word size' )
+ try:
+ float( mega_iden_cutoff )
+ except:
+ stop_err( 'Invalid value for identity cut-off' )
+ try:
+ float( mega_evalue_cutoff )
+ except:
+ stop_err( 'Invalid value for Expectation value' )
+
+ if not os.path.exists( os.path.split( options.db_build )[0] ):
+ stop_err( 'Cannot locate the target database directory. Please check your location file.' )
+
+ try:
+ threads = int( os.environ['GALAXY_SLOTS'])
+ except Exception:
+ threads = 8
+ # arguments for megablast
+ megablast_command = "blastn -task megablast -db %s -query %s -out %s -outfmt '6 qseqid sgi slen ppos length mismatch gaps qstart qend sstart send evalue bitscore' -num_threads %d -word_size %s -perc_identity %s -evalue %s -dust %s > /dev/null" \
+ % ( options.db_build, query_filename, mega_temp_output, threads, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query )
+
+ print megablast_command
+
+ tmp = tempfile.NamedTemporaryFile().name
+ try:
+ tmp_stderr = open( tmp, 'wb' )
+ proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() )
+ returncode = proc.wait()
+ tmp_stderr.close()
+ # get stderr, allowing for case where it's very large
+ tmp_stderr = open( tmp, 'rb' )
+ stderr = ''
+ buffsize = 1048576
+ try:
+ while True:
+ stderr += tmp_stderr.read( buffsize )
+ if not stderr or len( stderr ) % buffsize != 0:
+ break
+ except OverflowError:
+ pass
+ tmp_stderr.close()
+ if returncode != 0:
+ raise Exception, stderr
+ if os.path.exists( tmp ):
+ os.unlink( tmp )
+ except Exception, e:
+ if os.path.exists( mega_temp_output ):
+ os.unlink( mega_temp_output )
+ if os.path.exists( tmp ):
+ os.unlink( tmp )
+ stop_err( 'Cannot execute megaablast. ' + str( e ) )
+
+ output = open( output_filename, 'w' )
+ invalid_lines = 0
+ for i, line in enumerate( file( mega_temp_output ) ):
+ line = line.rstrip( '\r\n' )
+ fields = line.split()
+ try:
+ # convert the last column (bit-score as this is causing problem in filter tool) to float
+ fields[-1] = float( fields[-1] )
+ new_line = "%s\t%0.1f" % ( '\t'.join( fields[:-1] ), fields[-1] )
+ except:
+ new_line = line
+ invalid_lines += 1
+ output.write( "%s\n" % new_line )
+ output.close()
+
+ if os.path.exists( mega_temp_output ):
+ os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of
+
+ if invalid_lines:
+ print "Unable to parse %d lines. Keep the default format." % invalid_lines
+
+ # megablast generates a file called error.log, if empty, delete it, if not, show the contents
+ if os.path.exists( './error.log' ):
+ for i, line in enumerate( file( './error.log' ) ):
+ line = line.rstrip( '\r\n' )
+ print line
+ os.remove( './error.log' )
+
+if __name__ == "__main__" : __main__()
diff -r 000000000000 -r fb0d744e32fa megablast_wrapper.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/megablast_wrapper.xml Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,88 @@
+
+ compare short reads against htgs, nt, and wgs databases
+
+ blast+
+ bx-python
+
+
+ megablast_wrapper.py
+ --db_build="${source_select.fields.path}"
+ --input=$input_query
+ --word_size=$word_size
+ --identity_cutoff=$iden_cutoff
+ --eval_cutoff=$evalue_cutoff
+ --filter_query=$filter_query
+ --index_dir=${GALAXY_DATA_INDEX_DIR}
+ --output=$output1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: warningmark
+
+**Note**. Database searches may take substantial amount of time. For large input datasets it is advisable to allow overnight processing.
+
+-----
+
+**What it does**
+
+This tool runs **megablast** function of BLAST+ blastn tool - a high performance nucleotide local aligner developed by Webb Miller and colleagues.
+
+-----
+
+**Output format**
+
+Output of this tool contains 13 columns delimited by Tabs:
+
+1. Id of your sequence
+2. GI of the database hit
+3. Length of the database hit
+4. % identity
+5. Alignment length
+6. # mismatches
+7. # gaps
+8. Start position in your sequence
+9. End position in your sequence
+10. Start position in database hit
+11. End position in database hit
+12. E-value
+13. Bit score
+
+-------
+
+**Reference**
+
+Zhang et al. A Greedy Algorithm for Aligning DNA Sequences. 2000. JCB: 203-214.
+
+
+
diff -r 000000000000 -r fb0d744e32fa test-data/megablast_wrapper_test1.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/megablast_wrapper_test1.fa Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,40 @@
+>0_0.666667
+GAGTTTTATCGCTTCCATGACGCAGAAGTT
+>1_0.600000
+AACACTTTCGGATATTTCTGATGAGTCGAA
+>2_0.400000
+AGCAGGAATTACTACTGCTTGTTTACGAAT
+>3_0.566667
+TAAATCGAAGTGGACTGCTGGCGGAAAATG
+>4_0.766667
+TCCTTGCGCAGCTCGAGAAGCTCTTACTTT
+>5_0.533333
+GCGACCTTTCGCCATCAACTAACGATTCTG
+>6_0.533333
+TTGGATGAGGAGAAGTGGCTTAATATGCTT
+>7_0.400000
+GGCACGTTCGTCAAGGACTGGTTTAGATAT
+>8_0.500000
+CATGGTAGAGATTCTCTTGTTGACATTTTA
+>9_0.400000
+AAAGAGCGTGGATTACTATCTGAGTCCGAT
+>10_0.500000
+ATAGGTAAGAAATCATGAGTCAAGTTACTG
+>11_0.533333
+AACAATCCGTACGTTTCCAGACCGCTTTGG
+>12_0.700000
+TTCAGGCTTCTGCCGTTTTGGATTTAACCG
+>13_0.533333
+AAGATGATTTCGATTTTCTGACGAGTAACA
+>14_0.666667
+CTGACCGCTCTCGTGCTCGTCGCTGCGTTG
+>15_0.666667
+AGGCTTGCGTTTATGGTACGCTGGACTTTG
+>16_0.566667
+TTCCTGCTCCTGTTGAGTTTATTGCTGCCG
+>17_0.533333
+TCATTGCTTATTATGTTCATCCCGTCAACA
+>18_0.566667
+TCATCATGGAAGGCGCTGAATTTACGGAAA
+>19_0.566667
+ACATTATTAATGGCGTCGAGCGTCCGGTTA
\ No newline at end of file
diff -r 000000000000 -r fb0d744e32fa test-data/megablast_wrapper_test1.out
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/megablast_wrapper_test1.out Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,20 @@
+0_0.666667 phiX174 5386 100.00 30 0 0 1 30 1 30 1e-13 60.0
+1_0.600000 phiX174 5386 100.00 30 0 0 1 30 31 60 1e-13 60.0
+2_0.400000 phiX174 5386 100.00 30 0 0 1 30 76 105 1e-13 60.0
+3_0.566667 phiX174 5386 100.00 30 0 0 1 30 106 135 1e-13 60.0
+4_0.766667 phiX174 5386 100.00 30 0 0 1 30 151 180 1e-13 60.0
+5_0.533333 phiX174 5386 100.00 30 0 0 1 30 181 210 1e-13 60.0
+6_0.533333 phiX174 5386 100.00 30 0 0 1 30 226 255 1e-13 60.0
+7_0.400000 phiX174 5386 100.00 30 0 0 1 30 256 285 1e-13 60.0
+8_0.500000 phiX174 5386 100.00 30 0 0 1 30 301 330 1e-13 60.0
+9_0.400000 phiX174 5386 100.00 30 0 0 1 30 331 360 1e-13 60.0
+10_0.500000 phiX174 5386 100.00 30 0 0 1 30 376 405 1e-13 60.0
+11_0.533333 phiX174 5386 100.00 30 0 0 1 30 406 435 1e-13 60.0
+12_0.700000 phiX174 5386 100.00 30 0 0 1 30 451 480 1e-13 60.0
+13_0.533333 phiX174 5386 100.00 30 0 0 1 30 481 510 1e-13 60.0
+14_0.666667 phiX174 5386 100.00 30 0 0 1 30 526 555 1e-13 60.0
+15_0.666667 phiX174 5386 100.00 30 0 0 1 30 556 585 1e-13 60.0
+16_0.566667 phiX174 5386 100.00 30 0 0 1 30 601 630 1e-13 60.0
+17_0.533333 phiX174 5386 100.00 30 0 0 1 30 631 660 1e-13 60.0
+18_0.566667 phiX174 5386 100.00 30 0 0 1 30 676 705 1e-13 60.0
+19_0.566667 phiX174 5386 100.00 30 0 0 1 30 706 735 1e-13 60.0
diff -r 000000000000 -r fb0d744e32fa tool-data/blastdb.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/blastdb.loc.sample Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,31 @@
+#This is a sample file distributed with Galaxy that is used to define a
+#list of nucleotide BLAST databases, using three columns tab separated
+#(longer whitespace are TAB characters):
+#
+#
+#
+#The captions typically contain spaces and might end with the release date.
+#It is important that the actual database name does not have a space in it.
+#
+#So, for example, if your database is nt and the path to your base name
+#is /galaxy/data/blastdb/nt/DDmmmYYYY/nt, then the blastdb.loc entry
+#could look like this:
+#
+#02dec2009 nt 02-Dec-2009 /galaxy/data/blastdb/nt/02dec2009/nt
+#
+#A /galaxy/data/blastdb/nt/02dec2009 directory would contain all of
+#the "nt" blast indexes from ftp://ftp.ncbi.nlm.nih.gov/blast/db/nt* (e.g.):
+#
+#-rw-r--r-- 1 wychung galaxy 23437408 2008-04-09 11:26 nt.chunk.00.nhr
+#-rw-r--r-- 1 wychung galaxy 3689920 2008-04-09 11:26 nt.chunk.00.nin
+#-rw-r--r-- 1 wychung galaxy 251215198 2008-04-09 11:26 nt.chunk.00.nsq
+#...etc...
+#
+#The blastdb.loc file should include one entry per line for each database.
+#
+#See also blastdb_p.loc, used for protein BLAST database.
+#
+#Note that for backwards compatibility with workflows, the of
+#an entry must be the path that was in the original loc file.
+#The metadata is the value stored in workflows for "database".
+#
\ No newline at end of file
diff -r 000000000000 -r fb0d744e32fa tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,8 @@
+
+
+
+
+
diff -r 000000000000 -r fb0d744e32fa tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Mon May 19 10:59:45 2014 -0400
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+