Mercurial > repos > devteam > megablast_wrapper
changeset 0:fb0d744e32fa draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 10:59:45 -0400 |
parents | |
children | b2c29f4b6b6c |
files | megablast_wrapper.py megablast_wrapper.xml test-data/megablast_wrapper_test1.fa test-data/megablast_wrapper_test1.out tool-data/blastdb.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 7 files changed, 323 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/megablast_wrapper.py Mon May 19 10:59:45 2014 -0400 @@ -0,0 +1,127 @@ +#!/usr/bin/env python +""" +run megablast for metagenomics data + +usage: %prog [options] + -d, --db_build=d: The database to use + -i, --input=i: Input FASTQ candidate file + -w, --word_size=w: Size of best perfect match + -c, --identity_cutoff=c: Report hits at or above this identity + -e, --eval_cutoff=e: Expectation value cutoff + -f, --filter_query=f: Filter out low complexity regions + -x, --index_dir=x: Data index directory + -o, --output=o: Output file + +usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file +""" + +# This version (April 26, 2012) replaces megablast with blast+ blastn +# There is now no need to augment NCBI-formatted databases and these can be +# directly downloaded from NCBI ftp site + +import os, subprocess, sys, tempfile +from bx.cookbook import doc_optparse + +assert sys.version_info[:2] >= ( 2, 4 ) + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def __main__(): + #Parse Command Line + options, args = doc_optparse.parse( __doc__ ) + query_filename = options.input.strip() # -query + output_filename = options.output.strip() # -out + mega_word_size = options.word_size # -word_size + mega_iden_cutoff = options.identity_cutoff # -perc_identity + mega_evalue_cutoff = options.eval_cutoff # -evalue + mega_temp_output = tempfile.NamedTemporaryFile().name + GALAXY_DATA_INDEX_DIR = options.index_dir + DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR + + # megablast parameters + try: + int( mega_word_size ) + except: + stop_err( 'Invalid value for word size' ) + try: + float( mega_iden_cutoff ) + except: + stop_err( 'Invalid value for identity cut-off' ) + try: + float( mega_evalue_cutoff ) + except: + stop_err( 'Invalid value for Expectation value' ) + + if not os.path.exists( os.path.split( options.db_build )[0] ): + stop_err( 'Cannot locate the target database directory. Please check your location file.' ) + + try: + threads = int( os.environ['GALAXY_SLOTS']) + except Exception: + threads = 8 + # arguments for megablast + megablast_command = "blastn -task megablast -db %s -query %s -out %s -outfmt '6 qseqid sgi slen ppos length mismatch gaps qstart qend sstart send evalue bitscore' -num_threads %d -word_size %s -perc_identity %s -evalue %s -dust %s > /dev/null" \ + % ( options.db_build, query_filename, mega_temp_output, threads, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query ) + + print megablast_command + + tmp = tempfile.NamedTemporaryFile().name + try: + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + if os.path.exists( tmp ): + os.unlink( tmp ) + except Exception, e: + if os.path.exists( mega_temp_output ): + os.unlink( mega_temp_output ) + if os.path.exists( tmp ): + os.unlink( tmp ) + stop_err( 'Cannot execute megaablast. ' + str( e ) ) + + output = open( output_filename, 'w' ) + invalid_lines = 0 + for i, line in enumerate( file( mega_temp_output ) ): + line = line.rstrip( '\r\n' ) + fields = line.split() + try: + # convert the last column (bit-score as this is causing problem in filter tool) to float + fields[-1] = float( fields[-1] ) + new_line = "%s\t%0.1f" % ( '\t'.join( fields[:-1] ), fields[-1] ) + except: + new_line = line + invalid_lines += 1 + output.write( "%s\n" % new_line ) + output.close() + + if os.path.exists( mega_temp_output ): + os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of + + if invalid_lines: + print "Unable to parse %d lines. Keep the default format." % invalid_lines + + # megablast generates a file called error.log, if empty, delete it, if not, show the contents + if os.path.exists( './error.log' ): + for i, line in enumerate( file( './error.log' ) ): + line = line.rstrip( '\r\n' ) + print line + os.remove( './error.log' ) + +if __name__ == "__main__" : __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/megablast_wrapper.xml Mon May 19 10:59:45 2014 -0400 @@ -0,0 +1,88 @@ +<tool id="megablast_wrapper" name="Megablast" version="1.2.0"> + <description> compare short reads against htgs, nt, and wgs databases</description> + <requirements> + <requirement type="package" version="2.2.26+">blast+</requirement> + <requirement type="package" version="0.7.1">bx-python</requirement> + </requirements> + <command interpreter="python"> + megablast_wrapper.py + --db_build="${source_select.fields.path}" + --input=$input_query + --word_size=$word_size + --identity_cutoff=$iden_cutoff + --eval_cutoff=$evalue_cutoff + --filter_query=$filter_query + --index_dir=${GALAXY_DATA_INDEX_DIR} + --output=$output1 + </command> + <inputs> + <param name="input_query" type="data" format="fasta" label="Compare these sequences"/> + <param name="source_select" type="select" display="radio" label="against target database"> + <options from_data_table="blastdb" /> + </param> + <param name="word_size" type="select" label="using word size" help="Size of best perfect match (-word_size)"> + <option value="28">28</option> + <option value="16">16</option> + </param> + <param name="iden_cutoff" type="float" size="15" value="90.0" label="report hits above this identity (-perc_identity)" help="no cutoff if 0" /> + <param name="evalue_cutoff" type="float" size="15" value="0.001" label="set expectation value cutoff (-evalue)" /> + <param name="filter_query" type="select" label="Filter out low complexity regions? (-dust)"> + <option value="yes">Yes</option> + <option value="no">No</option> + </param> + </inputs> + <outputs> + <data name="output1" format="tabular"/> + </outputs> + <tests> + <test> + <param name="input_query" value="megablast_wrapper_test1.fa" ftype="fasta"/> + <!-- source_select needs to match the entry in the blastdb.loc file, which includes the last update date if appropriate --> + <param name="source_select" value="phiX" /> + <param name="word_size" value="28" /> + <param name="iden_cutoff" value="99.0" /> + <param name="evalue_cutoff" value="10.0" /> + <param name="filter_query" value="yes" /> + <output name="output1" file="megablast_wrapper_test1.out"/> + </test> + </tests> + <help> + +.. class:: warningmark + +**Note**. Database searches may take substantial amount of time. For large input datasets it is advisable to allow overnight processing. + +----- + +**What it does** + +This tool runs **megablast** function of BLAST+ blastn tool - a high performance nucleotide local aligner developed by Webb Miller and colleagues. + +----- + +**Output format** + +Output of this tool contains 13 columns delimited by Tabs: + +1. Id of your sequence +2. GI of the database hit +3. Length of the database hit +4. % identity +5. Alignment length +6. # mismatches +7. # gaps +8. Start position in your sequence +9. End position in your sequence +10. Start position in database hit +11. End position in database hit +12. E-value +13. Bit score + +------- + +**Reference** + +Zhang et al. A Greedy Algorithm for Aligning DNA Sequences. 2000. JCB: 203-214. + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/megablast_wrapper_test1.fa Mon May 19 10:59:45 2014 -0400 @@ -0,0 +1,40 @@ +>0_0.666667 +GAGTTTTATCGCTTCCATGACGCAGAAGTT +>1_0.600000 +AACACTTTCGGATATTTCTGATGAGTCGAA +>2_0.400000 +AGCAGGAATTACTACTGCTTGTTTACGAAT +>3_0.566667 +TAAATCGAAGTGGACTGCTGGCGGAAAATG +>4_0.766667 +TCCTTGCGCAGCTCGAGAAGCTCTTACTTT +>5_0.533333 +GCGACCTTTCGCCATCAACTAACGATTCTG +>6_0.533333 +TTGGATGAGGAGAAGTGGCTTAATATGCTT +>7_0.400000 +GGCACGTTCGTCAAGGACTGGTTTAGATAT +>8_0.500000 +CATGGTAGAGATTCTCTTGTTGACATTTTA +>9_0.400000 +AAAGAGCGTGGATTACTATCTGAGTCCGAT +>10_0.500000 +ATAGGTAAGAAATCATGAGTCAAGTTACTG +>11_0.533333 +AACAATCCGTACGTTTCCAGACCGCTTTGG +>12_0.700000 +TTCAGGCTTCTGCCGTTTTGGATTTAACCG +>13_0.533333 +AAGATGATTTCGATTTTCTGACGAGTAACA +>14_0.666667 +CTGACCGCTCTCGTGCTCGTCGCTGCGTTG +>15_0.666667 +AGGCTTGCGTTTATGGTACGCTGGACTTTG +>16_0.566667 +TTCCTGCTCCTGTTGAGTTTATTGCTGCCG +>17_0.533333 +TCATTGCTTATTATGTTCATCCCGTCAACA +>18_0.566667 +TCATCATGGAAGGCGCTGAATTTACGGAAA +>19_0.566667 +ACATTATTAATGGCGTCGAGCGTCCGGTTA \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/megablast_wrapper_test1.out Mon May 19 10:59:45 2014 -0400 @@ -0,0 +1,20 @@ +0_0.666667 phiX174 5386 100.00 30 0 0 1 30 1 30 1e-13 60.0 +1_0.600000 phiX174 5386 100.00 30 0 0 1 30 31 60 1e-13 60.0 +2_0.400000 phiX174 5386 100.00 30 0 0 1 30 76 105 1e-13 60.0 +3_0.566667 phiX174 5386 100.00 30 0 0 1 30 106 135 1e-13 60.0 +4_0.766667 phiX174 5386 100.00 30 0 0 1 30 151 180 1e-13 60.0 +5_0.533333 phiX174 5386 100.00 30 0 0 1 30 181 210 1e-13 60.0 +6_0.533333 phiX174 5386 100.00 30 0 0 1 30 226 255 1e-13 60.0 +7_0.400000 phiX174 5386 100.00 30 0 0 1 30 256 285 1e-13 60.0 +8_0.500000 phiX174 5386 100.00 30 0 0 1 30 301 330 1e-13 60.0 +9_0.400000 phiX174 5386 100.00 30 0 0 1 30 331 360 1e-13 60.0 +10_0.500000 phiX174 5386 100.00 30 0 0 1 30 376 405 1e-13 60.0 +11_0.533333 phiX174 5386 100.00 30 0 0 1 30 406 435 1e-13 60.0 +12_0.700000 phiX174 5386 100.00 30 0 0 1 30 451 480 1e-13 60.0 +13_0.533333 phiX174 5386 100.00 30 0 0 1 30 481 510 1e-13 60.0 +14_0.666667 phiX174 5386 100.00 30 0 0 1 30 526 555 1e-13 60.0 +15_0.666667 phiX174 5386 100.00 30 0 0 1 30 556 585 1e-13 60.0 +16_0.566667 phiX174 5386 100.00 30 0 0 1 30 601 630 1e-13 60.0 +17_0.533333 phiX174 5386 100.00 30 0 0 1 30 631 660 1e-13 60.0 +18_0.566667 phiX174 5386 100.00 30 0 0 1 30 676 705 1e-13 60.0 +19_0.566667 phiX174 5386 100.00 30 0 0 1 30 706 735 1e-13 60.0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/blastdb.loc.sample Mon May 19 10:59:45 2014 -0400 @@ -0,0 +1,31 @@ +#This is a sample file distributed with Galaxy that is used to define a +#list of nucleotide BLAST databases, using three columns tab separated +#(longer whitespace are TAB characters): +# +#<unique_id> <database_caption> <base_name_path> +# +#The captions typically contain spaces and might end with the release date. +#It is important that the actual database name does not have a space in it. +# +#So, for example, if your database is nt and the path to your base name +#is /galaxy/data/blastdb/nt/DDmmmYYYY/nt, then the blastdb.loc entry +#could look like this: +# +#02dec2009 nt 02-Dec-2009 /galaxy/data/blastdb/nt/02dec2009/nt +# +#A /galaxy/data/blastdb/nt/02dec2009 directory would contain all of +#the "nt" blast indexes from ftp://ftp.ncbi.nlm.nih.gov/blast/db/nt* (e.g.): +# +#-rw-r--r-- 1 wychung galaxy 23437408 2008-04-09 11:26 nt.chunk.00.nhr +#-rw-r--r-- 1 wychung galaxy 3689920 2008-04-09 11:26 nt.chunk.00.nin +#-rw-r--r-- 1 wychung galaxy 251215198 2008-04-09 11:26 nt.chunk.00.nsq +#...etc... +# +#The blastdb.loc file should include one entry per line for each database. +# +#See also blastdb_p.loc, used for protein BLAST database. +# +#Note that for backwards compatibility with workflows, the <unique_id> of +#an entry must be the path that was in the original loc file. +#The metadata <unique_id> is the value stored in workflows for "database". +# \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon May 19 10:59:45 2014 -0400 @@ -0,0 +1,8 @@ +<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc--> +<tables> + <!-- Locations of nucleotide (mega)blast databases --> + <table name="blastdb" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/blastdb.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon May 19 10:59:45 2014 -0400 @@ -0,0 +1,9 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="blast+" version="2.2.26+"> + <repository changeset_revision="6fa5f871bf0b" name="package_blast_plus_2_2_26" owner="devteam" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + </package> + <package name="bx-python" version="0.7.1"> + <repository changeset_revision="35e2457234ef" name="package_bx_python_0_7" owner="devteam" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>