# HG changeset patch
# User Daniel Blankenberg
# Date 1386797051 18000
# Node ID c88d28377bd1aa02e095ee02f996db3ed6ddc3c6
Create an example blastdb Data Manager.

diff -r 000000000000 -r c88d28377bd1 README
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,3 @@
+Downloads and populates the blastdb data table. This is a simple example demonstrating the use of Data Managers to fetch preformatted BLAST databases.
+
+Uses NCBI's update_blastdb.pl script.
\ No newline at end of file
diff -r 000000000000 -r c88d28377bd1 data_manager/blastdb.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/blastdb.xml	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,48 @@
+<tool id="data_manager_blast_db" name="BLAST DB" version="0.0.1" tool_type="manage_data">
+    <description>Downloader</description>
+    <command interpreter="python">fetch_blast_db.py --filename "${out_file}" --tool_data_table_name "blastdb"</command>
+    <requirements>
+        <requirement type="package">blast+</requirement>
+    </requirements>
+    <inputs>
+        <param name="blastdb_name" type="text" value="nt" label="Blast DB to download" optional="False" />
+        <conditional name="advanced">
+            <param name="advanced_selector" type="select" label="Advanced Options">
+                <option value="basic" selected="True">Basic</option>
+                <option value="advanced">Advanced</option>
+            </param>
+            <when value="basic">
+            </when>
+            <when value="advanced">
+                <param name="data_description" type="text" value="" label="Display name" optional="True" />
+                <param name="data_id" type="text" value="" label="ID for entry" optional="True" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json" />
+    </outputs>
+    <help>
+**What it does**
+
+Downloads Blast DBs and updates blastdb tool data tables.
+
+------
+
+.. class:: infomark
+
+**Notice:** This is a functional, but basic, tool for fetching preformatted blastdbs.
+
+    </help>
+</tool>
diff -r 000000000000 -r c88d28377bd1 data_manager/fetch_blast_db.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/fetch_blast_db.py	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+#Script that calls update_blastdb.pl to download preformatted databases
+
+import hashlib
+import optparse
+import os
+import subprocess
+import sys
+
+from galaxy.util.json import from_json_string, to_json_string
+
+DEFAULT_ALGORITHM = hashlib.sha512
+CHUNK_SIZE = 2**20 # 1 MB
+
+def get_dir_hash( directory, algorithm=None, followlinks=True, chunk_size=None ):
+    chunk_size = chunk_size or CHUNK_SIZE
+    algorithm = algorithm or DEFAULT_ALGORITHM
+    if isinstance( algorithm, basestring ):
+        hash = hashlib.new( algorithm )
+    else:
+        hash = algorithm()
+    #we hash a directory by taking the names of its directories and files, plus the files' contents
+    for dirpath, dirnames, filenames in os.walk( directory, followlinks=followlinks ):
+        dirnames.sort()
+        filenames.sort()
+        for name in dirnames:
+            hash.update( os.path.relpath( os.path.join( dirpath, name ), directory ) )
+        for name in filenames:
+            filename = os.path.join( dirpath, name )
+            hash.update( os.path.relpath( filename, directory ) )
+            fh = open( filename, 'rb' )
+            while True:
+                data = fh.read( chunk_size )
+                if not data:
+                    break
+                hash.update( data )
+            fh.close()
+    return hash.hexdigest()
+
+def main():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-f', '--filename', dest='filename', action='store', type='string', default=None, help='filename' )
+    parser.add_option( '-t', '--tool_data_table_name', dest='tool_data_table_name', action='store', type='string', default=None, help='tool_data_table_name' )
+    (options, args) = parser.parse_args()
+
+    params = from_json_string( open( options.filename ).read() )
+    target_directory = params[ 'output_data' ][0]['extra_files_path']
+    os.mkdir( target_directory )
+
+    blastdb_name = params['param_dict']['blastdb_name'] #value
+    data_description = params['param_dict']['advanced'].get( 'data_description', None )
+    data_id = params['param_dict']['advanced'].get( 'data_id', None )
+
+    cmd_options = [ '--decompress' ]
+
+    args = [ 'update_blastdb.pl' ] + cmd_options + [ blastdb_name ]
+    proc = subprocess.Popen( args=args, shell=False, cwd=target_directory )
+    return_code = proc.wait()
+    #update_blastdb.pl exits with 1 after a successful download (0 means nothing needed updating),
+    #so anything other than 1 is treated as failure here
+    if return_code != 1:
+        print >> sys.stderr, "Error obtaining blastdb (%s)" % return_code
+        sys.exit( 1 )
+
+    if not data_id:
+        data_id = "%s_%s" % ( blastdb_name, get_dir_hash( target_directory ) )
+
+    if not data_description:
+        alias_date = None
+        try:
+            #pull a display name and creation date out of the downloaded alias (.nal) file
+            for line in open( os.path.join( target_directory, "%s.nal" % ( blastdb_name ) ) ):
+                if line.startswith( '# Alias file created ' ):
+                    alias_date = line.split( '# Alias file created ', 1 )[1].strip()
+                if line.startswith( 'TITLE' ):
+                    data_description = line.split( None, 1 )[1].strip()
+                    break
+        except Exception, e:
+            print >> sys.stderr, "Error parsing alias file for TITLE and date: %s" % ( e )
+        if alias_date and data_description:
+            data_description = "%s (%s)" % ( data_description, alias_date )
+
+    if not data_description:
+        data_description = data_id
+
+    data_table_entry = { 'value': data_id, 'name': data_description, 'path': os.path.join( blastdb_name, data_id ), 'nucleotide_alias_name': blastdb_name }
+    data_manager_dict = { 'data_tables': { options.tool_data_table_name: [ data_table_entry ] } }
+
+    #save info to the json file; Galaxy reads this back to populate the data table
+    with open( options.filename, 'wb' ) as fh:
+        fh.write( to_json_string( data_manager_dict ) )
+
+if __name__ == "__main__":
+    main()
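To see the JSON protocol in action outside of Galaxy, the script can be driven by hand. This is a minimal sketch, not part of the changeset: it assumes update_blastdb.pl is on PATH, Galaxy's lib/ directory is on PYTHONPATH (the script imports galaxy.util.json), and it is run from the repository root; the paths and the choice of the est database are illustrative only::

    import json, os, subprocess, tempfile

    #emulate the params file Galaxy hands to the data manager
    work = tempfile.mkdtemp()
    params = {
        'output_data': [ { 'extra_files_path': os.path.join( work, 'extra' ) } ],
        'param_dict': { 'blastdb_name': 'est', 'advanced': {} },  #note: est is a large download
    }
    json_filename = os.path.join( work, 'params.json' )
    open( json_filename, 'w' ).write( json.dumps( params ) )

    subprocess.call( [ 'python', 'data_manager/fetch_blast_db.py',
                       '--filename', json_filename,
                       '--tool_data_table_name', 'blastdb' ] )

    #the script overwrites its input file with the resulting data table entry
    print open( json_filename ).read()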
diff -r 000000000000 -r c88d28377bd1 data_manager_conf.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/blastdb.xml" id="blast_db">
+        <data_table name="blastdb">
+            <output>
+                <column name="value" />
+                <column name="name" />
+                <column name="path" output_ref="out_file">
+                    <move type="directory">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">blastdb/${path}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb/${path}/${nucleotide_alias_name}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+                <column name="nucleotide_alias_name" />
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
diff -r 000000000000 -r c88d28377bd1 test-data/est_out.json
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/est_out.json	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,1 @@
+{"data_tables": {"blastdb": [{"path": "est/est_a3aebb9941bff066cfbd40ebab14c3992f7aadabb64999f3e3b53d783c06f08033ba9066e5efd9380c6bbf9dcec808a281b7a6e9138087cc207c93f2e3ae3f67", "nucleotide_alias_name": "est", "name": "Database of GenBank+EMBL+DDBJ sequences from EST Divisions (12/05/2013 07:12:35)", "value": "est_a3aebb9941bff066cfbd40ebab14c3992f7aadabb64999f3e3b53d783c06f08033ba9066e5efd9380c6bbf9dcec808a281b7a6e9138087cc207c93f2e3ae3f67"}]}}
\ No newline at end of file
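To make the column mapping concrete: when Galaxy applies the move and value translations above to the est entry in test-data/est_out.json, the downloaded directory is moved under ${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb/est/est_<hash>, and the loc entry's path column becomes that directory plus the alias base name. As a single loc line, with tabs between the three columns, <hash> standing in for the long SHA-512 value, and an illustrative base path::

    est_<hash>	Database of GenBank+EMBL+DDBJ sequences from EST Divisions (12/05/2013 07:12:35)	/galaxy/tool-data/blastdb/est/est_<hash>/est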
diff -r 000000000000 -r c88d28377bd1 tool-data/blastdb.loc.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/blastdb.loc.sample	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,38 @@
+#This is a sample file distributed with Galaxy that is used to define a
+#list of nucleotide BLAST databases, using three tab-separated columns
+#(the longer whitespace below should be TAB characters):
+#
+#<unique_id>	<database_caption>	<base_name_path>
+#
+#The captions typically contain spaces and might end with the build date.
+#It is important that the actual database name does not have a space in it,
+#and that the three columns are separated by single TABs (only the caption may contain spaces).
+#
+#So, for example, if your database is nt and the path to your base name
+#is /depot/data2/galaxy/blastdb/nt/nt.chunk, then the blastdb.loc entry
+#would look like this:
+#
+#nt_02_Dec_2009	nt 02 Dec 2009	/depot/data2/galaxy/blastdb/nt/nt.chunk
+#
+#and your /depot/data2/galaxy/blastdb/nt directory would contain all of
+#your "base names" (e.g.):
+#
+#-rw-r--r--  1 wychung galaxy  23437408 2008-04-09 11:26 nt.chunk.00.nhr
+#-rw-r--r--  1 wychung galaxy   3689920 2008-04-09 11:26 nt.chunk.00.nin
+#-rw-r--r--  1 wychung galaxy 251215198 2008-04-09 11:26 nt.chunk.00.nsq
+#...etc...
+#
+#Your blastdb.loc file should include an entry per line for each "base name"
+#you have stored. For example:
+#
+#nt_02_Dec_2009	nt 02 Dec 2009	/depot/data2/galaxy/blastdb/nt/nt.chunk
+#wgs_30_Nov_2009	wgs 30 Nov 2009	/depot/data2/galaxy/blastdb/wgs/wgs.chunk
+#test_20_Sep_2008	test 20 Sep 2008	/depot/data2/galaxy/blastdb/test/test
+#...etc...
+#
+#See also blastdb_p.loc, which is for protein BLAST databases.
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter.
+#
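The loc format is simple enough that a standalone reader is a few lines of Python. This is an illustrative sketch only; in practice Galaxy's own tool data table machinery, configured by the tool_data_table_conf.xml.sample below, is the real consumer::

    def read_loc( path ):
        #parse a Galaxy .loc file into (value, name, path) entries, skipping comments and blank lines
        entries = []
        for line in open( path ):
            line = line.rstrip( '\n' )
            if not line or line.startswith( '#' ):
                continue
            value, name, db_path = line.split( '\t' )
            entries.append( { 'value': value, 'name': name, 'path': db_path } )
        return entries

    for entry in read_loc( 'tool-data/blastdb.loc' ):
        print entry['value'], '->', entry['path']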
diff -r 000000000000 -r c88d28377bd1 tool-data/tool_data_table_conf.xml.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/tool_data_table_conf.xml.sample	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,6 @@
+<tables>
+    <table name="blastdb" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/blastdb.loc" />
+    </table>
+</tables>
diff -r 000000000000 -r c88d28377bd1 tool_dependencies.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="blast+" version="2.2.28">
+        <repository name="package_blast_plus_2_2_28" owner="iuc" />
+    </package>
+</tool_dependency>
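Finally, the long est_... identifier in test-data/est_out.json is just get_dir_hash() applied to the downloaded directory, prefixed with the database name. A quick illustrative check, again assuming Galaxy's lib/ is on PYTHONPATH (importing the script pulls in galaxy.util.json) and running from the repository root::

    import sys
    sys.path.insert( 0, 'data_manager' )  #so fetch_blast_db is importable
    from fetch_blast_db import get_dir_hash

    #the default algorithm is SHA-512, matching the est example above;
    #any hashlib algorithm name can also be passed as a string
    print get_dir_hash( 'test-data' )
    print get_dir_hash( 'test-data', algorithm='md5' )  #shorter digest, for illustration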