changeset 0:c88d28377bd1 default tip

Create an example blastdb Data Manager.
author Daniel Blankenberg <dan@bx.psu.edu>
date Wed, 11 Dec 2013 16:24:11 -0500
parents
children
files README data_manager/blastdb.xml data_manager/fetch_blast_db.py data_manager_conf.xml test-data/est_out.json tool-data/blastdb.loc.sample tool-data/tool_data_table_conf.xml.sample tool_dependencies.xml
diffstat 8 files changed, 213 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,3 @@
+Downloads preformatted BLAST databases and populates the blastdb data table. This is just a simple example to demonstrate the use of Data Managers for processing BLAST databases.
+
+Uses NCBI's update_blastdb.pl script.
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/blastdb.xml	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,48 @@
+<tool id="data_manager_blast_db" name="Blast DB" version="0.0.1" tool_type="manage_data">
+    <description>Downloader</description>
+    <command interpreter="python">fetch_blast_db.py --filename "${out_file}" --tool_data_table_name "blastdb"</command>
+    <requirements>
+        <requirement type="package" version="2.2.28">blast+</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" level="fatal" description="Tool exception" />
+    </stdio>
+    <inputs>
+        <param name="blastdb_name" type="text" label="Blast DB Name" help="try &quot;nt&quot; as an example" optional="False"/>
+        <conditional name="advanced">
+            <param name="advanced_selector" type="select" label="Advanced Options">
+                <option value="basic" selected="True">Basic</option>
+                <option value="advanced">Advanced</option>
+            </param>
+            <when value="basic">
+            </when>
+            <when value="advanced">
+                <param type="text" name="data_description" value="" label="Display name" help="Optional"/>
+                <param type="text" name="data_id" value="" label="ID for sequence" help="Optional"/>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="blastdb_name" value="est"/>
+            <param name="advanced_selector" value="basic"/>
+            <output name="out_file" file="est_out.json"/>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Downloads Blast DBs and updates blastdb tool data tables.
+
+------
+
+
+.. class:: infomark
+
+**Notice:** This is a functional, but basic, tool for fetching preformatted blastdbs.
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/fetch_blast_db.py	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+#Script that calls update_blastdb.pl to download preformatted databases
+
+import optparse
+import os
+import sys
+import subprocess
+import hashlib
+
+from galaxy.util.json import from_json_string, to_json_string
+DEFAULT_ALGORITHM = hashlib.sha512 #hash constructor used by get_dir_hash when no algorithm is passed
+CHUNK_SIZE = 2**20 #1 MiB; read size used by get_dir_hash when no chunk_size is passed
+
+def get_dir_hash( directory, algorithm=None, followlinks=True, chunk_size=None ): #returns a hex digest covering the relative names and file contents found under directory
+    chunk_size = chunk_size or CHUNK_SIZE #falls back to the module default when chunk_size is None or 0
+    algorithm = algorithm or DEFAULT_ALGORITHM
+    if isinstance( algorithm, basestring ): #a string is taken as an algorithm name for hashlib.new; anything else is called as a hash constructor
+        hash = hashlib.new( algorithm )
+    else:
+        hash = algorithm()
+    #we hash a directory by feeding in the relative paths of its directories and files, plus the contents of every file
+    for dirpath, dirnames, filenames in os.walk( directory, followlinks=followlinks ):
+        dirnames.sort() #sorted in place, which also fixes the order in which os.walk descends into the subdirectories
+        filenames.sort()
+        for name in dirnames:
+            hash.update( os.path.relpath( os.path.join( dirpath, name ), directory ) ) #paths are relative to directory, so the location of directory itself is not hashed
+        for name in filenames:
+            filename = os.path.join( dirpath, name )
+            hash.update( os.path.relpath( filename, directory ) )
+            fh = open( filename, 'rb' )
+            while True:
+                data = fh.read( chunk_size ) #read in chunks so that a large file is never held in memory as a whole
+                if not data: #an empty read means end of file
+                    break
+                hash.update( data )
+            fh.close()
+    
+    return hash.hexdigest()
+
+def main(): #reads the job parameters from --filename, downloads the database, then overwrites --filename with the new data table entry
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-f', '--filename', dest='filename', action='store', type='string', default=None, help='filename' )
+    parser.add_option( '-t', '--tool_data_table_name', dest='tool_data_table_name', action='store', type='string', default=None, help='tool_data_table_name' )
+    (options, args) = parser.parse_args()
+    
+    params = from_json_string( open( options.filename ).read() ) #on input the file named by --filename holds the JSON parameters
+    target_directory = params[ 'output_data' ][0]['extra_files_path']
+    os.mkdir( target_directory )
+    
+    blastdb_name = params['param_dict']['blastdb_name'] #name of the preformatted database handed to update_blastdb.pl
+    data_description = params['param_dict']['advanced'].get( 'data_description', None ) #presumably absent unless the advanced options were selected, hence .get
+    data_id = params['param_dict']['advanced'].get( 'data_id', None )
+    
+    cmd_options = [ '--decompress' ]
+    
+    args = [ 'update_blastdb.pl' ] + cmd_options + [ blastdb_name ] #the positional args parsed above are unused; args is rebound to the download command line
+    proc = subprocess.Popen( args=args, shell=False, cwd=target_directory ) #run with target_directory as the working directory, presumably so the downloaded files land there
+    return_code = proc.wait()
+    if return_code != 1: #NOTE(review): only exit status 1 is accepted as success and 0 is treated as an error; confirm against the exit codes documented for the bundled update_blastdb.pl
+        print >> sys.stderr, "Error obtaining blastdb (%s)" % return_code
+        sys.exit( 1 )
+    
+    if not data_id: #default id: the database name plus a hash of the downloaded directory
+        data_id = "%s_%s" % ( blastdb_name, get_dir_hash( target_directory ) )
+    
+    if not data_description:
+        alias_date = None
+        try:
+            for line in open( os.path.join( target_directory, "%s.nal" % ( blastdb_name ) ) ): #only the <blastdb_name>.nal alias file is consulted
+                if line.startswith( '# Alias file created ' ):
+                    alias_date = line.split( '# Alias file created ', 1 )[1].strip()
+                if line.startswith( 'TITLE' ):
+                    data_description = line.split( None, 1 )[1].strip()
+                    break #stop at the first TITLE line; a creation-date line appearing after it is not read
+        except Exception, e: #best effort: a missing or unparsable alias file is reported on stderr but is not fatal
+            print >> sys.stderr, "Error Parsing Alias file for TITLE and date: %s" % ( e )
+        if alias_date and data_description:
+            data_description = "%s (%s)" % ( data_description, alias_date )
+    
+    if not data_description: #last resort: reuse the id as the display name
+        data_description = data_id
+    
+    data_table_entry = { 'value':data_id, 'name':data_description, 'path': os.path.join( blastdb_name, data_id ), 'nucleotide_alias_name': blastdb_name }
+    data_manager_dict = { 'data_tables': { options.tool_data_table_name: [ data_table_entry ]  } } #maps the table name given by --tool_data_table_name to the list of new entries
+    
+    #save info to json file, replacing the parameters that were read from it above
+    with open( options.filename, 'wb' ) as fh:
+        fh.write( to_json_string( data_manager_dict ) )
+
+if __name__ == "__main__": #run main() only when executed as a script, not when imported
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/blastdb.xml" id="ncbi_blast_plus_update_blastdb">
+        <data_table name="blastdb">
+            <output>
+                <column name="value" />
+                <column name="name" />
+                <column name="path" output_ref="out_file" >
+                    <move type="directory">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">blastdb/${path}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/blastdb/${path}/${nucleotide_alias_name}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/est_out.json	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,1 @@
+{"data_tables": {"blastdb": [{"path": "est/est_a3aebb9941bff066cfbd40ebab14c3992f7aadabb64999f3e3b53d783c06f08033ba9066e5efd9380c6bbf9dcec808a281b7a6e9138087cc207c93f2e3ae3f67", "nucleotide_alias_name": "est", "name": "Database of GenBank+EMBL+DDBJ sequences from EST Divisions (12/05/2013 07:12:35)", "value": "est_a3aebb9941bff066cfbd40ebab14c3992f7aadabb64999f3e3b53d783c06f08033ba9066e5efd9380c6bbf9dcec808a281b7a6e9138087cc207c93f2e3ae3f67"}]}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/blastdb.loc.sample	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,38 @@
+#This is a sample file distributed with Galaxy that is used to define a
+#list of nucleotide BLAST databases, using three columns tab separated
+#(longer whitespace are TAB characters):
+#
+#<unique_id>	<database_caption>	<base_name_path>
+#
+#The captions typically contain spaces and might end with the build date.
+#It is important that the actual database name does not have a space in it,
+#and that the last tab that appears in the line is right before the path.
+#
+#So, for example, if your database is nt and the path to your base name 
+#is /depot/data2/galaxy/blastdb/nt/nt.chunk, then the blastdb.loc entry 
+#would look like this:
+#
+#nt_02_Dec_2009      nt 02 Dec 2009      /depot/data2/galaxy/blastdb/nt/nt.chunk
+#
+#and your /depot/data2/galaxy/blastdb/nt directory would contain all of 
+#your "base names" (e.g.):
+#
+#-rw-r--r--  1 wychung galaxy  23437408 2008-04-09 11:26 nt.chunk.00.nhr
+#-rw-r--r--  1 wychung galaxy   3689920 2008-04-09 11:26 nt.chunk.00.nin
+#-rw-r--r--  1 wychung galaxy 251215198 2008-04-09 11:26 nt.chunk.00.nsq
+#...etc...
+#
+#Your blastdb.loc file should include an entry per line for each "base name" 
+#you have stored.  For example:
+#
+#nt_02_Dec_2009		nt 02 Dec 2009		/depot/data2/galaxy/blastdb/nt/nt.chunk
+#wgs_30_Nov_2009	wgs 30 Nov 2009	/depot/data2/galaxy/blastdb/wgs/wgs.chunk
+#test_20_Sep_2008	test 20 Sep 2008	/depot/data2/galaxy/blastdb/test/test
+#...etc...
+#
+#See also blastdb_p.loc which is for any protein BLAST database.
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter.
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/tool_data_table_conf.xml.sample	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,6 @@
+<tables>
+    <table name="blastdb" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/blastdb.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Wed Dec 11 16:24:11 2013 -0500
@@ -0,0 +1,6 @@
+<tool_dependency>
+    <package name="blast+" version="2.2.28">
+        <repository toolshed="http://testtoolshed.g2.bx.psu.edu" name="package_blast_plus_2_2_28" owner="iuc" changeset_revision="5a449da71d08" />
+    </package>
+</tool_dependency>
+