changeset 0:99091a5d5c84 draft

Uploaded
author bgruening
date Wed, 04 Sep 2013 05:10:04 -0400
parents
children 64a1fb09b10d
files EDeN_feature.xml EDeN_nearest_neighbor.xml EDeN_test.xml EDeN_train.xml README.rst datatypes_conf.xml eden.py mol2gspan.py mol2gspan.xml test-data/3_molecules.gspan test-data/3_molecules.sdf tool_dependencies.xml
diffstat 12 files changed, 1130 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/EDeN_feature.xml	Wed Sep 04 05:10:04 2013 -0400
@@ -0,0 +1,273 @@
+<tool id="bg_eden_feature" name="EDeN Converter" version="0.1">
+    <description></description>
+    <requirements>
+        <requirement type="package" version="2.3.2">openbabel</requirement>
+        <requirement type="set_environment">EDEN_SCRIPT_PATH</requirement>
+    </requirements>
+    <command>
+
+        ## Builds the EDeN FEATURE call. Molecule inputs (SDF, SMILES) are first converted
+        ## to the gSpan graph format with Open Babel and mol2gspan.py; all other input
+        ## types are handed to EDeN directly.
+
+        #import tempfile
+        #set $selected_type = str( $file_type_opts.file_type_opts_selector )
+        #set $is_molecule = $selected_type in ( 'sdf', 'smi' )
+
+        #if $is_molecule:
+            ## the temporary gSpan file is only created when a conversion is really needed
+            #set $temp_file = tempfile.NamedTemporaryFile( delete=False )
+            #silent $temp_file.close()
+            #set $temp_gspan = $temp_file.name
+            ## after the conversion EDeN reads a gSpan graph file
+            #set $file_type = 'GRAPH'
+            obabel -i $selected_type -o sdf $infile | \$EDEN_SCRIPT_PATH/mol2gspan.py --infile - --outfile $temp_gspan
+            ;
+        #else:
+            ## no conversion step, so no command separator is written here: a command line
+            ## starting with a separator is a shell syntax error
+            #set $temp_gspan = $infile
+            #set $file_type = $selected_type
+        #end if
+
+        EDeN --action FEATURE
+
+        --input_data_file_name $temp_gspan
+        --model_file_name $outfile
+
+        ## molecule inputs are passed as GRAPH after their conversion to the gSpan format
+        --file_type $file_type
+
+        --binary_file_type ## create a binary sparse vector as output
+        --kernel_type $kernel_type  ##NSPDK | WDK | PBK | USPK | DDK | NSDDK | ANSDDK | SK [NSPDK]
+        --graph_type $graph_type    ##DIRECTED | UNDIRECTED [UNDIRECTED]
+
+        $no_normalization
+        $min_kernel
+
+        --hash_bit_size $hash_bit_size
+        --radius $radius
+        --distance $distance
+        --vertex_degree_threshold $vertex_degree_threshold
+
+        #if $selected_type == 'SEQUENCE':
+
+            ## the sequence parameters are declared inside the file_type_opts conditional
+            ## and have to be addressed through it
+            --sequence_degree $file_type_opts.sequence_degree
+            $file_type_opts.sequence_token
+            $file_type_opts.sequence_multi_line
+            $file_type_opts.sequence_pairwise_interaction
+
+        #end if
+
+        --tree_lambda $tree_lambda
+        --radius_two $radius_two
+
+
+        ### Adds rescaled features from nearest neighbors ###
+
+        #if $smooth_opts.smooth_opts_selector == 'smooth':
+            --smooth
+            --smooth_param $smooth_opts.smoother_param
+
+            --row_index_file_name $row_index_file_name
+            --col_index_file_name $col_index_file_name
+            --num_hash_functions $smooth_opts.num_hash_functions
+            --num_repeat_hash_functions $smooth_opts.num_repeat_hash_functions
+            --max_size_bin $smooth_opts.max_size_bin
+            --eccess_neighbour_size_factor $smooth_opts.eccess_neighbour_size_factor
+            --num_nearest_neighbours $smooth_opts.num_nearest_neighbours
+            $smooth_opts.shared_neighborhood
+            $smooth_opts.no_neighborhood_cache
+            $smooth_opts.no_minhash_cache
+        #end if
+
+        #if $is_molecule:
+            ## remove only the converted temporary copy, never the input dataset itself
+            ;
+            rm $temp_gspan
+        #end if
+
+
+    </command>
+    <inputs>
+        <param format="smi,gspan,inchi,sdf,mol,mol2,txt" name="infile" type="data" label="Input file" 
+            help="File can contain molecule data types (SMILES, InChI, SDF) or Graph datatypes (gSpan, sparse vector, sequence)."/>
+
+        <conditional name="file_type_opts">
+            <param name="file_type_opts_selector" type="select" display="radio" label="Type of Input file">
+                <option value="GRAPH">Graph</option>
+                <option value="SPARSE_VECTOR">sparse vector</option>
+                <option value="SEQUENCE">Sequence</option>
+                <option value="sdf">SDF</option>
+                <option value="smi">SMILES</option>
+            </param>
+            <when value="GRAPH" />
+            <when value="SPARSE_VECTOR" />
+            <when value="SEQUENCE">
+                <param name="sequence_degree" type="integer" value="1" label="Sequence degree" help="">
+                    <validator type="in_range" min="1" />
+                </param>
+                <param name="sequence_token" type="boolean" label="Sequence token" truevalue="--sequence_token" falsevalue="" checked="false" />
+                <param name="sequence_multi_line" type="boolean" label="Sequence is in multi-line notation" truevalue="--sequence_multi_line" falsevalue="" checked="false" />
+                <param name="sequence_pairwise_interaction" type="boolean" label="Sequence pairwise iterations" truevalue="--sequence_pairwise_interaction" falsevalue="" checked="false" />
+            </when>
+            <when value="sdf" />
+            <when value="smi" />
+        </conditional>
+
+        <!-- Kernel passed to EDeN via the kernel_type option. NSDDK and ANSDDK are two
+             different kernels, so each one gets its own option. -->
+        <param name="kernel_type" type="select" display="radio" label="Type of the Kernel">
+            <option value="NSPDK">NSPDK</option>
+            <option value="WDK">WDK</option>
+            <option value="PBK">PBK</option>
+            <option value="USPK">USPK</option>
+            <option value="DDK">DDK</option>
+            <option value="NSDDK">NSDDK</option>
+            <option value="ANSDDK">ANSDDK</option>
+            <option value="SK">SK</option>
+        </param>
+
+        <param name="graph_type" type="select" display="radio" label="Type of Graph">
+            <option value="DIRECTED">directed</option>
+            <option value="UNDIRECTED">undirected</option>
+        </param>
+
+
+        <!-- Optional smoothing: the parameters of the "smooth" case are only written to the
+             EDeN command line when smoothing is enabled. -->
+        <conditional name="smooth_opts">
+            <param name="smooth_opts_selector" type="select" label="Adds rescaled features from nearest neighbors (--smooth)">
+              <option value="non_smooth" selected="True">Disable smooth</option>
+              <option value="smooth">Enable smooth</option>
+            </param>
+            <when value="non_smooth" />
+            <when value="smooth">
+
+                <param name="smoother_param" type="float" value="0.95" label="Scaling features from neighbors"
+                    help="Features from neighbors are scaled by the kernel value to the power value assigned to this switch.">
+                    <validator type="in_range" min="0.0" />
+                </param>
+
+                <!-- all three switches use the double-dash form, like every other EDeN option of this tool -->
+                <param name="no_minhash_cache" type="boolean" label="Deactivate minhash cache" truevalue="--no_minhash_cache" falsevalue="" checked="false" />
+                <param name="no_neighborhood_cache" type="boolean" label="Deactivate neighborhood cache" truevalue="--no_neighborhood_cache" falsevalue="" checked="false" />
+                <param name="shared_neighborhood" type="boolean" label="Activate shared neighborhood" truevalue="--shared_neighborhood" falsevalue="" checked="false" />
+
+                <param name="num_hash_functions" type="integer" value="400" label="Number of hash functions" help="">
+                    <validator type="in_range" min="1" />
+                </param>
+                <param name="num_repeat_hash_functions" type="integer" value="10" label="Number of repeats for each hash functions" help="">
+                    <validator type="in_range" min="1" />
+                </param>
+                <param name="max_size_bin" type="float" value="0.3" label="Maximum size of one bin"
+                    help="Expressed as the maximum fraction of the dataset size. When a bin contains references to more instances than this quantity, the bin is erased. The rationale is that this feature is common to too many instances and it is therefore not informative. Moreover the runtimes become non sub-linear if a significant fraction of the dataset size has to be checked.">
+                    <validator type="in_range" min="0.0" />
+                </param>
+                <!-- the parameter name keeps the spelling of the EDeN option it is passed to -->
+                <param name="eccess_neighbour_size_factor" type="float" value="5.0" label="Excess neighborhood size factor"
+                    help="Expressed as a multiplicative factor w.r.t. the neighborhood size required. It means that the approximate neighborhood query stops at the X most frequent instances, where X = eccess_neighbor_size_factor * neighborhood size.">
+                    <validator type="in_range" min="0.0" />
+                </param>
+                <param name="num_nearest_neighbours" type="integer" value="10" label="Number of nearest neighbors" help="">
+                    <validator type="in_range" min="1" />
+                </param>
+
+                <!-- row_index and col_index are expanded into index files by the configfiles of this tool -->
+                <param name="row_index" type="text" size="30" label="Row indices of your input file that should be converted"
+                    help="Specify a subset of your dataset by providing the row indices that should be taken into account.">
+                    <sanitizer>
+                        <valid initial="string.digits">
+                            <add value="," />
+                            <add value="-" />
+                            <add value=" " />
+                        </valid>
+                    </sanitizer>
+                    <validator type="empty_field" message="You need to specify row indices"/>
+                </param>
+                <param name="col_index" type="text" size="30" label="Column indices of your input file that should be converted"
+                    help="Specify a subset of your dataset by providing the column indices that should be taken into account.">
+                    <sanitizer>
+                        <valid initial="string.digits">
+                            <add value="," />
+                            <add value="-" />
+                            <add value=" " />
+                        </valid>
+                    </sanitizer>
+                    <validator type="empty_field" message="You need to specify column indices"/>
+                </param>
+
+            </when>
+        </conditional>
+
+        <param name="no_normalization" type="boolean" label="Skip normalization" truevalue="--no_normalization" falsevalue="" checked="false" />
+        <param name="min_kernel" type="boolean" label="Use minimal kernel" truevalue="--min_kernel" falsevalue="" checked="false" />
+
+        <param name="hash_bit_size" type="integer" value="15" label="Bit size of the used hashing function" help="">
+            <validator type="in_range" min="1" />
+        </param>
+        <param name="radius" type="integer" value="2" label="Radius that defines a neighborhood" help="">
+            <validator type="in_range" min="1" />
+        </param>
+        <param name="distance" type="integer" value="5" label="Distance that defines a neighborhood" help="">
+            <validator type="in_range" min="1" />
+        </param>
+        <param name="vertex_degree_threshold" type="integer" value="7" label="Vertex degree threshold" help="">
+            <validator type="in_range" min="1" />
+        </param>
+        <param name="radius_two" type="integer" value="2" label="Radius Two" help="">
+            <validator type="in_range" min="1" />
+        </param>
+        <param name="tree_lambda" type="float" value="1.2" label="Tree lambda" help="">
+            <validator type="in_range" min="0.0" />
+        </param>
+
+    </inputs>
+    <configfiles>
+<!-- The strange indentation is necessary, otherwise we get line breaks or white space in our file -->
+<!-- Both config files are only filled when smoothing is enabled. The comma separated text of
+     row_index / col_index is written with one index per line; an item of the form "a-b" is
+     expanded with range( a, b ), every other item is written as it is. -->
+<configfile name="row_index_file_name">#if $smooth_opts.smooth_opts_selector == 'smooth':
+                #for $element in str( $smooth_opts.row_index ).split(','):
+                    #set $element = $element.strip().split('-')
+                    #if len($element) == 2:
+                        #for $index in range( int($element[0]), int($element[1]) ):
+                            ## NOTE(review): range() stops before its second argument, so an item
+                            ## like 3-6 writes 3, 4 and 5; confirm that an exclusive end is intended
+                            ## the following writes the value at the beginning of each line
+                            ## #echo $index# inserts a line break automatically, but do not write it
+                            ## to the beginning of the line
+                            #echo '%s\n' % $index
+                        #end for
+                    #else:
+                        #echo '%s\n' % $element[0]
+                    #end if
+                #end for
+            #end if
+</configfile>
+<configfile name="col_index_file_name">#if $smooth_opts.smooth_opts_selector == 'smooth':
+                #for $element in str( $smooth_opts.col_index ).split(','):
+                    #set $element = $element.strip().split('-')
+                    #if len($element) == 2:
+                        #for $index in range( int($element[0]), int($element[1]) ):
+                            ## NOTE(review): range() stops before its second argument, so an item
+                            ## like 3-6 writes 3, 4 and 5; confirm that an exclusive end is intended
+                            ## the following writes the value at the beginning of each line
+                            ## #echo $index# inserts a line break automatically, but do not write it
+                            ## to the beginning of the line
+                            #echo '%s\n' % $index
+                        #end for
+                    #else:
+                        #echo '%s\n' % $element[0]
+                    #end if
+                #end for
+            #end if
+</configfile>
+    </configfiles>
+    <outputs>
+        <data format="eden_sparse_vector" name="outfile" label="Sparse vector from ${on_string}"/>
+    </outputs>
+    <tests>
+        <!-- the input file name has to match the file shipped as test-data/3_molecules.sdf -->
+        <test>
+            <param name="infile" value="3_molecules.sdf" />
+            <output name="outfile" file="3_molecules.gspan" />
+        </test>
+    </tests>
+    <help>
+
+.. class:: infomark
+
+**What it does** 
+
+The linear model is induced using the accelerated stochastic gradient descent technique by Léon Bottou and Yann LeCun.
+When the target information is 0, a self-training algorithm is used to impute a positive or negative class to the unsupervised instances.
+If the target information is imbalanced a minority class resampling technique is used to rebalance the training set.
+
+This tool is part of the EDeN (Explicit Decomposition with Neighborhoods) suite, developed by Fabrizio Costa.
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/EDeN_nearest_neighbor.xml	Wed Sep 04 05:10:04 2013 -0400
@@ -0,0 +1,60 @@
+<tool id="bg_eden_nearest_neighbor" name="EDeN Nearest Neighbors" version="0.1">
+    <description></description>
+    <requirements>
+    </requirements>
+    <command>
+        ## Runs EDeN in NEAREST_NEIGHBOR mode on a binary sparse vector input together with a target file.
+        ## NOTE(review): the output dataset named outfile, declared in the outputs section of this tool,
+        ## is not referenced anywhere in this command; confirm how the EDeN result reaches that dataset.
+        EDeN --action NEAREST_NEIGHBOR
+        
+        --input_data_file_name $infile
+        --file_type "SPARSE_VECTOR"
+        --binary_file_type
+
+        --target_file_name $target_infile
+        ##--model_file_name [model] ????????????????????
+
+        --kernel_type $kernel_type
+        --graph_type $graph_type
+
+    </command>
+    <inputs>
+
+        <param format="eden_sparse_vector" name="infile" type="data" label="Input Graph" help=""/>
+        <param format="txt" name="target_infile" type="data" label="Target file" help=""/>
+
+        <!-- Kernel passed to EDeN via the kernel_type option. NSDDK and ANSDDK are two
+             different kernels, so each one gets its own option. -->
+        <param name="kernel_type" type="select" display="radio" label="Type of the Kernel">
+            <option value="NSPDK">NSPDK</option>
+            <option value="WDK">WDK</option>
+            <option value="PBK">PBK</option>
+            <option value="USPK">USPK</option>
+            <option value="DDK">DDK</option>
+            <option value="NSDDK">NSDDK</option>
+            <option value="ANSDDK">ANSDDK</option>
+            <option value="SK">SK</option>
+        </param>
+
+        <param name="graph_type" type="select" display="radio" label="Type of Graph">
+            <option value="DIRECTED">directed</option>
+            <option value="UNDIRECTED">undirected</option>
+        </param>
+
+    </inputs>
+    <outputs>
+        <data format="gspan" name="outfile" label="gSpan from ${on_string}"/>
+    </outputs>
+    <tests>
+        <!-- the input file name has to match the file shipped as test-data/3_molecules.sdf -->
+        <!-- NOTE(review): this tool declares eden_sparse_vector and target inputs; confirm the test data -->
+        <test>
+            <param name="infile" value="3_molecules.sdf" />
+            <output name="outfile" file="3_molecules.gspan" />
+        </test>
+    </tests>
+    <help>
+
+.. class:: infomark
+
+**What it does** 
+
+Nearest neighbors are efficiently identified with a locality sensitive hashing technique.
+
+This tool is part of the EDeN (Explicit Decomposition with Neighborhoods) suite, developed by Fabrizio Costa.
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/EDeN_test.xml	Wed Sep 04 05:10:04 2013 -0400
@@ -0,0 +1,57 @@
+<tool id="bg_eden_test" name="EDeN Test" version="0.1">
+    <description></description>
+    <requirements>
+    </requirements>
+    <command>
+        ## Runs EDeN in TEST mode on a binary sparse vector input.
+        ## NOTE(review): the value given to model_file_name is the output dataset of this tool and no
+        ## model input parameter is declared; confirm whether a trained model has to be supplied here.
+        EDeN --action TEST
+        
+        --input_data_file_name $infile
+        --model_file_name $model_outfile
+
+        --file_type "SPARSE_VECTOR"
+        --binary_file_type
+
+        --kernel_type $kernel_type
+        --graph_type $graph_type
+
+    </command>
+    <inputs>
+        <param format="eden_sparse_vector" name="infile" type="data" label="Input Graph" help=""/>
+
+        <!-- Kernel passed to EDeN via the kernel_type option. NSDDK and ANSDDK are two
+             different kernels, so each one gets its own option. -->
+        <param name="kernel_type" type="select" display="radio" label="Type of the Kernel">
+            <option value="NSPDK">NSPDK</option>
+            <option value="WDK">WDK</option>
+            <option value="PBK">PBK</option>
+            <option value="USPK">USPK</option>
+            <option value="DDK">DDK</option>
+            <option value="NSDDK">NSDDK</option>
+            <option value="ANSDDK">ANSDDK</option>
+            <option value="SK">SK</option>
+        </param>
+
+        <param name="graph_type" type="select" display="radio" label="Type of Graph">
+            <option value="DIRECTED">directed</option>
+            <option value="UNDIRECTED">undirected</option>
+        </param>
+
+    </inputs>
+    <outputs>
+        <data format="txt" name="model_outfile" label="Generated  from ${on_string}"/>
+    </outputs>
+    <tests>
+        <test>
+        </test>
+    </tests>
+    <help>
+
+.. class:: infomark
+
+**What it does** 
+
+The linear model is induced using the accelerated stochastic gradient descent technique by Léon Bottou and Yann LeCun.
+When the target information is 0, a self-training algorithm is used to impute a positive or negative class to the unsupervised instances.
+If the target information is imbalanced a minority class resampling technique is used to rebalance the training set.
+
+This tool is part of the EDeN (Explicit Decomposition with Neighborhoods) suite, developed by Fabrizio Costa.
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/EDeN_train.xml	Wed Sep 04 05:10:04 2013 -0400
@@ -0,0 +1,87 @@
+<tool id="bg_eden_train" name="EDeN Train" version="0.1">
+    <description></description>
+    <requirements>
+    </requirements>
+    <command>
+        ## Runs EDeN in TRAIN mode on a binary sparse vector input and a target file and
+        ## passes the output dataset as model file name.
+        ## NOTE(review): sparsification_num_iterations, topological_regularization_num_neighbors,
+        ## topological_regularization_decay_rate, num_iterations, threshold, only_positive,
+        ## only_negative and random_seed are referenced below, but no parameter with these names
+        ## is declared in the inputs section of this tool. The declared parameters kernel_type
+        ## and graph_type are not used in this command.
+        EDeN --action TRAIN
+
+        --input_data_file_name $infile
+        --file_type "SPARSE_VECTOR"
+        --binary_file_type
+
+        ## TODO: we need a tool that creates such a file, maybe from the metadata of an SDF file
+        ## target_file_name is a file with 1 or -1 one in each row, indicating the class
+        --target_file_name $target_infile
+        --model_file_name $model_outfile
+
+        --lambda $lambda ##??? notation?
+        --epochs $epoch
+        
+        --sparsification_num_iterations $sparsification_num_iterations
+        --topological_regularization_num_neighbors $topological_regularization_num_neighbors
+        --topological_regularization_decay_rate $topological_regularization_decay_rate
+
+        --num_iterations $num_iterations
+        --threshold $threshold
+        --only_positive $only_positive
+        --only_negative $only_negative
+
+        --random_seed $random_seed
+
+    </command>
+    <inputs>
+        <param format="eden_sparse_vector" name="infile" type="data" label="Input Graph" help=""/>
+        <param format="txt" name="target_infile" type="data" label="Target file" help=""/>
+
+        <!-- Kernel choices of EDeN. NSDDK and ANSDDK are two different kernels, so each one
+             gets its own option. -->
+        <param name="kernel_type" type="select" display="radio" label="Type of the Kernel">
+            <option value="NSPDK">NSPDK</option>
+            <option value="WDK">WDK</option>
+            <option value="PBK">PBK</option>
+            <option value="USPK">USPK</option>
+            <option value="DDK">DDK</option>
+            <option value="NSDDK">NSDDK</option>
+            <option value="ANSDDK">ANSDDK</option>
+            <option value="SK">SK</option>
+        </param>
+
+        <param name="graph_type" type="select" display="radio" label="Type of Graph">
+            <option value="DIRECTED">directed</option>
+            <option value="UNDIRECTED">undirected</option>
+        </param>
+
+        <param name="epoch" type="integer" value="10" label="Epoch, Stochastic gradient descend algorithm." help="">
+            <validator type="in_range" min="1" />
+        </param>
+        <param name="lambda" type="text" value="1e-4" label="lambda, Stochastic gradient descend algorithm." help="" />
+
+    </inputs>
+    <outputs>
+        <data format="txt" name="model_outfile" label="Train Model from ${on_string}"/>
+    </outputs>
+    <tests>
+        <!-- the input file name has to match the file shipped as test-data/3_molecules.sdf -->
+        <!-- NOTE(review): the only declared output of this tool is model_outfile and a target
+             file is required; confirm the test data for this tool -->
+        <test>
+            <param name="infile" value="3_molecules.sdf" />
+            <output name="outfile" file="3_molecules.gspan" />
+        </test>
+    </tests>
+    <help>
+
+.. class:: infomark
+
+**What it does** 
+
+The linear model is induced using the accelerated stochastic gradient descent technique by Léon Bottou and Yann LeCun.
+When the target information is 0, a self-training algorithm is used to impute a positive or negative class to the unsupervised instances.
+If the target information is imbalanced a minority class resampling technique is used to rebalance the training set.
+
+This tool is part of the EDeN (Explicit Decomposition with Neighborhoods) suite, developed by Fabrizio Costa.
+
+
+REFERENCES
+==========
+
+The code for Stochastic Gradient Descent SVM is adapted from http://leon.bottou.org/projects/sgd. Léon Bottou and Yann LeCun, ''Large Scale Online Learning'', Advances in Neural Information Processing Systems 16, Edited by Sebastian Thrun, Lawrence Saul and Bernhard Schölkopf, MIT Press, Cambridge, MA, 2004.
+
+
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst	Wed Sep 04 05:10:04 2013 -0400
@@ -0,0 +1,40 @@
+Galaxy wrappers and definitions for EDeN
+========================================
+
+
+
+History
+=======
+
+
+
+
+Installation
+============
+
+Doing this automatically via the Galaxy Tool Shed is probably simplest.
+
+
+Licence (MIT)
+=============
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+NOTE: This is the licence for the EDeN wrappers **only**. EDeN
+and associated data files are available and licenced separately.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml	Wed Sep 04 05:10:04 2013 -0400
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+<datatypes>
+    <datatype_files>
+        <datatype_file name="eden.py"/>
+    </datatype_files>
+    <!-- Every datatype needs its own extension. The EDeN tools refer to the sparse vector
+         format as "eden_sparse_vector", so SparseVector is registered under that name. -->
+    <registration>
+        <datatype extension="gspan" type="galaxy.datatypes.eden:Gspan" mimetype="application/octet-stream" subclass="True" display_in_upload="false"/>
+        <datatype extension="eden_sparse_vector" type="galaxy.datatypes.eden:SparseVector" mimetype="application/octet-stream" subclass="True" display_in_upload="false"/>
+    </registration>
+</datatypes>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/eden.py	Wed Sep 04 05:10:04 2013 -0400
@@ -0,0 +1,40 @@
+"""
+EDeN filetypes
+"""
+
+from galaxy.datatypes import data
+from galaxy.datatypes.data import Binary
+
+
+class Gspan( Binary ):
+    """Class describing an gSpan file"""
+    file_ext = "gspan"
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        if not dataset.dataset.purged:
+            dataset.peek  = "gSpan" 
+            dataset.blurb = data.nice_size( dataset.get_size() )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+    def display_peek( self, dataset ):
+        try:
+            return dataset.peek
+        except:
+            return "Binary gSpan file (%s)" % ( data.nice_size( dataset.get_size() ) )
+
+class SparseVector( Binary ):
+    """Class describing an SparseVector file"""
+    file_ext = "sparse"
+
+    def set_peek( self, dataset, is_multi_byte=False ):
+        if not dataset.dataset.purged:
+            dataset.peek  = "SparseVector" 
+            dataset.blurb = data.nice_size( dataset.get_size() )
+        else:
+            dataset.peek = 'file does not exist'
+            dataset.blurb = 'file purged from disk'
+    def display_peek( self, dataset ):
+        try:
+            return dataset.peek
+        except:
+            return "Binary SparseVector file (%s)" % ( data.nice_size( dataset.get_size() ) )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mol2gspan.py	Wed Sep 04 05:10:04 2013 -0400
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+import os, sys
+import argparse
+
+def main(args ):
+
+    begin = True
+    iid = 0
+    graph_counter = 1
+
+    for line in args.infile:
+        if line.rstrip():
+            if line.strip().endswith('END'):
+                begin = False
+            elif line.strip() == '$$$$':
+                graph_counter += 1
+                iid = 0
+            else:
+                # found header line, like:  21 21  0  0  0  0  0  0  0  0999 V2000
+                if len(line.split()) >= 5 and line.split()[-1] == 'V2000':
+                    args.outfile.write('t # id %s\n' % graph_counter)
+                    begin=True
+                    continue
+                # connection or coordinate/atom table
+                if len(line.split()) >= 4 and begin:
+                    # coordinate/atom table
+                    if line.split()[3].isalpha():
+                        args.outfile.write( 'v %s %s \n' % (iid, line.split()[3]) )
+                        iid += 1
+                    else:
+                        #connection table
+                        id, node, edge, trash = line.split(None, 3)
+                        args.outfile.write( 'e %s %s %s\n' % ( int(id) - 1 , int(node) -1, edge ) )
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--infile', nargs='?', type=argparse.FileType('r'),
+        default=sys.stdin, help="Specify one or more input files")
+    parser.add_argument('--outfile', type=argparse.FileType('w'),
+        default=sys.stdout, help="Specify one output file")
+    args = parser.parse_args()
+    main( args )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mol2gspan.xml	Wed Sep 04 05:10:04 2013 -0400
@@ -0,0 +1,32 @@
+<tool id="bg_mol2gspan" name="Molecule to gSpan" version="0.1">
+    <description>converter</description>
+    <requirements>
+        <requirement type="package" version="2.3.2">openbabel</requirement>
+        <requirement type="set_environment">EDEN_SCRIPT_PATH</requirement>
+    </requirements>
+    <command>
+        ## Converts the input with Open Babel to SDF and translates the SDF stream to gSpan.
+        ## The Open Babel input format is taken from the datatype of the selected dataset
+        ## (smi, sdf, mol, inchi or mol2) instead of always assuming SMILES.
+        obabel -i ${infile.ext} -o sdf $infile | \$EDEN_SCRIPT_PATH/mol2gspan.py --infile - --outfile $outfile
+    </command>
+    <inputs>
+        <param format="smi,sdf,mol,inchi,mol2" name="infile" type="data" 
+            label="Input molecules" help=""/>
+    </inputs>
+    <outputs>
+        <data format="gspan" name="outfile" label="gSpan from ${on_string}"/>
+    </outputs>
+    <tests>
+        <!-- the input file name has to match the file shipped as test-data/3_molecules.sdf -->
+        <test>
+            <param name="infile" value="3_molecules.sdf" />
+            <output name="outfile" file="3_molecules.gspan" />
+        </test>
+    </tests>
+    <help>
+
+.. class:: infomark
+
+**What it does** 
+
+This tool converts arbitrary molecule files to the gSpan format.
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/3_molecules.gspan	Wed Sep 04 05:10:04 2013 -0400
@@ -0,0 +1,129 @@
+t # id 1
+v 0 O 
+v 1 O 
+v 2 O 
+v 3 O 
+v 4 C 
+v 5 C 
+v 6 C 
+v 7 C 
+v 8 C 
+v 9 C 
+v 10 C 
+v 11 C 
+v 12 C 
+v 13 H 
+v 14 H 
+v 15 H 
+v 16 H 
+v 17 H 
+v 18 H 
+v 19 H 
+v 20 H 
+e 0 4 1
+e 0 11 1
+e 1 10 1
+e 1 20 1
+e 2 10 2
+e 3 11 2
+e 4 5 1
+e 4 6 2
+e 5 7 2
+e 5 10 1
+e 6 8 1
+e 6 13 1
+e 7 9 1
+e 7 14 1
+e 8 9 2
+e 8 15 1
+e 9 16 1
+e 11 12 1
+e 12 17 1
+e 12 18 1
+e 12 19 1
+t # id 2
+v 0 O 
+v 1 O 
+v 2 O 
+v 3 O 
+v 4 C 
+v 5 C 
+v 6 C 
+v 7 C 
+v 8 C 
+v 9 C 
+v 10 C 
+v 11 C 
+v 12 C 
+v 13 H 
+v 14 H 
+v 15 H 
+v 16 H 
+v 17 H 
+v 18 H 
+v 19 H 
+v 20 H 
+e 0 4 1
+e 0 11 1
+e 1 10 1
+e 1 20 1
+e 2 10 2
+e 3 11 2
+e 4 5 1
+e 4 6 2
+e 5 7 2
+e 5 10 1
+e 6 8 1
+e 6 13 1
+e 7 9 1
+e 7 14 1
+e 8 9 2
+e 8 15 1
+e 9 16 1
+e 11 12 1
+e 12 17 1
+e 12 18 1
+e 12 19 1
+t # id 3
+v 0 O 
+v 1 O 
+v 2 O 
+v 3 O 
+v 4 C 
+v 5 C 
+v 6 C 
+v 7 C 
+v 8 C 
+v 9 C 
+v 10 C 
+v 11 C 
+v 12 C 
+v 13 H 
+v 14 H 
+v 15 H 
+v 16 H 
+v 17 H 
+v 18 H 
+v 19 H 
+v 20 H 
+e 0 4 1
+e 0 11 1
+e 1 10 1
+e 1 20 1
+e 2 10 2
+e 3 11 2
+e 4 5 1
+e 4 6 2
+e 5 7 2
+e 5 10 1
+e 6 8 1
+e 6 13 1
+e 7 9 1
+e 7 14 1
+e 8 9 2
+e 8 15 1
+e 9 16 1
+e 11 12 1
+e 12 17 1
+e 12 18 1
+e 12 19 1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/3_molecules.sdf	Wed Sep 04 05:10:04 2013 -0400
@@ -0,0 +1,350 @@
+
+ Chemfp
+
+ 21 21  0  0  0  0  0  0  0  0999 V2000
+    1.2333    0.5540    0.7792 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.6952   -2.7148   -0.7502 O   0  0  0  0  0  0  0  0  0  0  0  0
+    0.7958   -2.1843    0.8685 O   0  0  0  0  0  0  0  0  0  0  0  0
+    1.7813    0.8105   -1.4821 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.0857    0.6088    0.4403 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.7927   -0.5515    0.1244 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.7288    1.8464    0.4133 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.1426   -0.4741   -0.2184 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.0787    1.9238    0.0706 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.7855    0.7636   -0.2453 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1409   -1.8536    0.1477 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.1094    0.6715   -0.3113 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.5305    0.5996    0.1635 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1851    2.7545    0.6593 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.7247   -1.3605   -0.4564 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.5797    2.8872    0.0506 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.8374    0.8238   -0.5090 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.7290    1.4184    0.8593 H   0  0  0  0  0  0  0  0  0  0  0  0
+    4.2045    0.6969   -0.6924 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.7105   -0.3659    0.6426 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.2555   -3.5916   -0.7337 H   0  0  0  0  0  0  0  0  0  0  0  0
+  1  5  1  0  0  0  0
+  1 12  1  0  0  0  0
+  2 11  1  0  0  0  0
+  2 21  1  0  0  0  0
+  3 11  2  0  0  0  0
+  4 12  2  0  0  0  0
+  5  6  1  0  0  0  0
+  5  7  2  0  0  0  0
+  6  8  2  0  0  0  0
+  6 11  1  0  0  0  0
+  7  9  1  0  0  0  0
+  7 14  1  0  0  0  0
+  8 10  1  0  0  0  0
+  8 15  1  0  0  0  0
+  9 10  2  0  0  0  0
+  9 16  1  0  0  0  0
+ 10 17  1  0  0  0  0
+ 12 13  1  0  0  0  0
+ 13 18  1  0  0  0  0
+ 13 19  1  0  0  0  0
+ 13 20  1  0  0  0  0
+M  END
+>  <PUBCHEM_COMPOUND_CID>
+2244
+
+>  <PUBCHEM_CONFORMER_RMSD>
+0.6
+
+>  <PUBCHEM_CONFORMER_DIVERSEORDER>
+1
+11
+10
+3
+15
+17
+13
+5
+16
+7
+14
+9
+8
+4
+18
+6
+12
+2
+
+>  <PUBCHEM_MMFF94_PARTIAL_CHARGES>
+18
+1 -0.23
+10 -0.15
+11 0.63
+12 0.66
+13 0.06
+14 0.15
+15 0.15
+16 0.15
+17 0.15
+2 -0.65
+21 0.5
+3 -0.57
+4 -0.57
+5 0.08
+6 0.09
+7 -0.15
+8 -0.15
+9 -0.15
+
+>  <PUBCHEM_EFFECTIVE_ROTOR_COUNT>
+3
+
+>  <PUBCHEM_CONFORMER_ID>
+000008C400000001
+
+>  <PUBCHEM_MMFF94_ENERGY>
+39.5952
+
+>  <PUBCHEM_FEATURE_SELFOVERLAP>
+25.432
+
+>  <PUBCHEM_SHAPE_FINGERPRINT>
+1 1 18265615372930943622
+100427 49 16967750034970055351
+12138202 97 18271247217817981012
+12423570 1 16692715976000295083
+12524768 44 16753525617747228747
+12716758 59 18341332292274886536
+13024252 1 17968377969333732145
+14181834 199 17830728755827362645
+14614273 12 18262232214645093005
+15207287 21 17703787037639964108
+15775835 57 18340488876329928641
+16945 1 18271533103414939405
+193761 8 17907860604865584321
+20645476 183 17677348215414174190
+20871998 184 18198632231250704846
+21040471 1 18411412921197846465
+21501502 16 18123463883164380929
+23402539 116 18271795865171824860
+23419403 2 13539898140662769886
+23552423 10 18048876295495619569
+23559900 14 18272369794190581304
+241688 4 16179044415907240795
+257057 1 17478316999871287486
+2748010 2 18339085878070479087
+305870 269 18263645056784260212
+528862 383 18117272558388284091
+53812653 8 18410289211719108569
+7364860 26 17910392788380644719
+81228 2 18050568744116491203
+
+>  <PUBCHEM_SHAPE_MULTIPOLES>
+244.06
+3.86
+2.45
+0.89
+1.95
+1.58
+0.15
+-1.85
+0.38
+-0.61
+-0.02
+0.29
+0.01
+-0.33
+
+>  <PUBCHEM_SHAPE_SELFOVERLAP>
+513.037
+
+>  <PUBCHEM_SHAPE_VOLUME>
+136
+
+>  <PUBCHEM_COORDINATE_TYPE>
+2
+5
+10
+
+$$$$
+2244
+ OpenBabel09021316243D
+
+ 21 21  0  0  0  0  0  0  0  0999 V2000
+    1.2333    0.5540    0.7792 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.6952   -2.7148   -0.7502 O   0  0  0  0  0  0  0  0  0  0  0  0
+    0.7958   -2.1843    0.8685 O   0  0  0  0  0  0  0  0  0  0  0  0
+    1.7813    0.8105   -1.4821 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.0857    0.6088    0.4403 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.7927   -0.5515    0.1244 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.7288    1.8464    0.4133 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.1426   -0.4741   -0.2184 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.0787    1.9238    0.0706 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.7855    0.7636   -0.2453 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1409   -1.8536    0.1477 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.1094    0.6715   -0.3113 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.5305    0.5996    0.1635 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1851    2.7545    0.6593 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.7247   -1.3605   -0.4564 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.5797    2.8872    0.0506 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.8374    0.8238   -0.5090 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.7290    1.4184    0.8593 H   0  0  0  0  0  0  0  0  0  0  0  0
+    4.2045    0.6969   -0.6924 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.7105   -0.3659    0.6426 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.2555   -3.5916   -0.7337 H   0  0  0  0  0  0  0  0  0  0  0  0
+  1  5  1  0  0  0  0
+  1 12  1  0  0  0  0
+  2 11  1  0  0  0  0
+  2 21  1  0  0  0  0
+  3 11  2  0  0  0  0
+  4 12  2  0  0  0  0
+  5  6  1  0  0  0  0
+  5  7  2  0  0  0  0
+  6  8  2  0  0  0  0
+  6 11  1  0  0  0  0
+  7  9  1  0  0  0  0
+  7 14  1  0  0  0  0
+  8 10  1  0  0  0  0
+  8 15  1  0  0  0  0
+  9 10  2  0  0  0  0
+  9 16  1  0  0  0  0
+ 10 17  1  0  0  0  0
+ 12 13  1  0  0  0  0
+ 13 18  1  0  0  0  0
+ 13 19  1  0  0  0  0
+ 13 20  1  0  0  0  0
+M  END
+>  <PUBCHEM_COMPOUND_CID>
+2244
+
+>  <PUBCHEM_CONFORMER_RMSD>
+0.6
+
+>  <PUBCHEM_CONFORMER_DIVERSEORDER>
+1
+11
+10
+3
+15
+17
+13
+5
+16
+7
+14
+9
+8
+4
+18
+6
+12
+2
+
+>  <PUBCHEM_MMFF94_PARTIAL_CHARGES>
+18
+1 -0.23
+10 -0.15
+11 0.63
+12 0.66
+13 0.06
+14 0.15
+15 0.15
+16 0.15
+17 0.15
+2 -0.65
+21 0.5
+3 -0.57
+4 -0.57
+5 0.08
+6 0.09
+7 -0.15
+8 -0.15
+9 -0.15
+
+>  <PUBCHEM_EFFECTIVE_ROTOR_COUNT>
+3
+
+>  <PUBCHEM_PHARMACOPHORE_FEATURES>
+5
+1 2 acceptor
+1 3 acceptor
+1 4 acceptor
+3 2 3 11 anion
+6 5 6 7 8 9 10 rings
+
+>  <PUBCHEM_HEAVY_ATOM_COUNT>
+13
+
+>  <PUBCHEM_ATOM_DEF_STEREO_COUNT>
+0
+
+>  <PUBCHEM_ATOM_UDEF_STEREO_COUNT>
+0
+
+>  <PUBCHEM_BOND_DEF_STEREO_COUNT>
+0
+
+>  <PUBCHEM_BOND_UDEF_STEREO_COUNT>
+0
+
+>  <PUBCHEM_ISOTOPIC_ATOM_COUNT>
+0
+
+>  <PUBCHEM_COMPONENT_COUNT>
+1
+
+>  <PUBCHEM_CACTVS_TAUTO_COUNT>
+1
+
+>  <PUBCHEM_CONFORMER_ID>
+000008C400000001
+
+>  <PUBCHEM_COORDINATE_TYPE>
+2
+5
+10
+
+$$$$
+
+
+ 21 21  0  0  0  0  0  0  0  0999 V2000
+    1.2333    0.5540    0.7792 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.6952   -2.7148   -0.7502 O   0  0  0  0  0  0  0  0  0  0  0  0
+    0.7958   -2.1843    0.8685 O   0  0  0  0  0  0  0  0  0  0  0  0
+    1.7813    0.8105   -1.4821 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.0857    0.6088    0.4403 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.7927   -0.5515    0.1244 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.7288    1.8464    0.4133 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.1426   -0.4741   -0.2184 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.0787    1.9238    0.0706 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.7855    0.7636   -0.2453 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1409   -1.8536    0.1477 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.1094    0.6715   -0.3113 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.5305    0.5996    0.1635 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1851    2.7545    0.6593 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.7247   -1.3605   -0.4564 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.5797    2.8872    0.0506 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.8374    0.8238   -0.5090 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.7290    1.4184    0.8593 H   0  0  0  0  0  0  0  0  0  0  0  0
+    4.2045    0.6969   -0.6924 H   0  0  0  0  0  0  0  0  0  0  0  0
+    3.7105   -0.3659    0.6426 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.2555   -3.5916   -0.7337 H   0  0  0  0  0  0  0  0  0  0  0  0
+  1  5  1  0  0  0  0
+  1 12  1  0  0  0  0
+  2 11  1  0  0  0  0
+  2 21  1  0  0  0  0
+  3 11  2  0  0  0  0
+  4 12  2  0  0  0  0
+  5  6  1  0  0  0  0
+  5  7  2  0  0  0  0
+  6  8  2  0  0  0  0
+  6 11  1  0  0  0  0
+  7  9  1  0  0  0  0
+  7 14  1  0  0  0  0
+  8 10  1  0  0  0  0
+  8 15  1  0  0  0  0
+  9 10  2  0  0  0  0
+  9 16  1  0  0  0  0
+ 10 17  1  0  0  0  0
+ 12 13  1  0  0  0  0
+ 13 18  1  0  0  0  0
+ 13 19  1  0  0  0  0
+ 13 20  1  0  0  0  0
+M  END
+$$$$
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Wed Sep 04 05:10:04 2013 -0400
@@ -0,0 +1,9 @@
+<?xml version="1.0"?>
+<tool_dependency> <!-- External package and environment variable required by the EDeN tools. -->
+    <package name="openbabel" version="2.3.2">
+        <repository changeset_revision="7601c962048a" name="package_openbabel_2_3" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+    </package>
+    <set_environment version="1.0">
+        <environment_variable action="set_to" name="EDEN_SCRIPT_PATH">$REPOSITORY_INSTALL_DIR</environment_variable>
+    </set_environment> <!-- EDEN_SCRIPT_PATH points at the install directory itself: mol2gspan.py and eden.py are committed at the repository root, and there is no scripts/ subdirectory. -->
+</tool_dependency>