# HG changeset patch # User bgruening # Date 1383059269 14400 # Node ID 59b3b6ce10bb9b528e06513db7e5b407ab4b0224 # Parent 7d49e315cb95724c332abb0b35149076a1499354 Uploaded diff -r 7d49e315cb95 -r 59b3b6ce10bb EDeN_feature.xml --- a/EDeN_feature.xml Thu Sep 05 12:52:45 2013 -0400 +++ b/EDeN_feature.xml Tue Oct 29 11:07:49 2013 -0400 @@ -1,93 +1,42 @@ - - - - openbabel - EDEN_SCRIPT_PATH - + + to produce sparce vectors eden_macros.xml + - - ## pre-processing step if we have a molecule type we need to convert it to the gSpan format at first - - #import tempfile, os - #set $temp_gspan = tempfile.NamedTemporaryFile( delete=False ) - #silent $temp_gspan.close() - #set $temp_gspan = $temp_gspan.name - - #if $file_type_opts.file_type_opts_selector == 'sdf': - obabel -i sdf -o sdf $infile ---errorlevel 1 | python \$EDEN_SCRIPT_PATH/mol2gspan.py --infile - --outfile $temp_gspan - #set $file_type = 'GRAPH' - #elif $file_type_opts.file_type_opts_selector == 'smi': - obabel -i smi -o sdf $infile ---errorlevel 1 | python \$EDEN_SCRIPT_PATH/mol2gspan.py --infile - --outfile $temp_gspan - #set $file_type = 'GRAPH' - #else: - #set $temp_gspan = $infile - #set $file_type = $file_type_opts.file_type_opts_selector - #end if - ; + tmp_dir=`mktemp -d -u`; EDeN --action FEATURE - --input_data_file_name $temp_gspan + --output_directory_path \$tmp_dir + + --input_data_file_name $infile --model_file_name $outfile - ## if we have an molecule datatype the file_type is set to GRAPH, after convertion to the gSpan Graph format - --file_type $file_type + --file_type $file_type_opts.file_type_opts_selector --binary_file_type ## create a binary sparse vector as output - - $no_normalization - $min_kernel - - --hash_bit_size $hash_bit_size - --radius $radius - --distance $distance - --vertex_degree_threshold $vertex_degree_threshold - - $no_normalization - $min_kernel + @normalization_kernel_hash_radius_dist_vertex@ --kernel_type $kernel_type_opts.kernel_type_opts_selector --graph_type $graph_type 
#if $file_type_opts.file_type_opts_selector == 'SEQUENCE': - --sequence_degree $sequence_degree $sequence_token $sequence_multi_line $sequence_pairwise_interaction - - #end if - - #if $kernel_type_opts.kernel_type_opts_selector in ['DDK','NSDDK','SK']: - --tree_lambda $kernel_type_opts.tree_lambda - --radius_two $kernel_type_opts.radius_two #end if - ### Adds rescaled features from nearest neighbors ### - - #if $smooth_opts.smooth_opts_selector == 'smooth': - --smooth - --smooth_param $smooth_opts.smoother_param + @kernel_type_options@ - --row_index_file_name $row_index_file_name - --col_index_file_name $col_index_file_name - --num_hash_functions $smooth_opts.num_hash_functions - --num_repeat_hash_functions $smooth_opts.num_repeat_hash_functions - --max_size_bin $smooth_opts.max_size_bin - --eccess_neighbour_size_factor $smooth_opts.eccess_neighbour_size_factor - --num_nearest_neighbours $smooth_opts.num_nearest_neighbours - $smooth_opts.shared_neighborhood - $smooth_opts.no_neighborhood_cache - $smooth_opts.no_minhash_cache - #end if + @input_smooth_conditional@ ; - rm $temp_gspan - + cp \$tmp_dir/feature $outfile; + rm \$tmp_dir -rf; @@ -97,16 +46,14 @@ description="An error occured with your Job." /> - + - - @@ -122,70 +69,13 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - + - - - - - - - - - - - - - - - - + @@ -224,7 +114,7 @@ - + @@ -242,7 +132,7 @@ When the target information is 0, a self-training algorithm is used to impute a positive or negative class to the unsupervised instances. If the target information is imbalanced a minority class resampling technique is used to rebalance the training set. -This tool is part of the EDeN (Explicit Decomposition with Neighborhoods) suite, developed by Fabrizio Costa. 
+@references@ diff -r 7d49e315cb95 -r 59b3b6ce10bb EDeN_nearest_neighbor.xml --- a/EDeN_nearest_neighbor.xml Thu Sep 05 12:52:45 2013 -0400 +++ b/EDeN_nearest_neighbor.xml Tue Oct 29 11:07:49 2013 -0400 @@ -1,11 +1,11 @@ - - + eden_macros.xml + tmp_dir=`mktemp -d -u`; EDeN --action NEAREST_NEIGHBOR --input_data_file_name $infile @@ -14,22 +14,38 @@ --file_type "SPARSE_VECTOR" --binary_file_type - ### Adds rescaled features from nearest neighbors ### + @kernel_type_options@ + --graph_type $graph_type + @input_smooth_conditional@ + + @normalization_kernel_hash_radius_dist_vertex@ - #if $smooth_opts.smooth_opts_selector == 'smooth': - --smooth - --smooth_param $smooth_opts.smoother_param + --output_directory_path \$tmp_dir + + + ## + ## shuffling files to create the correct outputs for Galaxy + ## - --row_index_file_name $row_index_file_name - --col_index_file_name $col_index_file_name - --num_hash_functions $smooth_opts.num_hash_functions - --num_repeat_hash_functions $smooth_opts.num_repeat_hash_functions - --max_size_bin $smooth_opts.max_size_bin - --eccess_neighbour_size_factor $smooth_opts.eccess_neighbour_size_factor - --num_nearest_neighbours $smooth_opts.num_nearest_neighbours - $smooth_opts.shared_neighborhood - $smooth_opts.no_neighborhood_cache - $smooth_opts.no_minhash_cache + ; + cp \$tmp_dir/knn $ofile_nnlist 2> /dev/null + + ## Nearest neighbor feature representation + #if 'nnf' in str($additional_outputs).split(','): + ; + cp \$tmp_dir/knn_feature $ofile_nnf 2> /dev/null + #end if + + ## Nearest neighbor target value list + #if 'nnt' in str($additional_outputs).split(','): + ; + cp \$tmp_dir/knn_target_value $ofile_nnt 2> /dev/null + #end if + + ## Nearest neighbor kernel value list + #if 'nnk' in str($additional_outputs).split(','): + ; + cp \$tmp_dir/knn_kernel_value $ofile_nnk 2> /dev/null #end if @@ -40,9 +56,31 @@ + + + + + + + + + + + + + - + + + 'nnf' in additional_outputs + + + 'nnt' in additional_outputs + + + 'nnk' in additional_outputs + @@ 
-58,7 +96,7 @@ Nearest neighbors are efficiently identified with a locality sensitive hashing technique. -This tool is part of the EDeN (Explicit Decomposition with Neighborhoods) suite, developed by Fabrizio Costa. +@references@ diff -r 7d49e315cb95 -r 59b3b6ce10bb EDeN_test.xml --- a/EDeN_test.xml Thu Sep 05 12:52:45 2013 -0400 +++ b/EDeN_test.xml Tue Oct 29 11:07:49 2013 -0400 @@ -1,15 +1,31 @@ - - + + eden_macros.xml + + + tmp_dir=`mktemp -d -u`; EDeN --action TEST - --input_data_file_name $sparse_vector_infile + --input_data_file_name $sparse_vector_infile + --file_type "SPARSE_VECTOR" + --binary_file_type + --model_file_name $model_infile - --file_type "SPARSE_VECTOR" - --binary_file_type + @kernel_type_options@ + + --graph_type $graph_type + + @normalization_kernel_hash_radius_dist_vertex@ + + --output_directory_path \$tmp_dir + --minimal_output + + ; + cp \$tmp_dir/prediction $output; + rm \$tmp_dir -rf @@ -17,6 +33,12 @@ + + + + + + @@ -35,7 +57,7 @@ When the target information is 0, a self-training algorithm is used to impute a positive or negative class to the unsupervised instances. If the target information is imbalanced a minority class resampling technique is used to rebalance the training set. -This tool is part of the EDeN (Explicit Decomposition with Neighborhoods) suite, developed by Fabrizio Costa. 
+@references@ diff -r 7d49e315cb95 -r 59b3b6ce10bb EDeN_train.xml --- a/EDeN_train.xml Thu Sep 05 12:52:45 2013 -0400 +++ b/EDeN_train.xml Tue Oct 29 11:07:49 2013 -0400 @@ -1,14 +1,20 @@ - - + + eden_macros.xml + + + tmp_dir=`mktemp -d -u`; + EDeN --action TRAIN --input_data_file_name $infile --file_type "SPARSE_VECTOR" --binary_file_type + ##--output_directory_path \$tmp_dir + ## TODO: we need a tool that creates such a file, maybe from the metadata of an SDF file ## target_file_name is a file with 1 or -1 one in each row, indicating the class --target_file_name $target_infile @@ -40,15 +46,15 @@ + help="Only the top and low quantile will be used as positives and negative instances. A threshold of 1 means that all unsupervised instances are used in the next phase."> - + - + @@ -58,7 +64,7 @@ - + @@ -80,13 +86,10 @@ When the target information is 0, a self-training algorithm is used to impute a positive or negative class to the unsupervised instances. If the target information is imbalanced a minority class resampling technique is used to rebalance the training set. -This tool is part of the EDeN (Explicit Decomposition with Neighborhoods) suite, developed by Fabrizio Costa. - +@references@ -REFERENCES -========== - -The code for Stochastic Gradient Descent SVM is adapted from http://leon.bottou.org/projects/sgd. Léon Bottou and Yann LeCun, ''Large Scale Online Learning'', Advances in Neural Information Processing Systems 16, Edited by Sebastian Thrun, Lawrence Saul and Bernhard Schölkopf, MIT Press, Cambridge, MA, 2004. +The code for Stochastic Gradient Descent SVM is adapted from http://leon.bottou.org/projects/sgd. Léon Bottou and Yann LeCun, ''Large Scale Online Learning'', +Advances in Neural Information Processing Systems 16, Edited by Sebastian Thrun, Lawrence Saul and Bernhard Schölkopf, MIT Press, Cambridge, MA, 2004. 
diff -r 7d49e315cb95 -r 59b3b6ce10bb datatypes_conf.xml --- a/datatypes_conf.xml Thu Sep 05 12:52:45 2013 -0400 +++ b/datatypes_conf.xml Tue Oct 29 11:07:49 2013 -0400 @@ -5,6 +5,6 @@ - + diff -r 7d49e315cb95 -r 59b3b6ce10bb eden.py --- a/eden.py Thu Sep 05 12:52:45 2013 -0400 +++ b/eden.py Tue Oct 29 11:07:49 2013 -0400 @@ -24,7 +24,7 @@ class SparseVector( Binary ): """Class describing an SparseVector file""" - file_ext = "sparse" + file_ext = "sparsevector" def set_peek( self, dataset, is_multi_byte=False ): if not dataset.dataset.purged: diff -r 7d49e315cb95 -r 59b3b6ce10bb eden_macros.xml --- a/eden_macros.xml Thu Sep 05 12:52:45 2013 -0400 +++ b/eden_macros.xml Tue Oct 29 11:07:49 2013 -0400 @@ -60,4 +60,122 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + eden + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $no_normalization + $min_kernel + + --hash_bit_size $hash_bit_size + --radius $radius + --distance $distance + --vertex_degree_threshold $vertex_degree_threshold + + + + #if $smooth_opts.smooth_opts_selector == 'smooth': + --smooth + --smooth_param $smooth_opts.smoother_param + + --row_index_file_name $row_index_file_name + --col_index_file_name $col_index_file_name + --num_hash_functions $smooth_opts.num_hash_functions + --num_repeat_hash_functions $smooth_opts.num_repeat_hash_functions + --max_size_bin $smooth_opts.max_size_bin + --eccess_neighbour_size_factor $smooth_opts.eccess_neighbour_size_factor + --num_nearest_neighbours $smooth_opts.num_nearest_neighbours + $smooth_opts.shared_neighborhood + $smooth_opts.no_neighborhood_cache + $smooth_opts.no_minhash_cache + #end if + + + + #if $kernel_type_opts.kernel_type_opts_selector in ['DDK','NSDDK','SK']: + --tree_lambda $kernel_type_opts.tree_lambda + --radius_two $kernel_type_opts.radius_two + #end if + + + +This tool is part of the EDeN (Explicit Decomposition with Neighborhoods) suite, developed by Fabrizio Costa. 
+ + + + + diff -r 7d49e315cb95 -r 59b3b6ce10bb mol2gspan.py --- a/mol2gspan.py Thu Sep 05 12:52:45 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -#!/usr/bin/env python - -import os, sys -import argparse - -def main(args ): - - begin = True - iid = 0 - graph_counter = 1 - - for line in args.infile: - if line.rstrip(): - if line.strip().endswith('END'): - begin = False - elif line.strip() == '$$$$': - graph_counter += 1 - iid = 0 - else: - # found header line, like: 21 21 0 0 0 0 0 0 0 0999 V2000 - if len(line.split()) >= 5 and line.split()[-1] == 'V2000': - args.outfile.write('t # id %s\n' % graph_counter) - begin=True - continue - # connection or coordinate/atom table - if len(line.split()) >= 4 and begin: - # coordinate/atom table - if line.split()[3].isalpha(): - args.outfile.write( 'v %s %s \n' % (iid, line.split()[3]) ) - iid += 1 - else: - #connection table - id, node, edge, trash = line.split(None, 3) - args.outfile.write( 'e %s %s %s\n' % ( int(id) - 1 , int(node) -1, edge ) ) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--infile', nargs='?', type=argparse.FileType('r'), - default=sys.stdin, help="Specify one or more input files") - parser.add_argument('--outfile', type=argparse.FileType('w'), - default=sys.stdout, help="Specify one output file") - args = parser.parse_args() - main( args ) diff -r 7d49e315cb95 -r 59b3b6ce10bb mol2gspan.xml --- a/mol2gspan.xml Thu Sep 05 12:52:45 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ - - converter - - openbabel - EDEN_SCRIPT_PATH - - - obabel -i smi -o sdf $infile | \$EDEN_SCRIPT_PATH/mol2gspan.py --infile - --outfile $outfile - - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -That converter will convert arbitratry molecule files to the gSpan format. 
- - - diff -r 7d49e315cb95 -r 59b3b6ce10bb tool_dependencies.xml --- a/tool_dependencies.xml Thu Sep 05 12:52:45 2013 -0400 +++ b/tool_dependencies.xml Tue Oct 29 11:07:49 2013 -0400 @@ -1,9 +1,6 @@ - - + + - - $REPOSITORY_INSTALL_DIR -