Mercurial > repos > bgruening > eden_toolbox
view EDeN_feature.xml @ 5:be1433b0833b draft
Uploaded
author | bgruening |
---|---|
date | Thu, 05 Sep 2013 12:50:36 -0400 |
parents | 56e5c6ee3c1b |
children | 7d49e315cb95 |
line wrap: on
line source
<tool id="bg_eden_feature" name="EDeN Converter" version="0.1">
    <!--
        Galaxy wrapper around the FEATURE action of EDeN.

        Molecule inputs (SDF, SMILES) are piped through Open Babel and
        mol2gspan.py into a temporary gSpan file before EDeN runs; every other
        input type is handed to EDeN unchanged. The result is written to the
        "outfile" dataset.
    -->
    <description></description>
    <requirements>
        <requirement type="package" version="2.3.2">openbabel</requirement>
        <requirement type="set_environment">EDEN_SCRIPT_PATH</requirement>
    </requirements>
    <macros>
        <import>eden_macros.xml</import>
    </macros>
    <command>
## Pre-processing step: if we have a molecule type we need to convert it to
## the gSpan format first. A temporary file is only created (and later
## removed) for molecule inputs, so the input dataset itself is never deleted.
#import tempfile
#set $selected_type = str( $file_type_opts.file_type_opts_selector )
#set $is_molecule = $selected_type in ['sdf', 'smi']
#if $is_molecule:
    #set $temp_handle = tempfile.NamedTemporaryFile( delete=False )
    #silent $temp_handle.close()
    #set $gspan_input = $temp_handle.name
    ## After the conversion to the gSpan graph format the file type is GRAPH.
    #set $file_type = 'GRAPH'
    obabel -i $selected_type -o sdf $infile ---errorlevel 1 |
    python \$EDEN_SCRIPT_PATH/mol2gspan.py --infile - --outfile $gspan_input ;
#else:
    #set $gspan_input = $infile
    #set $file_type = $selected_type
#end if
EDeN
    --action FEATURE
    --input_data_file_name $gspan_input
    --model_file_name $outfile
    --file_type $file_type
    ## create a binary sparse vector as output
    --binary_file_type
    $no_normalization
    $min_kernel
    --hash_bit_size $hash_bit_size
    --radius $radius
    --distance $distance
    --vertex_degree_threshold $vertex_degree_threshold
    --kernel_type $kernel_type_opts.kernel_type_opts_selector
    --graph_type $graph_type
    ## Sequence options live inside the file_type_opts conditional and must be
    ## addressed through it.
    #if $selected_type == 'SEQUENCE':
        --sequence_degree $file_type_opts.sequence_degree
        $file_type_opts.sequence_token
        $file_type_opts.sequence_multi_line
        $file_type_opts.sequence_pairwise_interaction
    #end if
    ## Only the DDK, NSDDK and SK branches define tree_lambda and radius_two.
    #if str( $kernel_type_opts.kernel_type_opts_selector ) in ['DDK', 'NSDDK', 'SK']:
        --tree_lambda $kernel_type_opts.tree_lambda
        --radius_two $kernel_type_opts.radius_two
    #end if
    ## Adds rescaled features from nearest neighbors.
    ## The smooth_opts parameters come from the input_smooth_conditional macro.
    #if str( $smooth_opts.smooth_opts_selector ) == 'smooth':
        --smooth
        --smooth_param $smooth_opts.smoother_param
        --row_index_file_name $row_index_file_name
        --col_index_file_name $col_index_file_name
        --num_hash_functions $smooth_opts.num_hash_functions
        --num_repeat_hash_functions $smooth_opts.num_repeat_hash_functions
        --max_size_bin $smooth_opts.max_size_bin
        --eccess_neighbour_size_factor $smooth_opts.eccess_neighbour_size_factor
        --num_nearest_neighbours $smooth_opts.num_nearest_neighbours
        $smooth_opts.shared_neighborhood
        $smooth_opts.no_neighborhood_cache
        $smooth_opts.no_minhash_cache
    #end if
## Clean up the temporary gSpan file; never remove the input dataset.
#if $is_molecule:
    ; rm $gspan_input
#end if
    </command>
    <stdio>
        <regex match="Error" source="both" level="fatal" description="An error occured with your Job." />
    </stdio>
    <inputs>
        <param format="smi,gspan,inchi,sdf,mol,mol2,txt" name="infile" type="data" label="Input file" help="File can contain molecule data types (SMILES, InChI, SDF) or Graph datatypes (gSpan, sparse vector, sequence)."/>
        <conditional name="file_type_opts">
            <param name="file_type_opts_selector" type="select" label="Type of Input file">
                <option value="GRAPH">Graph</option>
                <option value="SPARSE_VECTOR">sparse vector</option>
                <option value="SEQUENCE">Sequence</option>
                <option value="sdf">SDF</option>
                <option value="smi">SMILES</option>
            </param>
            <when value="GRAPH" />
            <when value="SPARSE_VECTOR" />
            <when value="SEQUENCE">
                <param name="sequence_degree" type="integer" value="1" label="Sequence degree" help="">
                    <validator type="in_range" min="1" />
                </param>
                <param name="sequence_token" type="boolean" label="Sequence token" truevalue="--sequence_token" falsevalue="" checked="false" />
                <param name="sequence_multi_line" type="boolean" label="Sequence is in multi-line notation" truevalue="--sequence_multi_line" falsevalue="" checked="false" />
                <param name="sequence_pairwise_interaction" type="boolean" label="Sequence pairwise iterations" truevalue="--sequence_pairwise_interaction" falsevalue="" checked="false" />
            </when>
            <when value="sdf" />
            <when value="smi" />
        </conditional>
        <conditional name="kernel_type_opts">
            <param name="kernel_type_opts_selector" type="select" label="Type of the Kernel">
                <option value="NSPDK">NSPDK</option>
                <option value="WDK">WDK</option>
                <option value="PBK">PBK</option>
                <option value="USPK">USPK</option>
                <option value="DDK">DDK</option>
                <!-- NOTE(review): value is NSDDK but the label reads ANSDDK; confirm which one is intended. -->
                <option value="NSDDK">ANSDDK</option>
                <option value="SK">SK [NSPDK]</option>
            </param>
            <when value="NSPDK" />
            <when value="WDK" />
            <when value="PBK" />
            <when value="USPK" />
            <when value="SK">
                <param name="radius_two" type="integer" value="2" label="Radius Two" help="">
                    <validator type="in_range" min="1" />
                </param>
                <param name="tree_lambda" type="float" value="1.2" label="Tree lambda" help="">
                    <validator type="in_range" min="0.0" />
                </param>
            </when>
            <when value="DDK">
                <param name="radius_two" type="integer" value="2" label="Radius Two" help="">
                    <validator type="in_range" min="1" />
                </param>
                <param name="tree_lambda" type="float" value="1.2" label="Tree lambda" help="">
                    <validator type="in_range" min="0.0" />
                </param>
            </when>
            <when value="NSDDK">
                <param name="radius_two" type="integer" value="2" label="Radius Two" help="">
                    <validator type="in_range" min="1" />
                </param>
                <param name="tree_lambda" type="float" value="1.2" label="Tree lambda" help="">
                    <validator type="in_range" min="0.0" />
                </param>
            </when>
        </conditional>
        <param name="graph_type" type="select" display="radio" label="Type of Graph">
            <option value="DIRECTED">directed</option>
            <option value="UNDIRECTED">undirected</option>
        </param>
        <expand macro="input_smooth_conditional" />
        <param name="no_normalization" type="boolean" label="Skip normalization" truevalue="--no_normalization" falsevalue="" checked="false" />
        <param name="min_kernel" type="boolean" label="Use min kernel" truevalue="--min_kernel" falsevalue="" checked="false" />
        <param name="hash_bit_size" type="integer" value="15" label="Bit size of the used hashing function" help="">
            <validator type="in_range" min="1" />
        </param>
        <param name="radius" type="integer" value="2" label="Radius that defines a neighborhood" help="">
            <validator type="in_range" min="1" />
        </param>
        <param name="distance" type="integer" value="5" label="Distance that defines a neighborhood" help="">
            <validator type="in_range" min="1" />
        </param>
        <param name="vertex_degree_threshold" type="integer" value="7" label="Vertex degree threshold" help="">
            <validator type="in_range" min="1" />
        </param>
    </inputs>
    <configfiles>
        <!--
            Each config file expands a comma separated index list such as
            "1,4-7" into one index per line.
            The strange indentation is necessary, otherwise we get line breaks
            or white space in our file.
        -->
        <configfile name="row_index_file_name">#if $smooth_opts.smooth_opts_selector == 'smooth':
#for $element in str( $smooth_opts.row_index ).split(','):
#set $element = $element.strip().split('-')
#if len($element) == 2:
## NOTE(review): range() excludes the upper bound, so "4-7" yields 4, 5, 6;
## confirm whether the end of a range is meant to be inclusive.
#for $index in range( int($element[0]), int($element[1]) ):
## the following writes the value at the beginning of each line
## #echo $index# inserts a line break automatically, but do not write it
## to the beginning of the line
#echo '%s\n' % $index
#end for
#else:
#echo '%s\n' % $element[0]
#end if
#end for
#end if
</configfile>
        <configfile name="col_index_file_name">#if $smooth_opts.smooth_opts_selector == 'smooth':
#for $element in str( $smooth_opts.col_index ).split(','):
#set $element = $element.strip().split('-')
#if len($element) == 2:
## NOTE(review): range() excludes the upper bound, so "4-7" yields 4, 5, 6;
## confirm whether the end of a range is meant to be inclusive.
#for $index in range( int($element[0]), int($element[1]) ):
## the following writes the value at the beginning of each line
## #echo $index# inserts a line break automatically, but do not write it
## to the beginning of the line
#echo '%s\n' % $index
#end for
#else:
#echo '%s\n' % $element[0]
#end if
#end for
#end if
</configfile>
    </configfiles>
    <outputs>
        <data format="eden_sparse_vector" name="outfile" label="Sparse vector from ${on_string}"/>
    </outputs>
    <tests>
        <!--
            NOTE(review): the input name "3_molceuls.sdf" looks like a typo for
            "3_molecules.sdf", and the test does not select the SDF file type,
            so the default (GRAPH) is used. Confirm both against the files in
            test-data before changing anything here.
        -->
        <test>
            <param name="infile" value="3_molceuls.sdf" />
            <output name="outfile" file="3_molecules.gspan" />
        </test>
    </tests>
    <!--
        NOTE(review): the help text below describes model training, not feature
        conversion; it may have been copied from another EDeN tool. Confirm
        with the tool author.
    -->
    <help>

.. class:: infomark

**What it does**

The linear model is induced using the accelerated stochastic gradient descent technique by Léon Bottou and Yann LeCun.
When the target information is 0, a self-training algorithm is used to impute a positive or negative class to the unsupervised instances.
If the target information is imbalanced a minority class resampling technique is used to rebalance the training set.

This tool is part of the EDeN (Explicit Decomposition with Neighborhoods) suite, developed by Fabrizio Costa.

    </help>
</tool>