Mercurial > repos > bgruening > eden_toolbox
changeset 2:a3edc97e056c draft
Uploaded
author | bgruening |
---|---|
date | Thu, 05 Sep 2013 11:40:29 -0400 |
parents | 64a1fb09b10d |
children | e1fc8ecabba7 |
files | EDeN_feature.xml EDeN_nearest_neighbor.xml EDeN_test.xml EDeN_train.xml eden_macros.xml |
diffstat | 5 files changed, 154 insertions(+), 142 deletions(-) [+] |
line wrap: on
line diff
--- a/EDeN_feature.xml Wed Sep 04 07:59:08 2013 -0400 +++ b/EDeN_feature.xml Thu Sep 05 11:40:29 2013 -0400 @@ -14,10 +14,10 @@ #set $temp_gspan = $temp_gspan.name #if $file_type_opts.file_type_opts_selector == 'sdf': - obabel -i sdf -o sdf $infile | python \$EDEN_SCRIPT_PATH/mol2gspan.py --infile - --outfile $temp_gspan + obabel -i sdf -o sdf $infile ---errorlevel 1 | python \$EDEN_SCRIPT_PATH/mol2gspan.py --infile - --outfile $temp_gspan #set $file_type = 'GRAPH' #elif $file_type_opts.file_type_opts_selector == 'smi': - obabel -i smi -o sdf $infile | python \$EDEN_SCRIPT_PATH/mol2gspan.py --infile - --outfile $temp_gspan + obabel -i smi -o sdf $infile ---errorlevel 1 | python \$EDEN_SCRIPT_PATH/mol2gspan.py --infile - --outfile $temp_gspan #set $file_type = 'GRAPH' #else: #set $temp_gspan = $infile @@ -48,7 +48,7 @@ $no_normalization $min_kernel - --kernel_type $kernel_type ##NSPDK | WDK | PBK | USPK | DDK | NSDDK | ANSDDK | SK [NSPDK] + --kernel_type $kernel_type_opts.kernel_type_opts_selector ##NSPDK | WDK | PBK | USPK | DDK | NSDDK | ANSDDK | SK [NSPDK] --graph_type $graph_type ##DIRECTED | UNDIRECTED [UNDIRECTED] #if $file_type_opts.file_type_opts_selector == 'SEQUENCE': @@ -60,9 +60,10 @@ #end if - --tree_lambda $tree_lambda - --radius_two $radius_two - + #if $kernel_type_opts.kernel_type_opts_selector in ['DDK','NSDDK','SK']: + --tree_lambda $kernel_type_opts.tree_lambda + --radius_two $kernel_type_opts.radius_two + #end if ### Adds rescaled features from nearest neighbors ### @@ -87,6 +88,12 @@ </command> + <stdio> + <regex match="Error" + source="both" + level="fatal" + description="An error occurred with your Job." 
/> + </stdio> <inputs> <param format="smi,gspan,inchi,sdf,mol,mol2,txt" name="infile" type="data" label="Input file" help="File can contain molecule data types (SMILES, InChI, SDF) or Graph datatypes (gSpan, sparse vector, sequence)."/> @@ -113,85 +120,40 @@ <when value="smi" /> </conditional> - <param name="kernel_type" type="select" display="radio" label="Type of the Kernel"> - <option value="NSPDK">NSPDK</option> - <option value="WDK">WDK</option> - <option value="PBK">PBK</option> - <option value="USPK">USPK</option> - <option value="DDK">DDK</option> - <option value="NSDDK">ANSDDK</option> - <option value="SK">SK [NSPDK]</option> - </param> + <conditional name="kernel_type_opts"> + <param name="kernel_type_opts_selector" type="select" label="Type of the Kernel"> + <option value="NSPDK">NSPDK</option> + <option value="WDK">WDK</option> + <option value="PBK">PBK</option> + <option value="USPK">USPK</option> + <option value="DDK">DDK</option> + <option value="NSDDK">ANSDDK</option> + <option value="SK">SK [NSPDK]</option> + </param> + <when value="NSPDK" /> + <when value="WDK" /> + <when value="PBK" /> + <when value="USPK" /> + <when value="DDK,NSDDK,SK"> + <param name="radius_two" type="integer" value="2" label="Radius Two" help=""> + <validator type="in_range" min="1" /> + </param> + <param name="tree_lambda" type="float" value="1.2" label="Tree lambda" help=""> + <validator type="in_range" min="0.0" /> + </param> + </when> + </conditional> + <param name="graph_type" type="select" display="radio" label="Type of Graph"> <option value="DIRECTED">directed</option> <option value="UNDIRECTED">undirected</option> </param> - - <conditional name="smooth_opts"> - <param name="smooth_opts_selector" type="select" label="Adds rescaled features from nearest neighbors (--smooth)"> - <option value="non_smooth" selected="True">Disable smooth</option> - <option value="smooth">Enable smooth</option> - </param> - <when value="non_smooth" /> - <when value="smooth"> - - <param 
name="smoother_param" type="float" value="0.95" label="Scaling features from neighbors" - help="Features from neighbors are scaled by the kernel value to the power value assigned to this switch."> - <validator type="in_range" min="0.0" /> - </param> - - <param name="no_minhash_cache" type="boolean" label="Deactivate minhash cache" truevalue="--no_minhash_cache" falsevalue="" checked="false" /> - <param name="no_neighborhood_cache" type="boolean" label="Deactivate neighborhood cache" truevalue="-no_neighborhood_cache" falsevalue="" checked="false" /> - <param name="shared_neighborhood" type="boolean" label="Activate shared neighborhood" truevalue="--shared_neighborhood" falsevalue="" checked="false" /> - - <param name="num_hash_functions" type="integer" value="400" label="Number of hash functions" help=""> - <validator type="in_range" min="1" /> - </param> - <param name="num_repeat_hash_functions" type="integer" value="10" label="Number of repeats for each hash functions" help=""> - <validator type="in_range" min="1" /> - </param> - <param name="max_size_bin" type="float" value="0.3" label="Maximum size of one bin" - help="Expressed as the maximum fraction of the datset size. When a bin contains references to more instances than this quantity, the bin is erased. The ratio is that this featrue is common to too many instances and it is therefore not informative. Morover the runtimes become non sub-linear if a significant fraction of the dataset size has to be checked."> - <validator type="in_range" min="0.0" /> - </param> - <param name="eccess_neighbour_size_factor" type="float" value="5.0" label="Access neighborhood size factor" - help="Expressed as a multiplicative factor w.r.t. the neighborhood size required. 
It means that the approximate neighborhood query stops at the X most frequent instances, where X = eccess_neighbor_size_factor * neighborhood size."> - <validator type="in_range" min="0.0" /> - </param> - <param name="num_nearest_neighbours" type="integer" value="10" label="Number of nearest neighbors" help=""> - <validator type="in_range" min="1" /> - </param> - - <param name="row_index" type="text" size="30" label="Row indieces of your input file that should be converted" - help="Specify a subset of your dataset by providing the row indieces that should be taken into account."> - <sanitizer> - <valid initial="string.digits"> - <add value="," /> - <add value="-" /> - <add value=" " /> - </valid> - </sanitizer> - <validator type="empty_field" message="You need to specify row indieces"/> - </param> - <param name="col_index" type="text" size="30" label="Column indieces of your input file that should be converted" - help="Specify a subset of your dataset by providing the column indieces that should be taken into account."> - <sanitizer> - <valid initial="string.digits"> - <add value="," /> - <add value="-" /> - <add value=" " /> - </valid> - </sanitizer> - <validator type="empty_field" message="You need to specify column indieces"/> - </param> - - </when> - </conditional> + <expand macro="input_smooth_conditional" /> <param name="no_normalization" type="boolean" label="Skip normalization" truevalue="--no_normalization" falsevalue="" checked="false" /> - <param name="min_kernel" type="boolean" label="Use minimal kernel" truevalue="--min_kernel" falsevalue="" checked="false" /> + <param name="min_kernel" type="boolean" label="Use min kernel" truevalue="--min_kernel" falsevalue="" checked="false" /> <param name="hash_bit_size" type="integer" value="15" label="Bit size of the used hashing function" help=""> <validator type="in_range" min="1" /> @@ -205,12 +167,7 @@ <param name="vertex_degree_threshold" type="integer" value="7" label="Vertex degree threshold" help=""> 
<validator type="in_range" min="1" /> </param> - <param name="radius_two" type="integer" value="2" label="Radius Two" help=""> - <validator type="in_range" min="1" /> - </param> - <param name="tree_lambda" type="float" value="1.2" label="Tree lambda" help=""> - <validator type="in_range" min="0.0" /> - </param> + </inputs> <configfiles>
--- a/EDeN_nearest_neighbor.xml Wed Sep 04 07:59:08 2013 -0400 +++ b/EDeN_nearest_neighbor.xml Thu Sep 05 11:40:29 2013 -0400 @@ -4,16 +4,30 @@ </requirements> <command> EDeN --action NEAREST_NEIGHBOR - + --input_data_file_name $infile + --target_file_name $target_infile + --file_type "SPARSE_VECTOR" --binary_file_type - --target_file_name $target_infile - ##--model_file_name [model] ???????????????????? + ### Adds rescaled features from nearest neighbors ### + + #if $smooth_opts.smooth_opts_selector == 'smooth': + --smooth + --smooth_param $smooth_opts.smoother_param - --kernel_type $kernel_type - --graph_type $graph_type + --row_index_file_name $row_index_file_name + --col_index_file_name $col_index_file_name + --num_hash_functions $smooth_opts.num_hash_functions + --num_repeat_hash_functions $smooth_opts.num_repeat_hash_functions + --max_size_bin $smooth_opts.max_size_bin + --eccess_neighbour_size_factor $smooth_opts.eccess_neighbour_size_factor + --num_nearest_neighbours $smooth_opts.num_nearest_neighbours + $smooth_opts.shared_neighborhood + $smooth_opts.no_neighborhood_cache + $smooth_opts.no_minhash_cache + #end if </command> <inputs> @@ -21,20 +35,7 @@ <param format="eden_sparse_vector" name="infile" type="data" label="Input Graph" help=""/> <param format="txt" name="target_infile" type="data" label="Target file" help=""/> - <param name="kernel_type" type="select" display="radio" label="Type of the Kernel"> - <option value="NSPDK">NSPDK</option> - <option value="WDK">WDK</option> - <option value="PBK">PBK</option> - <option value="USPK">USPK</option> - <option value="DDK">DDK</option> - <option value="NSDDK">ANSDDK</option> - <option value="SK">SK [NSPDK]</option> - </param> - - <param name="graph_type" type="select" display="radio" label="Type of Graph"> - <option value="DIRECTED">directed</option> - <option value="UNDIRECTED">undirected</option> - </param> + <expand macro="input_smooth_conditional" /> </inputs> <outputs>
--- a/EDeN_test.xml Wed Sep 04 07:59:08 2013 -0400 +++ b/EDeN_test.xml Thu Sep 05 11:40:29 2013 -0400 @@ -4,38 +4,22 @@ </requirements> <command> EDeN --action TEST - - --input_data_file_name $infile - --model_file_name $model_outfile + + --input_data_file_name $sparse_vector_infile + --model_file_name $model_infile --file_type "SPARSE_VECTOR" --binary_file_type - --kernel_type $kernel_type - --graph_type $graph_type - </command> <inputs> - <param format="eden_sparse_vector" name="infile" type="data" label="Input Graph" help=""/> - - <param name="kernel_type" type="select" display="radio" label="Type of the Kernel"> - <option value="NSPDK">NSPDK</option> - <option value="WDK">WDK</option> - <option value="PBK">PBK</option> - <option value="USPK">USPK</option> - <option value="DDK">DDK</option> - <option value="NSDDK">ANSDDK</option> - <option value="SK">SK [NSPDK]</option> - </param> - - <param name="graph_type" type="select" display="radio" label="Type of Graph"> - <option value="DIRECTED">directed</option> - <option value="UNDIRECTED">undirected</option> - </param> + <param format="eden_sparse_vector" name="sparse_vector_infile" type="data" label="Input File" help=""/> + <param format="txt" name="model_infile" type="data" label="Input Model" + help="created with the EDeN Train program"/> </inputs> <outputs> - <data format="txt" name="model_outfile" label="Generated from ${on_string}"/> + <data format="tabular" name="output" label="Generated from ${on_string}"/> </outputs> <tests> <test>
--- a/EDeN_train.xml Wed Sep 04 07:59:08 2013 -0400 +++ b/EDeN_train.xml Thu Sep 05 11:40:29 2013 -0400 @@ -16,7 +16,7 @@ --lambda $lambda ##??? notation? --epochs $epoch - + --sparsification_num_iterations $sparsification_num_iterations --topological_regularization_num_neighbors $topological_regularization_num_neighbors --topological_regularization_decay_rate $topological_regularization_decay_rate @@ -31,28 +31,35 @@ </command> <inputs> <param format="eden_sparse_vector" name="infile" type="data" label="Input Graph" help=""/> - <param format="txt" name="target_infile" type="data" label="Target file" help=""/> - - <param name="kernel_type" type="select" display="radio" label="Type of the Kernel"> - <option value="NSPDK">NSPDK</option> - <option value="WDK">WDK</option> - <option value="PBK">PBK</option> - <option value="USPK">USPK</option> - <option value="DDK">DDK</option> - <option value="NSDDK">ANSDDK</option> - <option value="SK">SK [NSPDK]</option> - </param> - - <param name="graph_type" type="select" display="radio" label="Type of Graph"> - <option value="DIRECTED">directed</option> - <option value="UNDIRECTED">undirected</option> - </param> + <param format="txt" name="target_infile" type="data" label="Target file" help="indicates with -1 and 1 the class"/> <param name="epoch" type="integer" value="10" label="Epoch, Stochastic gradient descend algorithm." help=""> <validator type="in_range" min="1" /> </param> <param name="lambda" type="text" value="1e-4" label="lambda, Stochastic gradient descend algorithm." help="" /> + <!-- Semi-supervised-settings --> + <param name="threshold" type="float" value="1.0" label="Top and low quantile" + help="Only the top and low quantile will be used as positives and negative instances. A threshold of 1 means that all unsupervised instances are used in the next phase. 
"> + <validator type="in_range" min="0.0" /> + </param> + <param name="num_iterations" type="integer" value="3" label="Number of iterations" /> + <param name="only_negative" type="boolean" label="Induce only negative class instances." truevalue="--only_negative" falsevalue="" checked="false" /> + <param name="only_positive" type="boolean" label="Induce only positive class instances." truevalue="--only_positive" falsevalue="" checked="false" /> + + + <param name="topological_regularization_decay_rate" type="float" value="0.01" label="Topological regularization decay rate"> + <validator type="in_range" min="0.0" /> + </param> + <param name="topological_regularization_num_neighbors" type="integer" value="0" label="Topological regularization number of neighbors"> + <validator type="in_range" min="0" /> + </param> + <param name="sparsification_num_iterations" type="integer" value="0" label="Sparsification number of iterations"> + <validator type="in_range" min="0" /> + </param> + + <param name="random_seed" type="integer" value="1" label="Random Seed" help="" /> + </inputs> <outputs> <data format="txt" name="model_outfile" label="Train Model from ${on_string}"/>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/eden_macros.xml Thu Sep 05 11:40:29 2013 -0400 @@ -0,0 +1,63 @@ +<macros> + <macro name="input_smooth_conditional"> + <conditional name="smooth_opts"> + <param name="smooth_opts_selector" type="select" label="Adds rescaled features from nearest neighbors (--smooth)"> + <option value="non_smooth" selected="True">Disable smooth</option> + <option value="smooth">Enable smooth</option> + </param> + <when value="non_smooth" /> + <when value="smooth"> + + <param name="smoother_param" type="float" value="0.95" label="Scaling features from neighbors" + help="Features from neighbors are scaled by the kernel value to the power value assigned to this switch."> + <validator type="in_range" min="0.0" /> + </param> + + <param name="no_minhash_cache" type="boolean" label="Deactivate minhash cache" truevalue="--no_minhash_cache" falsevalue="" checked="false" /> + <param name="no_neighborhood_cache" type="boolean" label="Deactivate neighborhood cache" truevalue="--no_neighborhood_cache" falsevalue="" checked="false" /> + <param name="shared_neighborhood" type="boolean" label="Activate shared neighborhood" truevalue="--shared_neighborhood" falsevalue="" checked="false" /> + + <param name="num_hash_functions" type="integer" value="400" label="Number of hash functions" help=""> + <validator type="in_range" min="1" /> + </param> + <param name="num_repeat_hash_functions" type="integer" value="10" label="Number of repeats for each hash function" help=""> + <validator type="in_range" min="1" /> + </param> + <param name="max_size_bin" type="float" value="0.3" label="Maximum size of one bin" + help="Expressed as the maximum fraction of the dataset size. When a bin contains references to more instances than this quantity, the bin is erased. The rationale is that this feature is common to too many instances and it is therefore not informative. 
Moreover the runtimes become non sub-linear if a significant fraction of the dataset size has to be checked."> + <validator type="in_range" min="0.0" /> + </param> + <param name="eccess_neighbour_size_factor" type="float" value="5.0" label="Access neighborhood size factor" + help="Expressed as a multiplicative factor w.r.t. the neighborhood size required. It means that the approximate neighborhood query stops at the X most frequent instances, where X = eccess_neighbor_size_factor * neighborhood size."> + <validator type="in_range" min="0.0" /> + </param> + <param name="num_nearest_neighbours" type="integer" value="10" label="Number of nearest neighbors" help=""> + <validator type="in_range" min="1" /> + </param> + + <param name="row_index" type="text" size="30" label="Row indices of your input file that should be converted" + help="Specify a subset of your dataset by providing the row indices that should be taken into account."> + <sanitizer> + <valid initial="string.digits"> + <add value="," /> + <add value="-" /> + <add value=" " /> + </valid> + </sanitizer> + <validator type="empty_field" message="You need to specify row indices"/> + </param> + <param name="col_index" type="text" size="30" label="Column indices of your input file that should be converted" + help="Specify a subset of your dataset by providing the column indices that should be taken into account."> + <sanitizer> + <valid initial="string.digits"> + <add value="," /> + <add value="-" /> + <add value=" " /> + </valid> + </sanitizer> + <validator type="empty_field" message="You need to specify column indices"/> + </param> + </when> + </conditional> + </macro> +</macros>