view EDeN_feature.xml @ 1:64a1fb09b10d draft

Uploaded
author bgruening
date Wed, 04 Sep 2013 07:59:08 -0400
parents 99091a5d5c84
children a3edc97e056c
line wrap: on
line source

<tool id="bg_eden_feature" name="EDeN Converter" version="0.1">
    <!-- No short description is shown next to the tool name. -->
    <description />
    <!-- External dependencies resolved by Galaxy before the command line is executed. -->
    <requirements>
        <!-- Open Babel provides the obabel executable called for molecule input. -->
        <requirement version="2.3.2" type="package">openbabel</requirement>
        <!-- Environment variable used in the command section to locate mol2gspan.py. -->
        <requirement type="set_environment">EDEN_SCRIPT_PATH</requirement>
    </requirements>
    <command>

        ## Command template (Cheetah). Two stages:
        ##   1. optional pre-processing: molecule input (SDF / SMILES) is converted with
        ##      Open Babel and mol2gspan.py into a temporary gSpan file
        ##   2. EDeN --action FEATURE is run on the (converted) input
        ## The temporary file is created, used and removed ONLY for molecule input. For every
        ## other input type EDeN reads the dataset directly and nothing is deleted afterwards,
        ## so the user's input dataset is never removed.

        #import tempfile
        #set $molecule_input = str( $file_type_opts.file_type_opts_selector ) in [ 'sdf', 'smi' ]

        #if $molecule_input:
            #set $temp_file = tempfile.NamedTemporaryFile( delete=False )
            #silent $temp_file.close()
            #set $eden_input = $temp_file.name
            ## after the conversion the data is in the gSpan graph format
            #set $file_type = 'GRAPH'

            obabel -i $file_type_opts.file_type_opts_selector -o sdf $infile | python \$EDEN_SCRIPT_PATH/mol2gspan.py --infile - --outfile $eden_input
            ## the separator is only emitted when a pre-processing command precedes it,
            ## a command line starting with a bare semicolon is a shell syntax error
            ;
        #else:
            #set $eden_input = $infile
            #set $file_type = $file_type_opts.file_type_opts_selector
        #end if

        EDeN --action FEATURE

        --input_data_file_name $eden_input
        --model_file_name $outfile

        ## if we have a molecule datatype the file_type is set to GRAPH, after conversion to the gSpan graph format
        --file_type $file_type

        --binary_file_type ## create a binary sparse vector as output

        --kernel_type $kernel_type  ##NSPDK | WDK | PBK | USPK | DDK | NSDDK | ANSDDK | SK [NSPDK]
        --graph_type $graph_type    ##DIRECTED | UNDIRECTED [UNDIRECTED]

        $no_normalization
        $min_kernel

        --hash_bit_size $hash_bit_size
        --radius $radius
        --distance $distance
        --vertex_degree_threshold $vertex_degree_threshold

        #if $file_type_opts.file_type_opts_selector == 'SEQUENCE':

            ## these parameters live inside the file_type_opts conditional and
            ## have to be addressed through it
            --sequence_degree $file_type_opts.sequence_degree
            $file_type_opts.sequence_token
            $file_type_opts.sequence_multi_line
            $file_type_opts.sequence_pairwise_interaction

        #end if

        --tree_lambda $tree_lambda
        --radius_two $radius_two


        ### Adds rescaled features from nearest neighbors ###

        #if $smooth_opts.smooth_opts_selector == 'smooth':
            --smooth
            --smooth_param $smooth_opts.smoother_param

            ## both index files are generated in the configfiles section
            --row_index_file_name $row_index_file_name
            --col_index_file_name $col_index_file_name
            --num_hash_functions $smooth_opts.num_hash_functions
            --num_repeat_hash_functions $smooth_opts.num_repeat_hash_functions
            --max_size_bin $smooth_opts.max_size_bin
            --eccess_neighbour_size_factor $smooth_opts.eccess_neighbour_size_factor
            --num_nearest_neighbours $smooth_opts.num_nearest_neighbours
            $smooth_opts.shared_neighborhood
            $smooth_opts.no_neighborhood_cache
            $smooth_opts.no_minhash_cache
        #end if

        ## clean up the intermediate gSpan file, never the input dataset itself
        #if $molecule_input:
            ;
            rm $eden_input
        #end if


    </command>
    <inputs>
        <!-- Dataset to convert; how it is interpreted is chosen with the selector below. -->
        <param format="smi,gspan,inchi,sdf,mol,mol2,txt" name="infile" type="data" label="Input file" 
            help="File can contain molecule data types (SMILES, InChI, SDF) or Graph datatypes (gSpan, sparse vector, sequence)."/>

        <!-- Upper-case values are passed to EDeN as file type unchanged; the lower-case
             values (sdf, smi) trigger the molecule to gSpan conversion in the command section. -->
        <conditional name="file_type_opts">
            <param name="file_type_opts_selector" type="select" label="Type of Input file">
                <option value="GRAPH">Graph</option>
                <option value="SPARSE_VECTOR">sparse vector</option>
                <option value="SEQUENCE">Sequence</option>
                <option value="sdf">SDF</option>
                <option value="smi">SMILES</option>
            </param>
            <when value="GRAPH" />
            <when value="SPARSE_VECTOR" />
            <when value="SEQUENCE">
                <param name="sequence_degree" type="integer" value="1" label="Sequence degree" help="">
                    <validator type="in_range" min="1" />
                </param>
                <param name="sequence_token" type="boolean" label="Sequence token" truevalue="--sequence_token" falsevalue="" checked="false" />
                <param name="sequence_multi_line" type="boolean" label="Sequence is in multi-line notation" truevalue="--sequence_multi_line" falsevalue="" checked="false" />
                <param name="sequence_pairwise_interaction" type="boolean" label="Sequence pairwise interactions" truevalue="--sequence_pairwise_interaction" falsevalue="" checked="false" />
            </when>
            <when value="sdf" />
            <when value="smi" />
        </conditional>

        <!-- One option per kernel named in the kernel list of the command section:
             NSPDK, WDK, PBK, USPK, DDK, NSDDK, ANSDDK, SK. The first option (NSPDK) is the default. -->
        <param name="kernel_type" type="select" display="radio" label="Type of the Kernel">
            <option value="NSPDK">NSPDK</option>
            <option value="WDK">WDK</option>
            <option value="PBK">PBK</option>
            <option value="USPK">USPK</option>
            <option value="DDK">DDK</option>
            <option value="NSDDK">NSDDK</option>
            <option value="ANSDDK">ANSDDK</option>
            <option value="SK">SK</option>
        </param>

        <param name="graph_type" type="select" display="radio" label="Type of Graph">
            <option value="DIRECTED">directed</option>
            <option value="UNDIRECTED">undirected</option>
        </param>


        <!-- All smoothing parameters are only shown, and only passed to EDeN, when smoothing is enabled. -->
        <conditional name="smooth_opts">
            <param name="smooth_opts_selector" type="select" label="Adds rescaled features from nearest neighbors (--smooth)">
              <option value="non_smooth" selected="True">Disable smooth</option>
              <option value="smooth">Enable smooth</option>
            </param>
            <when value="non_smooth" />
            <when value="smooth">

                <param name="smoother_param" type="float" value="0.95" label="Scaling features from neighbors"
                    help="Features from neighbors are scaled by the kernel value to the power value assigned to this switch.">
                    <validator type="in_range" min="0.0" />
                </param>

                <!-- Boolean switches: the truevalue is inserted into the command line verbatim,
                     so each one has to be spelled exactly like the EDeN flag (two leading dashes). -->
                <param name="no_minhash_cache" type="boolean" label="Deactivate minhash cache" truevalue="--no_minhash_cache" falsevalue="" checked="false" />
                <param name="no_neighborhood_cache" type="boolean" label="Deactivate neighborhood cache" truevalue="--no_neighborhood_cache" falsevalue="" checked="false" />
                <param name="shared_neighborhood" type="boolean" label="Activate shared neighborhood" truevalue="--shared_neighborhood" falsevalue="" checked="false" />

                <param name="num_hash_functions" type="integer" value="400" label="Number of hash functions" help="">
                    <validator type="in_range" min="1" />
                </param>
                <param name="num_repeat_hash_functions" type="integer" value="10" label="Number of repeats for each hash function" help="">
                    <validator type="in_range" min="1" />
                </param>
                <param name="max_size_bin" type="float" value="0.3" label="Maximum size of one bin" 
                    help="Expressed as the maximum fraction of the dataset size. When a bin contains references to more instances than this quantity, the bin is erased. The rationale is that this feature is common to too many instances and it is therefore not informative. Moreover the runtimes become non sub-linear if a significant fraction of the dataset size has to be checked.">
                    <validator type="in_range" min="0.0" />
                </param>
                <!-- The parameter name keeps the spelling of the EDeN option it is passed to. -->
                <param name="eccess_neighbour_size_factor" type="float" value="5.0" label="Excess neighborhood size factor" 
                    help="Expressed as a multiplicative factor w.r.t. the neighborhood size required. It means that the approximate neighborhood query stops at the X most frequent instances, where X = eccess_neighbour_size_factor * neighborhood size.">
                    <validator type="in_range" min="0.0" />
                </param>
                <param name="num_nearest_neighbours" type="integer" value="10" label="Number of nearest neighbors" help="">
                    <validator type="in_range" min="1" />
                </param>

                <!-- Index lists: comma separated values and ranges written as start-end. They are
                     expanded into one index per line in the configfiles section. The sanitizer
                     restricts the text to digits, comma, dash and blank. -->
                <param name="row_index" type="text" size="30" label="Row indices of your input file that should be converted" 
                    help="Specify a subset of your dataset by providing the row indices that should be taken into account.">
                    <sanitizer>
                        <valid initial="string.digits">
                            <add value="," />
                            <add value="-" />
                            <add value=" " />
                        </valid>
                    </sanitizer>
                    <validator type="empty_field" message="You need to specify row indices"/>
                </param>
                <param name="col_index" type="text" size="30" label="Column indices of your input file that should be converted" 
                    help="Specify a subset of your dataset by providing the column indices that should be taken into account.">
                    <sanitizer>
                        <valid initial="string.digits">
                            <add value="," />
                            <add value="-" />
                            <add value=" " />
                        </valid>
                    </sanitizer>
                    <validator type="empty_field" message="You need to specify column indices"/>
                </param>

            </when>
        </conditional>

        <!-- General kernel options, passed to EDeN for every input type. -->
        <param name="no_normalization" type="boolean" label="Skip normalization" truevalue="--no_normalization" falsevalue="" checked="false" />
        <param name="min_kernel" type="boolean" label="Use minimal kernel" truevalue="--min_kernel" falsevalue="" checked="false" />

        <param name="hash_bit_size" type="integer" value="15" label="Bit size of the used hashing function" help="">
            <validator type="in_range" min="1" />
        </param>
        <param name="radius" type="integer" value="2" label="Radius that defines a neighborhood" help="">
            <validator type="in_range" min="1" />
        </param>
        <param name="distance" type="integer" value="5" label="Distance that defines a neighborhood" help="">
            <validator type="in_range" min="1" />
        </param>
        <param name="vertex_degree_threshold" type="integer" value="7" label="Vertex degree threshold" help="">
            <validator type="in_range" min="1" />
        </param>
        <param name="radius_two" type="integer" value="2" label="Radius Two" help="">
            <validator type="in_range" min="1" />
        </param>
        <param name="tree_lambda" type="float" value="1.2" label="Tree lambda" help="">
            <validator type="in_range" min="0.0" />
        </param>

    </inputs>
    <configfiles>
<!-- The strange indentation is necessary, otherwise we get line breaks or white space in our file -->
<!-- Both files expand a user supplied index list (comma separated values and ranges written as
     start-end) into one index per line. Ranges include their end value, empty list elements are skipped. -->
<configfile name="row_index_file_name">#if $smooth_opts.smooth_opts_selector == 'smooth':
                #for $element in str( $smooth_opts.row_index ).split(','):
                    #set $element = $element.strip().split('-')
                    #if len($element) == 2:
                        ## range() excludes its stop value, +1 keeps the last index the user typed
                        #for $index in range( int($element[0]), int($element[1]) + 1 ):
                            ## the following writes the value at the beginning of each line
                            ## #echo $index# inserts a line break automatically, but do not write it
                            ## to the beginning of the line
                            #echo '%s\n' % $index
                        #end for
                    #elif $element[0] != '':
                        ## a single index; empty elements (e.g. from a trailing comma) are ignored
                        #echo '%s\n' % $element[0]
                    #end if
                #end for
            #end if
</configfile>
<configfile name="col_index_file_name">#if $smooth_opts.smooth_opts_selector == 'smooth':
                #for $element in str( $smooth_opts.col_index ).split(','):
                    #set $element = $element.strip().split('-')
                    #if len($element) == 2:
                        ## range() excludes its stop value, +1 keeps the last index the user typed
                        #for $index in range( int($element[0]), int($element[1]) + 1 ):
                            ## the following writes the value at the beginning of each line
                            ## #echo $index# inserts a line break automatically, but do not write it
                            ## to the beginning of the line
                            #echo '%s\n' % $index
                        #end for
                    #elif $element[0] != '':
                        ## a single index; empty elements (e.g. from a trailing comma) are ignored
                        #echo '%s\n' % $element[0]
                    #end if
                #end for
            #end if
</configfile>
    </configfiles>
    <outputs>
        <!-- Single result dataset; the command section hands it to EDeN as model_file_name. -->
        <data name="outfile" format="eden_sparse_vector" label="Sparse vector from ${on_string}" />
    </outputs>
    <tests>
        <!-- Converts an SDF file. The input type has to be selected explicitly, otherwise the
             default (GRAPH) is used and the molecule to gSpan conversion is skipped. -->
        <test>
            <!-- NOTE(review): the file name is spelled "molceuls" here but "molecules" in the
                 expected output below; confirm the name of the file in test-data. -->
            <param name="infile" value="3_molceuls.sdf" />
            <param name="file_type_opts_selector" value="sdf" />
            <!-- NOTE(review): the tool output is declared as eden_sparse_vector while the expected
                 file has a gspan extension; confirm that this is the intended reference file. -->
            <output name="outfile" file="3_molecules.gspan" />
        </test>
    </tests>
    <help>

.. class:: infomark

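**What it does**

This tool runs EDeN with the FEATURE action to convert the input file into a sparse feature vector. Molecule input (SDF, SMILES) is first converted into the gSpan graph format; graph, sequence and sparse vector input is passed to EDeN directly.

The linear model is induced using the accelerated stochastic gradient descent technique by Léon Bottou and Yann LeCun.
When the target information is 0, a self-training algorithm is used to impute a positive or negative class to the unsupervised instances.
If the target information is imbalanced, a minority class resampling technique is used to rebalance the training set.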

This tool is part of the EDeN (Explicit Decomposition with Neighborhoods) suite, developed by Fabrizio Costa.

    </help>
</tool>