view EDeN_feature.xml @ 4:56e5c6ee3c1b draft

Uploaded
author bgruening
date Thu, 05 Sep 2013 12:49:03 -0400
parents e1fc8ecabba7
children be1433b0833b
line wrap: on
line source

<tool id="bg_eden_feature" name="EDeN Converter" version="0.1">
    <description></description>
    <requirements>
        <requirement type="package" version="2.3.2">openbabel</requirement>
        <requirement type="set_environment">EDEN_SCRIPT_PATH</requirement>
    </requirements>
    <macros>
        <import>eden_macros.xml</import>
    </macros>
    <command>

        ## pre-processing step if we have a molecule type we need to convert it to the gSpan format at first

        #import tempfile, os
        #set $temp_gspan = tempfile.NamedTemporaryFile( delete=False )
        #silent $temp_gspan.close()
        #set $temp_gspan = $temp_gspan.name

        #if $file_type_opts.file_type_opts_selector == 'sdf':
            obabel -i sdf -o sdf $infile ---errorlevel 1 | python \$EDEN_SCRIPT_PATH/mol2gspan.py --infile - --outfile $temp_gspan
            #set $file_type = 'GRAPH'
        #elif $file_type_opts.file_type_opts_selector == 'smi':
            obabel -i smi -o sdf $infile ---errorlevel 1 | python \$EDEN_SCRIPT_PATH/mol2gspan.py --infile - --outfile $temp_gspan
            #set $file_type = 'GRAPH'
        #else:
            #set $temp_gspan = $infile
            #set $file_type = $file_type_opts.file_type_opts_selector
        #end if
        ;

        EDeN --action FEATURE

        --input_data_file_name $temp_gspan
        --model_file_name $outfile

        ## if we have an molecule datatype the file_type is set to GRAPH, after convertion to the gSpan Graph format
        --file_type $file_type

        --binary_file_type ## create a binary sparse vector as output
        --kernel_type $kernel_type_opts.kernel_type_opts_selector
        --graph_type $graph_type

        $no_normalization
        $min_kernel

        --hash_bit_size $hash_bit_size
        --radius $radius
        --distance $distance
        --vertex_degree_threshold $vertex_degree_threshold

        $no_normalization
        $min_kernel

        --kernel_type $kernel_type_opts.kernel_type  ##NSPDK | WDK | PBK | USPK | DDK | NSDDK | ANSDDK | SK [NSPDK]
        --graph_type $graph_type    ##DIRECTED | UNDIRECTED [UNDIRECTED]

        #if $file_type_opts.file_type_opts_selector == 'SEQUENCE':

            --sequence_degree $sequence_degree
            $sequence_token
            $sequence_multi_line
            $sequence_pairwise_interaction

        #end if

        #if $kernel_type_opts.kernel_type in ['DDK','NSDDK','SK']:
            --tree_lambda $kernel_type_opts.tree_lambda
            --radius_two $kernel_type_opts.radius_two
        #end if

        ### Adds rescaled features from nearest neighbors ###

        #if $smooth_opts.smooth_opts_selector == 'smooth':
            --smooth
            --smooth_param $smooth_opts.smoother_param

            --row_index_file_name $row_index_file_name
            --col_index_file_name $col_index_file_name
            --num_hash_functions $smooth_opts.num_hash_functions
            --num_repeat_hash_functions $smooth_opts.num_repeat_hash_functions
            --max_size_bin $smooth_opts.max_size_bin
            --eccess_neighbour_size_factor $smooth_opts.eccess_neighbour_size_factor
            --num_nearest_neighbours $smooth_opts.num_nearest_neighbours
            $smooth_opts.shared_neighborhood
            $smooth_opts.no_neighborhood_cache
            $smooth_opts.no_minhash_cache
        #end if

        ;
        rm $temp_gspan


    </command>
    <stdio>
        <regex match="Error" 
           source="both" 
           level="fatal" 
           description="An error occured with your Job." />
    </stdio>
    <inputs>
        <param format="smi,gspan,inchi,sdf,mol,mol2,txt" name="infile" type="data" label="Input file" 
            help="File can contain molecule data types (SMILES, InChI, SDF) or Graph datatypes (gSpan, sparse vector, sequence)."/>

        <conditional name="file_type_opts">
            <param name="file_type_opts_selector" type="select" label="Type of Input file">
                <option value="GRAPH">Graph</option>
                <option value="SPARSE_VECTOR">sparse vector</option>
                <option value="SEQUENCE">Sequence</option>
                <option value="sdf">SDF</option>
                <option value="smi">SMILES</option>
            </param>
            <when value="GRAPH" />
            <when value="SPARSE_VECTOR" />
            <when value="SEQUENCE">
                <param name="sequence_degree" type="integer" value="1" label="Sequence degree" help="">
                    <validator type="in_range" min="1" />
                </param>
                <param name="sequence_token" type="boolean" label="Sequence token" truevalue="--sequence_token" falsevalue="" checked="false" />
                <param name="sequence_multi_line" type="boolean" label="Sequence is in multi-line notation" truevalue="--sequence_multi_line" falsevalue="" checked="false" />
                <param name="sequence_pairwise_interaction" type="boolean" label="Sequence pairwise iterations" truevalue="--sequence_pairwise_interaction" falsevalue="" checked="false" />
            </when>
            <when value="sdf" />
            <when value="smi" />
        </conditional>

        <conditional name="kernel_type_opts">
            <param name="kernel_type_opts_selector" type="select" label="Type of the Kernel">
                <option value="NSPDK">NSPDK</option>
                <option value="WDK">WDK</option>
                <option value="PBK">PBK</option>
                <option value="USPK">USPK</option>
                <option value="DDK">DDK</option>
                <option value="NSDDK">ANSDDK</option>
                <option value="SK">SK [NSPDK]</option>
            </param>
            <when value="NSPDK" />
            <when value="WDK" />
            <when value="PBK" />
            <when value="USPK" />
            <when value="SK">
                <param name="radius_two" type="integer" value="2" label="Radius Two" help="">
                    <validator type="in_range" min="1" />
                </param>
                <param name="tree_lambda" type="float" value="1.2" label="Tree lambda" help="">
                    <validator type="in_range" min="0.0" />
                </param>
            </when>
            <when value="DDK">
                <param name="radius_two" type="integer" value="2" label="Radius Two" help="">
                    <validator type="in_range" min="1" />
                </param>
                <param name="tree_lambda" type="float" value="1.2" label="Tree lambda" help="">
                    <validator type="in_range" min="0.0" />
                </param>
            </when>
            <when value="NSDDK">
                <param name="radius_two" type="integer" value="2" label="Radius Two" help="">
                    <validator type="in_range" min="1" />
                </param>
                <param name="tree_lambda" type="float" value="1.2" label="Tree lambda" help="">
                    <validator type="in_range" min="0.0" />
                </param>
            </when>
        </conditional>


        <param name="graph_type" type="select" display="radio" label="Type of Graph">
            <option value="DIRECTED">directed</option>
            <option value="UNDIRECTED">undirected</option>
        </param>

        <expand macro="input_smooth_conditional" />

        <param name="no_normalization" type="boolean" label="Skip normalization" truevalue="--no_normalization" falsevalue="" checked="false" />
        <param name="min_kernel" type="boolean" label="Use min kernel" truevalue="--min_kernel" falsevalue="" checked="false" />

        <param name="hash_bit_size" type="integer" value="15" label="Bit size of the used hashing function" help="">
            <validator type="in_range" min="1" />
        </param>
        <param name="radius" type="integer" value="2" label="Radius that defines a neighborhood" help="">
            <validator type="in_range" min="1" />
        </param>
        <param name="distance" type="integer" value="5" label="Distance that defines a neighborhood" help="">
            <validator type="in_range" min="1" />
        </param>
        <param name="vertex_degree_threshold" type="integer" value="7" label="Vertex degree threshold" help="">
            <validator type="in_range" min="1" />
        </param>


    </inputs>
    <configfiles>
<!-- The strange indentation is necessary, otherwise we get line breaks or white space in our file -->
<configfile name="row_index_file_name">#if $smooth_opts.smooth_opts_selector == 'smooth':
                #for $element in str( $smooth_opts.row_index ).split(','):
                    #set $element = $element.strip().split('-')
                    #if len($element) == 2:
                        #for $index in range( int($element[0]), int($element[1]) ):
                            ## the following writes the value at the beginning of each line
                            ## #echo $index# inserts a line break automatically, but do not write it
                            ## to the beginning of the line
                            #echo '%s\n' % $index
                        #end for
                    #else:
                        #echo '%s\n' % $element[0]
                    #end if
                #end for
            #end if
</configfile>
<configfile name="col_index_file_name">#if $smooth_opts.smooth_opts_selector == 'smooth':
                #for $element in str( $smooth_opts.col_index ).split(','):
                    #set $element = $element.strip().split('-')
                    #if len($element) == 2:
                        #for $index in range( int($element[0]), int($element[1]) ):
                            ## the following writes the value at the beginning of each line
                            ## #echo $index# inserts a line break automatically, but do not write it
                            ## to the beginning of the line
                            #echo '%s\n' % $index
                        #end for
                    #else:
                        #echo '%s\n' % $element[0]
                    #end if
                #end for
            #end if
</configfile>
    </configfiles>
    <outputs>
        <data format="eden_sparse_vector" name="outfile" label="Sparse vector from ${on_string}"/>
    </outputs>
    <tests>
        <test>
            <param name="infile" value="3_molceuls.sdf" />
            <output name="outfile" file="3_molecules.gspan" />
        </test>
    </tests>
    <help>

.. class:: infomark

**What it does** 

The linear model is induced using the accelerated stochastic gradient descent technique by Léon Bottou and Yann LeCun.
When the target information is 0, a self-training algorithm is used to impute a positive or negative class to the unsupervised instances.
If the target information is imbalanced a minority class resampling technique is used to rebalance the training set.

This tool is part of the EDeN (Explicit Decomposition with Neighborhoods) suite, developed by Fabrizio Costa.

    </help>
</tool>