comparison kmersvm/nullseq.xml @ 5:f99b5099ea55 draft

Uploaded
author test-svm
date Sun, 05 Aug 2012 16:50:57 -0400
parents
children
comparison
equal deleted inserted replaced
4:f2130156fd5d 5:f99b5099ea55
<!--
  Galaxy tool wrapper around scripts/nullseq_generate.py.
  Takes a BED file of positive regions and emits a BED file of null (negative)
  regions; see the help section at the bottom for the user-facing description.
-->
<tool id="kmersvm_nullseq" name="Generate Null Sequence">
  <description>using random sampling from genomic DNA</description>
  <!--
    Cheetah command template.
    * "-e $excluded" is emitted only when an Excluded Regions dataset was chosen
      (an unset optional data param stringifies to "None").
    * $dbkey is not declared as a param in this file.
      NOTE(review): presumably injected by Galaxy from the input dataset; confirm.
    * ${indices_path.fields.path} is the "path" column (index 2) of the row
      selected from nullseq_indices.loc.
  -->
  <command interpreter="python">scripts/nullseq_generate.py -q
    #if str($excluded) !="None":
      -e $excluded
    #end if
    -x $fold -r $rseed -g $gc_err -t $rpt_err $input $dbkey ${indices_path.fields.path}
  </command>
  <inputs>
    <param name="fold" type="integer" value="1" label="# of Fold-Increase" />
    <param name="gc_err" type="float" value="0.02" label="Allowable GC Error" />
    <param name="rpt_err" type="float" value="0.02" label="Allowable Repeat Error" />
    <param name="rseed" type="integer" value="1" label="Random Number Seed" />
    <!--
      The validators must be children of the param they validate. Previously the
      param was self-closed and the validators sat directly under <inputs>, so
      they were never applied to the dataset.
      * unspecified_build: rejects a dataset whose build is unspecified.
      * dataset_metadata_in_file: requires the dataset's dbkey metadata to
        appear in column 0 of nullseq_indices.loc.
    -->
    <param format="interval" name="input" type="data" label="BED File of Positive Regions">
      <validator type="unspecified_build" />
      <validator type="dataset_metadata_in_file" filename="nullseq_indices.loc" metadata_name="dbkey" metadata_column="0" message="Sequences are currently unavailable for the specified build." />
    </param>
    <param name="excluded" optional="true" format="interval" type="data" value="None" label="Excluded Regions (optional)" />
    <!-- Select list populated from nullseq_indices.loc: column 0 is both dbkey and value, column 1 the display name, column 2 the path. -->
    <param name="indices_path" type="select" label="Available Datasets">
      <options from_file="nullseq_indices.loc">
        <column name="dbkey" index="0"/>
        <column name="value" index="0"/>
        <column name="name" index="1"/>
        <column name="path" index="2"/>
        <!--filter type="data_meta" ref="input" key="dbkey" column="0" /-->
      </options>
    </param>
  </inputs>
  <outputs>
    <!-- Collected from nullseq_output.bed in the job working directory (presumably written by the script; verify against nullseq_generate.py). -->
    <data format="interval" name="nullseq_output" from_work_dir="nullseq_output.bed" />
  </outputs>
  <help>

**What it does**

Takes an input BED file and generates a set of sequences for use as negative data (null sequences) in Train SVM similar in length, GC content and repeat fraction. Uses random sampling for efficiency.

**Parameters**

Fold-Increase: Size of desired null sequence data set expressed as multiple of the size of the input data set.

GC Error, Repeat Error: Acceptable difference between a positive sequence and its corresponding null sequence in terms of GC content, repeat content.

Random Number Seed: Seed for random number generator.

Excluded Regions: Submitted regions will be excluded from null sequence generation.

----

**Example**

Given a BED file containing::

    chr1 10212203 10212303
    chr1 103584748 103584848
    chr1 105299130 105299230
    chr1 106367772 106367872

Tool will output BED file matched in length, GC content and repeat content::

    chr1 3089935 3090035
    chr1 5031335 5031435
    chr1 5103742 5103842
    chr1 5650372 5650472

  </help>
</tool>