<?xml version="1.0" encoding="UTF-8"?>
<tool id="kmersvm_nullseq" name="Generate Null Sequence">
  <description>using random sampling from genomic DNA</description>

  <!--
    Cheetah command template for scripts/nullseq_generate.py.
    The "-e" option is emitted only when an excluded-regions dataset was
    supplied (an unset optional data parameter stringifies to "None").
    The last positional argument is the "path" column of the row selected
    in the indices_path parameter below.
  -->
  <command interpreter="python">scripts/nullseq_generate.py -q
    #if str($excluded) !="None":
      -e $excluded
    #end if
    -x $fold -r $rseed -g $gc_err -t $rpt_err $input $dbkey ${indices_path.fields.path}
  </command>

  <inputs>
    <param name="fold" type="integer" value="1" label="# of Fold-Increase" />
    <param name="gc_err" type="float" value="0.02" label="Allowable GC Error" />
    <param name="rpt_err" type="float" value="0.02" label="Allowable Repeat Error" />
    <param name="rseed" type="integer" value="1" label="Random Number Seed" />

    <!--
      The validators are children of the "input" parameter so that they are
      actually attached to it. Placed as siblings of the param directly under
      the inputs element they are not associated with any parameter.
    -->
    <param format="interval" name="input" type="data" label="BED File of Positive Regions">
      <!-- Reject datasets whose genome build (dbkey) has not been set. -->
      <validator type="unspecified_build" />
      <!-- Require the dataset's dbkey to appear in column 0 of nullseq_indices.loc. -->
      <validator type="dataset_metadata_in_file"
                 filename="nullseq_indices.loc"
                 metadata_name="dbkey"
                 metadata_column="0"
                 message="Sequences are currently unavailable for the specified build." />
    </param>

    <param name="excluded" optional="true" format="interval" type="data" value="None" label="Excluded Regions (optional)" />

    <!--
      Select list populated from nullseq_indices.loc. Column 0 is used both as
      the option value and as "dbkey", column 1 as the display name, and
      column 2 as the "path" field referenced by the command template.
    -->
    <param name="indices_path" type="select" label="Available Datasets">
      <options from_file="nullseq_indices.loc">
        <column name="dbkey" index="0"/>
        <column name="value" index="0"/>
        <column name="name" index="1"/>
        <column name="path" index="2"/>
        <!--filter type="data_meta" ref="input" key="dbkey" column="0" /-->
      </options>
    </param>
  </inputs>

  <outputs>
    <!-- The output dataset is collected from nullseq_output.bed in the job working directory. -->
    <data format="interval" name="nullseq_output" from_work_dir="nullseq_output.bed" />
  </outputs>

  <help>

**What it does**

Takes an input BED file and generates a set of sequences for use as negative data (null sequences) in Train SVM similar in length, GC content and repeat fraction. Uses random sampling for efficiency.

**Parameters**

Fold-Increase: Size of desired null sequence data set expressed as multiple of the size of the input data set.

GC Error, Repeat Error: Acceptable difference between a positive sequence and its corresponding null sequence in terms of GC content, repeat content.

Random Number Seed: Seed for random number generator.

Excluded Regions: Submitted regions will be excluded from null sequence generation.

----

**Example**

Given a BED file containing::

  chr1 10212203 10212303
  chr1 103584748 103584848
  chr1 105299130 105299230
  chr1 106367772 106367872

Tool will output BED file matched in length, GC content and repeat content::

  chr1 3089935 3090035
  chr1 5031335 5031435
  chr1 5103742 5103842
  chr1 5650372 5650472

  </help>
</tool>