<!--
  Galaxy tool wrapper: Generate Null Sequence (kmersvm_nullseq).

  Runs scripts/nullseq_generate.py on a BED/interval dataset of positive
  regions and collects nullseq_output.bed from the job working directory
  as the tool output.
-->
<tool id="kmersvm_nullseq" name="Generate Null Sequence">
  <description>using random sampling from genomic DNA</description>

  <!--
    Cheetah command template. The -e option is emitted only when an
    "Excluded Regions" dataset has been selected; an unset optional data
    parameter stringifies to "None". The last positional argument is the
    "path" column of the selected nullseq_indices.loc entry.
  -->
  <command interpreter="python">scripts/nullseq_generate.py -q
    #if str($excluded) !="None":
      -e $excluded
    #end if
    -x $fold -r $rseed -g $gc_err -t $rpt_err $input $dbkey ${indices_path.fields.path}
  </command>

  <inputs>
    <param name="fold" type="integer" value="1" label="# of Fold-Increase" />
    <param name="gc_err" type="float" value="0.02" label="Allowable GC Error" />
    <param name="rpt_err" type="float" value="0.02" label="Allowable Repeat Error" />
    <param name="rseed" type="integer" value="1" label="Random Number Seed" />

    <!--
      Validators must be children of the parameter they validate: the input
      dataset needs a specified build, and that build's dbkey must appear in
      column 0 of nullseq_indices.loc.
    -->
    <param format="interval" name="input" type="data" label="BED File of Positive Regions">
      <validator type="unspecified_build" />
      <validator type="dataset_metadata_in_file"
                 filename="nullseq_indices.loc"
                 metadata_name="dbkey"
                 metadata_column="0"
                 message="Sequences are currently unavailable for the specified build." />
    </param>

    <param name="excluded" optional="true" format="interval" type="data" value="None" label="Excluded Regions (optional)" />

    <!-- nullseq_indices.loc columns: 0 = dbkey (also the option value), 1 = display name, 2 = path. -->
    <param name="indices_path" type="select" label="Available Datasets">
      <options from_file="nullseq_indices.loc">
        <column name="dbkey" index="0"/>
        <column name="value" index="0"/>
        <column name="name" index="1"/>
        <column name="path" index="2"/>
        <!--filter type="data_meta" ref="input" key="dbkey" column="0" /-->
      </options>
    </param>
  </inputs>

  <outputs>
    <data format="interval" name="nullseq_output" from_work_dir="nullseq_output.bed" />
  </outputs>

  <tests>
    <test>
      <param name="input" value="nullseq_test.bed" ftype="bed" />
      <param name="fold" value="1" />
      <param name="gc_err" value="0.02" />
      <param name="rpt_err" value="0.02" />
      <param name="rseed" value="1" />
      <param name="indices_path" value="hg19" />
      <!-- The output name must match the <data name="..."> declared in <outputs>. -->
      <output name="nullseq_output" file="nullseq_output.bed" />
    </test>
  </tests>

  <help>

**What it does**

Takes an input BED file and generates a set of sequences for use as negative data (null sequences) in Train SVM similar in length, GC content and repeat fraction. Uses random sampling for efficiency.

**Parameters**

Fold-Increase: Size of desired null sequence data set expressed as multiple of the size of the input data set.

GC Error, Repeat Error: Acceptable difference between a positive sequence and its corresponding null sequence in terms of GC content, repeat content.

Random Number Seed: Seed for random number generator.

Excluded Regions: Submitted regions will be excluded from null sequence generation.

----

**Example**

Given a BED file containing::

  chr1 10212203 10212303
  chr1 103584748 103584848
  chr1 105299130 105299230
  chr1 106367772 106367872

Tool will output BED file matched in length, GC content and repeat content::

  chr1 3089935 3090035
  chr1 5031335 5031435
  chr1 5103742 5103842
  chr1 5650372 5650472

  </help>
</tool>