0
|
1 <tool id="beta_plus" name="BETA-plus: Binding and Expression Target prediction and motif analysis" version="0.1.0">
|
|
2 <description>Predict the factors (TFs or CRs) direct target genes by combining the binding and expression data, then do motif analysis on target regions</description>
|
|
3 <macros>
|
|
4 <import>beta_macros.xml</import>
|
|
5 </macros>
|
|
6 <expand macro="requirements" />
|
|
<command><![CDATA[
    ## Run BETA plus. The option groups below are pulled in through the
    ## Cheetah includes; stdout and stderr are both redirected to the log dataset.
    BETA plus
    #include source=$common_opts#
    #include source=$genome_opts#
    #include source=$ref_genome_seq_opts#
    #include source=$extended_opts#
    ## "motifs" is declared optional, so only pass the flag when a value is set;
    ## otherwise the flag would be emitted without an argument.
    #if str($motifs) != ''
        --mn $motifs
    #end if
    &> '$log' &&
    ## Assemble the HTML report: the page itself becomes the dataset, and the
    ## scripts, stylesheets and images it needs go into its extra files directory.
    ## Dataset paths are single-quoted so the shell treats each as one word.
    mkdir -p '$motifresult.extra_files_path' &&
    cp BETA_OUTPUT/motifresult/betamotif.html '$motifresult' &&
    cp BETA_OUTPUT/motifresult/*.js '$motifresult.extra_files_path' &&
    cp BETA_OUTPUT/motifresult/*.css '$motifresult.extra_files_path' &&
    cp -r BETA_OUTPUT/motifresult/img '$motifresult.extra_files_path'
]]></command>
|
|
<inputs>
    <!-- Parameter groups supplied by macros (see the macros import of this tool). -->
    <expand macro="common_params"/>
    <expand macro="genome_params"/>
    <expand macro="refGenomeSourceConditional"/>
    <expand macro="extended_params"/>

    <!-- Value passed to BETA plus through its mn option in the command section. -->
    <param
        name="motifs"
        type="float"
        optional="true"
        value="10"
        label="Motifs to retrieve"
        help="a number between 0 and 1 as the p-value cutoff or an integer larger than 1 as the number of motifs">
        <validator
            type="in_range"
            min="0"
            max="20000"
            message="A float between 0 and 1 or an integer greater than 1"/>
    </param>
</inputs>
|
|
32 <expand macro="stdio" />
|
|
<outputs>
    <!-- Combined stdout/stderr of the run; filled by the redirect in the command. -->
    <data name="log" format="txt" label="Log of BETA plus"/>

    <!-- Function prediction and target lists collected from BETA_OUTPUT. -->
    <data name="functionoutput" format="pdf"
          from_work_dir="BETA_OUTPUT/NA_function_prediction.pdf"
          label="BETA functional prediction on ${peakfile.name}"/>
    <data name="uptargetsoutput" format="tabular"
          from_work_dir="BETA_OUTPUT/NA_uptarget.txt"
          label="BETA direct targets prediction on up regulated genes"/>
    <data name="downtargetsoutput" format="tabular"
          from_work_dir="BETA_OUTPUT/NA_downtarget.txt"
          label="BETA direct targets prediction on down regulated genes"/>

    <!-- Peaks associated with the predicted targets. -->
    <data name="uptargetpeaks" format="bed"
          from_work_dir="BETA_OUTPUT/NA_uptarget_associate_peaks.bed"
          label="BETA Uptarget associated peaks"/>
    <data name="downtargetpeaks" format="bed"
          from_work_dir="BETA_OUTPUT/NA_downtarget_associate_peaks.bed"
          label="BETA Downtarget associated peaks"/>

    <!-- Motif tables collected from BETA_OUTPUT/motifresult. -->
    <data name="upmotifs" format="txt"
          from_work_dir="BETA_OUTPUT/motifresult/UP_MOTIFS.txt"
          label="BETA Motifs in up-target regions"/>
    <data name="up_non_motifs" format="txt"
          from_work_dir="BETA_OUTPUT/motifresult/UP_NON_MOTIFS.txt"
          label="BETA Motifs in up-target regions versus non-target regions"/>
    <data name="downmotifs" format="txt"
          from_work_dir="BETA_OUTPUT/motifresult/DOWN_MOTIFS.txt"
          label="BETA Motifs in down-target regions"/>
    <data name="down_non_motifs" format="txt"
          from_work_dir="BETA_OUTPUT/motifresult/DOWN_NON_MOTIFS.txt"
          label="BETA Motifs in down-target regions versus non-target regions"/>
    <data name="differentialmotifs" format="txt"
          from_work_dir="BETA_OUTPUT/motifresult/DIFFERENTIAL_MOTIF_UP_DOWN.txt"
          label="BETA Motifs up-target regions versus down-target regions"/>

    <!-- HTML report; the command copies the page and its assets into place,
         so no from_work_dir is used here. -->
    <data name="motifresult" format="html" label="BETA Motif analysis on target regions"/>
</outputs>
|
|
<tests>
    <!-- Single functional test: hg19 peaks plus a LIMMA-style expression table,
         scored with the "score" method. Outputs are checked for marker strings
         only, not compared byte for byte. -->
    <test>
        <!-- Binding data and peak selection. -->
        <param name="peakfile" value="peaks.bed" ftype="bed" dbkey="hg19"/>
        <param name="distance" value="100000"/>
        <param name="peaknumber" value="10000"/>
        <param name="genomeName" value="hg19"/>

        <!-- Expression data and how to read it. -->
        <param name="exprefile" value="diff_expr.xls" ftype="tabular" dbkey="hg19"/>
        <param name="kind" value="LIM"/>
        <!-- The former type attribute was dropped: it is not part of the test
             param schema, and the value alone is what gets submitted. -->
        <param name="expreinfo" value="2,5,7"/>
        <param name="gname2" value="Refseq"/>
        <param name="diff_fdr" value="1.0"/>
        <param name="diff_amount" value="0.5"/>
        <param name="method" value="score"/>

        <!-- The log must contain the word "Finished". -->
        <output name="log">
            <assert_contents>
                <has_text_matching expression="Finished"/>
            </assert_contents>
        </output>

        <!-- One expected transcript ID in each target list. -->
        <output name="uptargetsoutput">
            <assert_contents>
                <has_text_matching expression="NM_001002231"/>
            </assert_contents>
        </output>
        <output name="downtargetsoutput">
            <assert_contents>
                <has_text_matching expression="NM_001280"/>
            </assert_contents>
        </output>

        <!-- One expected row in the differential motif table. -->
        <output name="differentialmotifs">
            <assert_contents>
                <has_text_matching expression="CDX1\tHomeodomain Family"/>
            </assert_contents>
        </output>
    </test>
</tests>
|
|
82 <help>
|
|
83 **BETA plus**
|
|
84
|
|
85 @EXTERNAL_DOCUMENTATION@
|
|
86
|
|
87 @CITATION_SECTION@
|
|
88
|
|
89 This tool annotates the given intervals and scores with genome
|
|
90 features such as gene body.
|
|
91 Predicts the direct targets of a TF and its active/repressive
|
|
92 function. Also performs motif analysis on the target regions.
|
|
93 It's the major module in CEAS package
|
|
94 which is written by Hyunjin Gene Shin, published in Bioinformatics
|
|
95 (pubmed id:19689956).
|
|
96
|
|
97 .. class:: warningmark
|
|
98
|
|
99 **NEED IMPROVEMENT**
|
|
100
|
|
101 -----
|
|
102
|
|
103 **Parameters**
|
|
104
|
|
105 - **PEAKFILE file** contains peaks for the experiment in a bed
|
|
106 format file. Normally, it's produced by the peak calling tool. It's
|
|
107 required.
|
|
108 - **EXPREFILE file** contains the differentially expressed genes in a tab
|
|
109 delimited text file. It's required.
|
|
110 - **Kind** The kind of your expression file format, LIM for LIMMA standard
|
|
111 output with Microarray, CUF for Cuffdiff standard output with RNA-seq,
|
|
112 BSF for BETA specific format, and O for other formats.
|
|
113 - **genome** hg19 for human and mm9 for mouse. For any other genome, don't set this parameter.
|
|
114 - **genomereference** Genome reference data with fasta format
|
|
115 - **gname2** If this switch is on, gene or transcript IDs in files given
|
|
116 through -e will be considered as official gene symbols, DEFAULT=FALSE
|
|
117 - **EXPREINFO** is the columns info of the geneID, up/down status and statistical
|
|
118 values column of your expression data. NOTE: use a comma as a connector,
|
|
119 for example: 2,5,7 means geneID in the 2nd column, Tscore in the 5th column
|
|
120 and FDR in the 7th column.
|
|
121 - **REFERENCE** is the refgene info file downloaded from UCSC genome browser.
|
|
122 It is a tab delimited text file with gene annotation with refseq and gene symbol.
|
|
123 Input this file only if your genome is neither hg19 nor mm9.
|
|
124 profiling
|
|
125 - **OUTPUT** to specify the output files directory
|
|
126 - **bl** Whether or not to use CTCF boundary file to get the contributed peaks
|
|
127 - **BOUNDARYFILE** is the file with reasonable boundaries if --bl is on and genome
|
|
128 is neither hg19 nor mm9.
|
|
129 - **NAME** specify the name of the output files.
|
|
130 - **DISTANCE** specifies the distance from the gene TSS within which peaks will be considered.
|
|
131 - **DIFF_FDR** specify the differential genes by the 3rd column in file input
|
|
132 via -e, genes with less than this value will be considered as the differentially
|
|
133 changed genes.
|
|
134 - **DIFF_AMOUNT** specify the differential genes the top #(DIFF_AMOUNT) ranked by
|
|
135 the 3rd column in file input via -e, genes ranked in the top # will be considered
|
|
136 as the differentially expressed genes.
|
|
137 - **CUTOFF** specify a cutoff of ks-test in the function prediction part
|
|
138
|
|
139
|
|
140 -----
|
|
141
|
|
142 **Script parameter list of BETA plus**
|
|
143
|
|
144 ::
|
|
145
|
|
146 -h, --help show this help message and exit
|
|
147 -p PEAKFILE, --peakfile PEAKFILE
|
|
148 The bed format of peaks binding sites. (BETA support 3
|
|
149 or 5 columns bed format, CHROM, START, END (NAME,
|
|
150 SCORE))
|
|
151 -e EXPREFILE, --diff_expr EXPREFILE
|
|
152 The differential expression file get from limma for
|
|
153 MicroArray ddata and cuffdiff for RNAseq data
|
|
154 -k {LIM,CUF,BSF,O}, --kind {LIM,CUF,BSF,O}
|
|
155 The kind of your expression file,this is required,it
|
|
156 can be LIM, CUF, BSF, O. LIM for LIMMA standard
|
|
157 format. CUF for CUFDIFF standard format, BSF for BETA
|
|
158 specific format and O for other formats, if is 'O',
|
|
159 columns infor required via --info
|
|
160 -g {hg19,mm9}, --genome {hg19,mm9}
|
|
161 Specify your species, hg19, mm9
|
|
162 --gs GENOMEREFERNCE GenomeReference file with fasta format
|
|
163 --gname2 If this switch is on, gene or transcript IDs in files
|
|
164 given through -e will be considered as official gene
|
|
165 symbols, DEFAULT=FALSE
|
|
166 --info EXPREINFO Specify the geneID, up/down status and statistcal
|
|
167 values column of your expression data,NOTE: use a
|
|
168 comma as an connector. for example: 2,5,7 means geneID
|
|
169 in the 2nd column, Tscore in 5th column and FDR in 7
|
|
170 column DEFAULT:2,5,7 for LIMMA; 2,10,13 for Cuffdiff
|
|
171 and 1,2,3 for BETA specific format
|
|
172 -r REFERENCE, --reference REFERENCE
|
|
173 The refgene info file downloaded from UCSC genome
|
|
174 browser.input this file only if your genome is neither
|
|
175 hg19 nor mm9
|
|
176 -o OUTPUT, --output OUTPUT
|
|
177 The directory to store all the output files, if you
|
|
178 don't set this, files will be output into the current
|
|
179 directory
|
|
180 --bl Whether or not use CTCF boundary to filter peaks
|
|
181 around a gene, DEFAULT=FALSE
|
|
182 --bf BOUNDARYFILE CTCF conserved peaks bed file, use this only when you
|
|
183 set --bl and the genome is neither hg19 nor mm9
|
|
184 --pn PEAKNUMBER The number of peaks you want to consider,
|
|
185 DEFAULT=10000
|
|
186 --method {score,distance}
|
|
187 Define the method to do the TF/CR function prediction,
|
|
188 score for regulatory potential, distance for the
|
|
189 distance to the proximal binding peak. DEFAULT:SCORE
|
|
190 -n NAME, --name NAME This argument is used to name the result file.If not
|
|
191 set, the peakfile name will be used instead
|
|
192 -d DISTANCE, --distance DISTANCE
|
|
193 Set a number which unit is 'base'. It will get peaks
|
|
194 within this distance from gene TSS. default:100000
|
|
195 (100kb)
|
|
196 --df DIFF_FDR Input a number 0~1 as a threshold to pick out the most
|
|
197 significant differential expressed genes by FDR,
|
|
198 DEFAULT = 1, that is select all the genes
|
|
199 --da DIFF_AMOUNT Get the most significant differential expressed genes
|
|
200 by the percentage(0-1) or number(larger than 1)Input a
|
|
201 number between 0-1, the rank based on fdr for example,
|
|
202 2000, so that the script will only consider top 2000
|
|
203 genes as the differentially expressed genes. DEFAULT =
|
|
204 0.5, that is select top 50 percent genes of up and
|
|
205 down seprately. NOTE: if you want to use diff_fdr,
|
|
206 please set this parameter to 1, otherwise it will get
|
|
207 the intersection of these two parameters
|
|
208 -c CUTOFF, --cutoff CUTOFF
|
|
209 Input a number between 0~1 as a threshold to select
|
|
210 the closer target gene list(up regulate or down
|
|
211 regulate or both) with the p value was called by one
|
|
212 side ks-test, DEFAULT = 0.001
|
|
213
|
|
214 </help>
|
|
215 </tool>
|