diff beta_basic.xml @ 0:20453b656907

Imported from capsule None
author jjohnson
date Tue, 16 Sep 2014 13:35:24 -0400
parents
children 9c5241259454
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/beta_basic.xml	Tue Sep 16 13:35:24 2014 -0400
@@ -0,0 +1,188 @@
+<tool id="beta_basic" name="BETA-basic: Binding and Expression Target Analysis" version="0.1.0">
+  <description>Predict the factors (TFs or CRs) direct target genes by combining the binding and expression data</description>
+  <macros>
+    <import>beta_macros.xml</import>
+  </macros>
+  <expand macro="requirements" />
+  <command>
+  BETA basic 
+  #include source=$common_opts#
+  #include source=$genome_opts#
+  #include source=$extended_opts#
+  &amp;> $log
+  </command>
+  <inputs>
+    <expand macro="common_params" />
+    <expand macro="genome_params" />
+    <expand macro="extended_params" />
+  </inputs>
+  <expand macro="stdio" />
+  <outputs>
+    <data format="txt" name="log" label="Log of BETA basic"/>
+    <data format="pdf" name="functionoutput" label="BETA functional prediction on ${peakfile.name}" from_work_dir="BETA_OUTPUT/NA_function_prediction.pdf"/>
+    <data format="tabular" name="uptargetsoutput" label="BETA direct targets prediction on up regulated genes" from_work_dir="BETA_OUTPUT/NA_uptarget.txt"/>
+    <data format="tabular" name="downtargetsoutput" label="BETA direct targets prediction on down regulated genes" from_work_dir="BETA_OUTPUT/NA_downtarget.txt"/>
+    <data format="bed" name="uptargetpeaks" label="BETA Uptarget associated peaks" from_work_dir="BETA_OUTPUT/NA_uptarget_associate_peaks.bed"/>
+    <data format="bed" name="downtargetpeaks" label="BETA Downtarget associated peaks" from_work_dir="BETA_OUTPUT/NA_downtarget_associate_peaks.bed"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name='peakfile' value="peaks.bed" ftype="bed" dbkey="hg19"/>
+      <param name="distance" value="100000"/>
+      <param name="peaknumber" value="10000"/>
+      <param name="genomeName" value="hg19"/>
+      <param name='exprefile' value="diff_expr.xls" ftype="tabular" dbkey="hg19"/>
+      <param name="kind" value="LIM"/>
+      <param name="expreinfo" type="text" value="2,5,7"/>
+      <param name="gname2" value="Refseq"/>
+      <param name="diff_fdr" value="1.0"/>
+      <param name="diff_amount" value="0.5"/>
+      <param name="method" value="score"/>
+      <output name="log">
+        <assert_contents>
+            <has_text_matching expression="Finished" />
+        </assert_contents>
+      </output>
+      <output name="targetsoutput">
+        <assert_contents>
+            <has_text_matching expression="chr19\t4675243\t4723855\tNM_139159\t1.1.*\t-\tDPP" />
+        </assert_contents>
+      </output>
+      <output name="targetpeaks">
+        <assert_contents>
+            <has_text_matching expression="chr19\t4723422\t4724314\tregion_9\tNM_139159\tDPP9\t13\t0.6.*" />
+        </assert_contents>
+      </output>
+    </test>
+  </tests>
+ <help>
+** BETA basic **
+
+@EXTERNAL_DOCUMENTATION@
+
+@CITATION_SECTION@
+
+This tool annotates the given intervals and scores with genome
+features such as gene body. It's the major module in CEAS package
+which is written by Hyunjin Gene Shin, published in Bioinformatics
+(pubmed id:19689956).
+
+.. class:: warningmark
+
+**NEED IMPROVEMENT**
+
+-----
+
+**Parameters**
+
+- **PEAKFILE file** contains peaks for the experiment in a bed
+  format file. Normally, it's produced by the peak calling tool. It's
+  required.
+- **EXPREFILE file** contains the differentially expressed genes in a tab 
+  delimited text file. It's required.
+- **Kind** The kind of your expression file format, LIM for LIMMA standard 
+  output with Microarray, CUF for Cuffdiffs standard output with RNA-seq, 
+  BSF for BETA specific format, and O for other formats.
+- **genome** hg19 for human and mm9 for mouse. Others, don't set this parameter.
+- **gname2** If this switch is on, gene or transcript IDs in files given 
+  through -e will be considered as official gene symbols, DEFAULT=FALSE
+- **EXPREINFO** is the columns info of the geneID, up/down status and statistcal
+  values column of your expression data,NOTE: use a comma as an connector. 
+  for example: 2,5,7 means geneID in the 2nd column, Tscore in 5th column 
+  and FDR in 7 column.
+- **REFERENCE** is the refgene info file downloaded from UCSC genome browser.
+  It is a tab delimited text file with gene annotation with refseq and gene symbol.
+  Input this file only if your genome is neither hg19 nor mm9.
+  profiling
+- **OUTPUT** to specify the output files directory
+- **bl** Whether or not to use CTCF boundary file to get the contributed peaks
+- **BOUNDARYFILE** is the file with reasonable boundaries if --bl is on and genome
+  is neither hg19 nor mm9.
+- **NAME** specify the name of the output files.
+- **DISTANCE** specify the distance wich peaks within it will be considered.
+- **DIFF_FDR** specify the differential genes by the 3rd column in file input
+  via -e, genes with less than this value will be considered as the differentially
+  changed genes.
+- **DIFF_AMOUNT** specify the differential genes the top #(DIFF_AMOUNT) ranked by
+  the 3rd column in file input via -e, genes ranked in the top # will be considered
+  as the differentially expressed genes.
+- **CUTOFF** specify a cutoff of ks-test in the function prediction part
+
+-----
+
+**Script parameter list of BETA basic**
+
+::
+
+  -h, --help            show this help message and exit
+  -p PEAKFILE, --peakfile PEAKFILE
+                        The bed format of peaks binding sites. (BETA support 3
+                        or 5 columns bed format, CHROM, START, END (NAME,
+                        SCORE))
+  -e EXPREFILE, --diff_expr EXPREFILE
+                        The differential expression file get from limma for
+                        MicroArray ddata and cuffdiff for RNAseq data
+  -k {LIM,CUF,BSF,O}, --kind {LIM,CUF,BSF,O}
+                        The kind of your expression file,this is required,it
+                        can be LIM, CUF, BSF, O. LIM for LIMMA standard
+                        format. CUF for CUFDIFF standard format, BSF for BETA
+                        specific format and O for other formats, if is 'O',
+                        columns infor required via --info
+  -g {hg19,mm9}, --genome {hg19,mm9}
+                        Specify your species, hg19, mm9. For other genome
+                        assembily versions of human and mouse or other
+                        species, ignore this parameter.
+  --gname2              If this switch is on, gene or transcript IDs in files
+                        given through -e will be considered as official gene
+                        symbols, DEFAULT=FALSE
+  --info EXPREINFO      Specify the geneID, up/down status and statistcal
+                        values column of your expression data,NOTE: use a
+                        comma as an connector. for example: 2,5,7 means geneID
+                        in the 2nd column, Tscore in 5th column and FDR in 7
+                        column DEFAULT:2,5,7 for LIMMA; 2,10,13 for Cuffdiff
+                        and 1,2,3 for BETA specific format
+  -r REFERENCE, --reference REFERENCE
+                        The refgene info file downloaded from UCSC genome
+                        browser.input this file only if your genome is neither
+                        hg19 nor mm9
+  -o OUTPUT, --output OUTPUT
+                        The directory to store all the output files, if you
+                        don't set this, files will be output into the current
+                        directory
+  --bl                  Whether or not use CTCF boundary to filter peaks
+                        around a gene, DEFAULT=FALSE
+  --bf BOUNDARYFILE     CTCF conserved peaks bed file, use this only when you
+                        set --bl and the genome is neither hg19 nor mm9
+  --pn PEAKNUMBER       The number of peaks you want to consider,
+                        DEFAULT=10000
+  --method {score,distance}
+                        Define the method to do the TF/CR function prediction,
+                        score for regulatory potential, distance for the
+                        distance to the proximal binding peak. DEFAULT:SCORE
+  -n NAME, --name NAME  This argument is used to name the result file.If not
+                        set, the peakfile name will be used instead
+  -d DISTANCE, --distance DISTANCE
+                        Set a number which unit is 'base'. It will get peaks
+                        within this distance from gene TSS. default:100000
+                        (100kb)
+  --df DIFF_FDR         Input a number 0~1 as a threshold to pick out the most
+                        significant differential expressed genes by FDR,
+                        DEFAULT = 1, that is select all the genes
+  --da DIFF_AMOUNT      Get the most significant differential expressed genes
+                        by the percentage(0-1) or number(larger than 1)Input a
+                        number between 0-1, the rank based on fdr for example,
+                        2000, so that the script will only consider top 2000
+                        genes as the differentially expressed genes. DEFAULT =
+                        0.5, that is select top 50 percent genes of up and
+                        down seprately. NOTE: if you want to use diff_fdr,
+                        please set this parameter to 1, otherwise it will get
+                        the intersection of these two parameters
+  -c CUTOFF, --cutoff CUTOFF
+                        Input a number between 0~1 as a threshold to select
+                        the closer target gene list(up regulate or down
+                        regulate or both) with the p value was called by one
+                        side ks-test, DEFAULT = 0.001
+
+  </help>
+
+</tool>