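<?xml version="1.0" encoding="UTF-8"?>
<!--
  Repository view header (not part of the tool definition):

  view beta_basic.xml @ 0:20453b656907

  Imported from capsule None
  author jjohnson
  date Tue, 16 Sep 2014 13:35:24 -0400
  parents
  children 9c5241259454
  line wrap: on
  line source
-->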

<tool id="beta_basic" name="BETA-basic: Binding and Expression Target Analysis" version="0.1.0">
  <description>Predict a factor's (TF or CR) direct target genes by combining binding and expression data</description>
  <macros>
    <import>beta_macros.xml</import>
  </macros>
  <expand macro="requirements" />
  <!--
    Command line for "BETA basic".
    The three Cheetah #include directives splice in option strings held in
    $common_opts, $genome_opts and $extended_opts (not defined in this file;
    presumably provided by the imported macros - confirm).
    Both stdout and stderr are captured in the "log" output. The portable
    "> file 2>&1" form is used instead of the bash-only "&>" shorthand, which
    a plain POSIX sh would parse as "run in background, then truncate file".
    The log path is quoted so that an unusual path cannot split the redirect.
  -->
  <command>
  BETA basic
  #include source=$common_opts#
  #include source=$genome_opts#
  #include source=$extended_opts#
  &gt; '$log' 2&gt;&amp;1
  </command>
  <inputs>
    <!-- All input parameters are supplied by macro expansion. The macro
         definitions are not in this file (presumably in the imported
         beta_macros.xml - confirm). -->
    <expand macro="common_params" />
    <expand macro="genome_params" />
    <expand macro="extended_params" />
  </inputs>
  <expand macro="stdio" />
  <outputs>
    <!-- Text log; the command redirects the BETA run's output into this dataset. -->
    <data name="log"
          format="txt"
          label="Log of BETA basic"/>

    <!-- The remaining datasets are collected from files under BETA_OUTPUT/
         in the job working directory (see each from_work_dir). -->

    <!-- Functional prediction plot (PDF); label includes the peak file name. -->
    <data name="functionoutput"
          format="pdf"
          label="BETA functional prediction on ${peakfile.name}"
          from_work_dir="BETA_OUTPUT/NA_function_prediction.pdf"/>

    <!-- Predicted direct targets, one table per regulation direction. -->
    <data name="uptargetsoutput"
          format="tabular"
          label="BETA direct targets prediction on up regulated genes"
          from_work_dir="BETA_OUTPUT/NA_uptarget.txt"/>
    <data name="downtargetsoutput"
          format="tabular"
          label="BETA direct targets prediction on down regulated genes"
          from_work_dir="BETA_OUTPUT/NA_downtarget.txt"/>

    <!-- Peaks associated with the predicted targets, one BED per direction. -->
    <data name="uptargetpeaks"
          format="bed"
          label="BETA Uptarget associated peaks"
          from_work_dir="BETA_OUTPUT/NA_uptarget_associate_peaks.bed"/>
    <data name="downtargetpeaks"
          format="bed"
          label="BETA Downtarget associated peaks"
          from_work_dir="BETA_OUTPUT/NA_downtarget_associate_peaks.bed"/>
  </outputs>
  <tests>
    <!--
      Functional test: runs BETA basic on peaks.bed and diff_expr.xls (hg19,
      LIMMA-format expression columns 2,5,7) and checks the log plus one target
      table and one associated-peaks BED.

      The original test asserted on outputs named "targetsoutput" and
      "targetpeaks", which are not declared in <outputs>; only the up*/down*
      variants exist. They are mapped to the up-regulated outputs here.
      NOTE(review): confirm against the test data whether the NM_139159 (DPP9)
      rows belong to the up or the down target files and switch to
      "downtargetsoutput" / "downtargetpeaks" if needed.
    -->
    <test>
      <param name="peakfile" value="peaks.bed" ftype="bed" dbkey="hg19"/>
      <param name="distance" value="100000"/>
      <param name="peaknumber" value="10000"/>
      <param name="genomeName" value="hg19"/>
      <param name="exprefile" value="diff_expr.xls" ftype="tabular" dbkey="hg19"/>
      <param name="kind" value="LIM"/>
      <!-- "type" is not an attribute of a test <param>; only name/value are needed. -->
      <param name="expreinfo" value="2,5,7"/>
      <param name="gname2" value="Refseq"/>
      <param name="diff_fdr" value="1.0"/>
      <param name="diff_amount" value="0.5"/>
      <param name="method" value="score"/>
      <output name="log">
        <assert_contents>
            <has_text_matching expression="Finished" />
        </assert_contents>
      </output>
      <output name="uptargetsoutput">
        <assert_contents>
            <has_text_matching expression="chr19\t4675243\t4723855\tNM_139159\t1.1.*\t-\tDPP" />
        </assert_contents>
      </output>
      <output name="uptargetpeaks">
        <assert_contents>
            <has_text_matching expression="chr19\t4723422\t4724314\tregion_9\tNM_139159\tDPP9\t13\t0.6.*" />
        </assert_contents>
      </output>
    </test>
  </tests>
 <help>
**BETA basic**

@EXTERNAL_DOCUMENTATION@

@CITATION_SECTION@

This tool annotates the given intervals and scores with genome
features such as gene body. It's the major module in CEAS package
which is written by Hyunjin Gene Shin, published in Bioinformatics
(pubmed id:19689956).

.. class:: warningmark

**NEED IMPROVEMENT**

-----

**Parameters**

- **PEAKFILE file** contains peaks for the experiment in a bed
  format file. Normally, it's produced by the peak calling tool. It's
  required.
- **EXPREFILE file** contains the differentially expressed genes in a tab 
  delimited text file. It's required.
- **Kind** The kind of your expression file format, LIM for LIMMA standard 
  output with Microarray, CUF for Cuffdiffs standard output with RNA-seq, 
  BSF for BETA specific format, and O for other formats.
- **genome** hg19 for human and mm9 for mouse. For any other genome, don't set this parameter.
- **gname2** If this switch is on, gene or transcript IDs in files given 
  through -e will be considered as official gene symbols, DEFAULT=FALSE
- **EXPREINFO** is the column info for the geneID, up/down status and statistical
  values columns of your expression data. NOTE: use a comma as a connector,
  for example: 2,5,7 means geneID in the 2nd column, Tscore in the 5th column
  and FDR in the 7th column.
- **REFERENCE** is the refgene info file downloaded from UCSC genome browser.
  It is a tab delimited text file with gene annotation with refseq and gene symbol.
  Input this file only if your genome is neither hg19 nor mm9.
- **OUTPUT** to specify the output files directory
- **bl** Whether or not to use CTCF boundary file to get the contributed peaks
- **BOUNDARYFILE** is the file with reasonable boundaries if --bl is on and genome
  is neither hg19 nor mm9.
- **NAME** specify the name of the output files.
- **DISTANCE** specify the distance from the gene TSS within which peaks will be considered.
- **DIFF_FDR** specify the differential genes by the 3rd column in file input
  via -e, genes with less than this value will be considered as the differentially
  changed genes.
- **DIFF_AMOUNT** specify the differential genes as the top #(DIFF_AMOUNT) genes ranked by
  the 3rd column in the file input via -e; genes ranked in the top # will be considered
  as the differentially expressed genes.
- **CUTOFF** specify a cutoff of ks-test in the function prediction part

-----

**Script parameter list of BETA basic**

::

  -h, --help            show this help message and exit
  -p PEAKFILE, --peakfile PEAKFILE
                        The bed format of peaks binding sites. (BETA support 3
                        or 5 columns bed format, CHROM, START, END (NAME,
                        SCORE))
  -e EXPREFILE, --diff_expr EXPREFILE
                        The differential expression file get from limma for
                        MicroArray ddata and cuffdiff for RNAseq data
  -k {LIM,CUF,BSF,O}, --kind {LIM,CUF,BSF,O}
                        The kind of your expression file,this is required,it
                        can be LIM, CUF, BSF, O. LIM for LIMMA standard
                        format. CUF for CUFDIFF standard format, BSF for BETA
                        specific format and O for other formats, if is 'O',
                        columns infor required via --info
  -g {hg19,mm9}, --genome {hg19,mm9}
                        Specify your species, hg19, mm9. For other genome
                        assembily versions of human and mouse or other
                        species, ignore this parameter.
  --gname2              If this switch is on, gene or transcript IDs in files
                        given through -e will be considered as official gene
                        symbols, DEFAULT=FALSE
  --info EXPREINFO      Specify the geneID, up/down status and statistcal
                        values column of your expression data,NOTE: use a
                        comma as an connector. for example: 2,5,7 means geneID
                        in the 2nd column, Tscore in 5th column and FDR in 7
                        column DEFAULT:2,5,7 for LIMMA; 2,10,13 for Cuffdiff
                        and 1,2,3 for BETA specific format
  -r REFERENCE, --reference REFERENCE
                        The refgene info file downloaded from UCSC genome
                        browser.input this file only if your genome is neither
                        hg19 nor mm9
  -o OUTPUT, --output OUTPUT
                        The directory to store all the output files, if you
                        don't set this, files will be output into the current
                        directory
  --bl                  Whether or not use CTCF boundary to filter peaks
                        around a gene, DEFAULT=FALSE
  --bf BOUNDARYFILE     CTCF conserved peaks bed file, use this only when you
                        set --bl and the genome is neither hg19 nor mm9
  --pn PEAKNUMBER       The number of peaks you want to consider,
                        DEFAULT=10000
  --method {score,distance}
                        Define the method to do the TF/CR function prediction,
                        score for regulatory potential, distance for the
                        distance to the proximal binding peak. DEFAULT:SCORE
  -n NAME, --name NAME  This argument is used to name the result file.If not
                        set, the peakfile name will be used instead
  -d DISTANCE, --distance DISTANCE
                        Set a number which unit is 'base'. It will get peaks
                        within this distance from gene TSS. default:100000
                        (100kb)
  --df DIFF_FDR         Input a number 0~1 as a threshold to pick out the most
                        significant differential expressed genes by FDR,
                        DEFAULT = 1, that is select all the genes
  --da DIFF_AMOUNT      Get the most significant differential expressed genes
                        by the percentage(0-1) or number(larger than 1)Input a
                        number between 0-1, the rank based on fdr for example,
                        2000, so that the script will only consider top 2000
                        genes as the differentially expressed genes. DEFAULT =
                        0.5, that is select top 50 percent genes of up and
                        down seprately. NOTE: if you want to use diff_fdr,
                        please set this parameter to 1, otherwise it will get
                        the intersection of these two parameters
  -c CUTOFF, --cutoff CUTOFF
                        Input a number between 0~1 as a threshold to select
                        the closer target gene list(up regulate or down
                        regulate or both) with the p value was called by one
                        side ks-test, DEFAULT = 0.001

  </help>

</tool>