view ceas.xml @ 0:97bd5bb4204c default tip

commit
author ryo_tas <yamanaka@genome.rcast.u-tokyo.ac.jp>
date Tue, 30 Dec 2014 18:45:34 +0900
parents
children
line wrap: on
line source

<tool name="CEAS: Enrichment on chromosome and annotation" id="ceas_ceas">
  <description>Annotate the given intervals and scores with genome features such as gene body</description>
  <!-- Runs the generated "shscript" configfile (declared under configfiles) with /bin/bash.
       NOTE(review): interpreter="command" is unusual; presumably it relies on the shell
       builtin named "command" to launch /bin/bash. Confirm against the Galaxy version in use. -->
  <command interpreter="command">/bin/bash $shscript </command>
  <inputs>
    <!-- Optional wiggle score track. The unspecified_build validator requires the dataset
         to have a genome build assigned. -->
    <param format="wig" name="wfile" type="data" label="wig file" optional="true">
      <validator type="unspecified_build" />
    </param>
    <!-- Optional BED regions. The label states the limit that the wrapper script actually
         enforces: it rejects BED files with more than 500000 lines. -->
    <param format="bed" name="bfile" type="data" label="BED file(maximum 500000 lines)" optional="true">
      <validator type="unspecified_build" />
    </param>
    <!-- Numeric CEAS options. Every parameter here is range-checked by an in_range
         validator, so all of them are declared as integers; the interval and range
         sizes were previously free text, which allowed non-numeric input to reach a
         numeric range check. The wrapper script stringifies each value when building
         the sizes and bisizes options, so integer parameters are used unchanged there. -->
    <param name="span" type="integer" label="Span" value="3000">
      <validator type="in_range" max="1000000" min="100" message="The Span is out of range, the parameter has to be between 100 to 1000000" />
    </param>
    <param name="pfres" type="integer" label="Profiling resolution" value="50">
      <validator type="in_range" max="1000" min="10" message="The Profiling Resolution is out of range, the parameter has to be between 10 to 1000" />
    </param>
    <!-- The three promoter/downstream sizes are joined with commas into the sizes option. -->
    <param name="lowersize" type="integer" label="Promoter/downstream lower-interval" value="1000">
      <validator type="in_range" max="10000" min="100" message="The lower-interval is out of range, the parameter has to be between 100 to 10000" />
    </param>
    <param name="middlesize" type="integer" label="Promoter/downstream middle-interval" value="2000">
      <validator type="in_range" max="10000" min="100" message="The middle-interval is out of range, the parameter has to be between 100 to 10000" />
    </param>
    <param name="uppersize" type="integer" label="Promoter/downstream upper-interval" value="3000">
      <validator type="in_range" max="10000" min="100" message="The upper-interval is out of range, the parameter has to be between 100 to 10000" />
    </param>
    <!-- The two bidirectional-promoter sizes are joined with a comma into the bisizes option. -->
    <param name="lowerbisize" type="integer" label="Bi-Promoter lower range" value="2500">
      <validator type="in_range" max="10000" min="100" message="The lower-range is out of range, the parameter has to be between 100 to 10000" />
    </param>
    <param name="upperbisize" type="integer" label="Bi-Promoter upper range" value="5000">
      <validator type="in_range" max="10000" min="100" message="The upper-range is out of range, the parameter has to be between 100 to 10000" />
    </param>
    <param name="reldist" type="integer" label="Relative distance" value="3000">
      <validator type="in_range" max="10000" min="100" message="The Relative distance is out of range, the parameter has to be between 100 to 10000" />
    </param>
    <!-- Output image format. The outputs section switches the "output" dataset to pdf
         when PDF is chosen; when PNG is chosen the wrapper script converts the PDF
         to a single PNG image. -->
    <param type="select" name="imagetype" display="radio" label="Image Type">
      <option value="PNG">PNG format</option>
      <option value="PDF">PDF format</option>
    </param>

    <!-- Optional gene groups for the signal profiling plots. When enabled, one gene
         list with its label is mandatory and further list/label pairs can be added
         through the repeat. -->
    <conditional name="genegroup">
      <param name="enable" type="select" label="Specify gene list in the signal profiling" force_select="true">
        <option value="no">No</option>
        <option value="yes">Yes</option>
      </param>

      <!-- Nothing extra to ask for when gene groups are disabled. -->
      <when value="no" />

      <when value="yes">
        <!-- First (mandatory) gene list and its legend label. -->
        <param format="text" name="genelist" type="data" label="Gene List" optional="false" />
        <param name="label" type="text" label="Gene List Label" optional="false" />
        <!-- Any number of additional gene list / label pairs. -->
        <repeat name="more" title="Gene Lists">
          <param format="text" name="genelist" type="data" label="Gene List" optional="false" />
          <param name="label" type="text" label="Gene List Label" optional="false" />
        </repeat>
        <!-- The selected value is appended verbatim after the group-names option on the
             command line, hence the leading space in the genesymbol value. -->
        <param name="idtype" type="select" label="Are they ...">
          <option value="">refseq</option>
          <option value=" --gname2">genesymbol</option>
        </param>
      </when>
    </conditional>

  </inputs>

  <outputs>
    <!-- Result image: png by default, switched to pdf when the imagetype input is PDF. -->
    <data format="png" name="output">
      <change_format>
        <when input="imagetype" value="PDF" format="pdf" />
      </change_format>
    </data>
    <!-- Combined standard output and standard error of the ceas run. -->
    <data format="txt" name="log" label="ceas job log" />
  </outputs>

  <configfiles>
    <configfile name="shscript">
#!/bin/bash
#import os

## Shell-level special characters are produced through chr() so that the generated
## script can contain a dollar sign (36), greater-than (62), less-than (60) and
## ampersand (38) without clashing with template placeholders or XML markup.
#set $dollar = chr(36)
#set $gt = chr(62)
#set $lt = chr(60)
#set $ad = chr(38)

## At least one of the two optional inputs must be supplied. The message goes to
## standard error and the script stops with a non-zero status so the failure is
## reported through the exit code as well as through the error text.
if [ $bfile == "None" ];then
    if [ $wfile == "None" ];then
        echo "Either wig or bed file is required!" ${gt}${ad}2
        exit 1
    fi
fi


## Writes a shell assignment of dbkey into the generated script, taken from the BED
## dataset metadata when a BED file was supplied and otherwise from the wig dataset.
## NOTE(review): this shell variable does not appear to be read anywhere later in the
## script; the template-level dbkey placeholder used for the annotation paths is
## presumably provided by Galaxy itself. Confirm before relying on either.
#if $bfile != None
    dbkey=$bfile.metadata.dbkey
#elif $wfile != None
    dbkey=$wfile.metadata.dbkey
#end if


##REMOVING WIG VALIDATORS
##if [ $wfile != "None" ];then
##    wigsize=`du -b $wfile | awk '{print ${dollar}1}'`
##    
##    if [[ ${dollar}wigsize -gt 2097152000 ]];then
##        echo "wig file is too big! 2G is the maximum!" ${gt}${ad}2
##        exit;
##    fi
##fi


#if $genegroup.enable == "yes"
## Sanity checks for the first gene list and its label, emitted only when gene groups
## are enabled. Each failed check writes its message to standard error and stops the
## script with a non-zero status so the failure is visible through the exit code.
## NOTE(review): only the first gene list and label are checked here; the additional
## pairs from the repeat are not validated.
#set $gngroups = str($genegroup.genelist)
#set $gnlabels = str($genegroup.label)
## Line count of the gene list file; it must lie between 100 and 100000.
lines=`wc -l $gngroups | tail -1 | awk '{print ${dollar}1}'`
if [[ ${dollar}lines -gt 100000 ]];then
    echo "Total lines of the gene list has to between 100 and 100000!" ${gt}${ad}2;
    exit 1;
fi
if [[ ${dollar}lines -lt 100 ]];then
    echo "Total lines of the gene list has to between 100 and 100000!" ${gt}${ad}2;
    exit 1;
fi
## Character length of the label; at most 255 characters are accepted.
genelength=`echo $gnlabels |awk '{print length($0)}'`
if [[ ${dollar}genelength -gt 255 ]];then
    echo "Gene List Label exceed the limit of 255 characters!" ${gt}${ad}2;
    exit 1;
fi
#end if
## Builds the two gene group command line fragments used by the ceas call further down.
## With gene groups enabled, the first list and label are extended with every entry of
## the repeat, joined by commas; the label fragment is wrapped in single quotes and is
## followed by the value of the idtype select (empty, or the gname2 flag).
## With gene groups disabled both fragments are empty strings.
#if $genegroup.enable == "yes"
#set $gngroups = str($genegroup.genelist)
#set $gnlabels = str($genegroup.label)
#for $m in $genegroup.more
#set $gngroups = $gngroups+","+str($m.genelist)
#set $gnlabels = $gnlabels+","+str($m.label)
#end for
#set $gngroupspara = "--gn-groups="+$gngroups
#set $gnlabelspara = "--gn-group-names='"+$gnlabels+"'"+str($genegroup.idtype.value)
#else
#set $gngroupspara = ""
#set $gnlabelspara = ""
#end if

## Absolute tool directory; the BED format checker is looked up below it.
#set $path = $os.path.abspath($__app__.config.tool_path)

## Pass the wig file to ceas only when one was supplied.
WIG=""
if [ $wfile != "None" ]; then
   WIG="-w $wfile"
fi

## Pass the BED file to ceas only when one was supplied, after checking its size and
## format. Each failed check writes to standard error and stops the script with a
## non-zero status so the failure is visible through the exit code.
BED=""
if [ $bfile != "None" ]; then
   BED="-b $bfile"
   lines=`wc -l $bfile | tail -1 | awk '{print ${dollar}1}'`
   format=`$path/validation/fcfunc.py $bfile`

   ## The message reports the limit that is actually enforced (500000 lines).
   if [[ ${dollar}lines -gt 500000 ]];then
      echo "BED file is too big! 500K lines are the maximum!" ${gt}${ad}2
      exit 1;
   fi
   ## The checker output is compared with the word passed; anything else is
   ## forwarded to standard error as the failure reason.
   if [[ ${dollar}format != "passed" ]]; then
      echo ${dollar}format ${gt}${ad}2
      exit 1;
   fi
fi

## Comma-joined promoter/downstream sizes and bidirectional-promoter sizes.
#set $sizes = str($lowersize) + "," + str($middlesize) + "," + str($uppersize)
#set $bisizes = str($lowerbisize) + "," + str($upperbisize)

## Per-build resources under the configured static library path: the gene annotation
## table and the chromosome length file, both selected by dbkey.
#set $gtpath = os.path.join( os.path.abspath($__app__.config.cistrome_static_library_path), "ceaslib", "GeneTable", $dbkey )
#set $length_file = os.path.join( os.path.abspath($__app__.config.cistrome_static_library_path), "chromLen", $dbkey+".len" )


## Run ceas for wig input or when no wig file was given, and ceasBW (with the extra
## chromosome length file) for bigwig input. Standard output and standard error of the
## run are both redirected into the log dataset.
## NOTE(review): the wfile input is declared with format wig only, so the bigwig branch
## may be unreachable; a wig-typed input with any other extension runs nothing. Confirm.
#if str($wfile) != "None"
#if $wfile.extension == "wig"
ceas ${dollar}WIG ${dollar}BED --span=$span --pf-res=$pfres --sizes=$sizes --bisizes=$bisizes \
   --rel-dist=$reldist -g $gtpath $gngroupspara $gnlabelspara --name=ceas_out ${ad}${gt} $log
#elif $wfile.extension == "bigwig"
ceasBW ${dollar}WIG ${dollar}BED --span=$span --pf-res=$pfres --sizes=$sizes --bisizes=$bisizes -l $length_file\
   --rel-dist=$reldist -g $gtpath $gngroupspara $gnlabelspara --name=ceas_out ${ad}${gt} $log
#end if
#else
ceas ${dollar}WIG ${dollar}BED --span=$span --pf-res=$pfres --sizes=$sizes --bisizes=$bisizes \
   --rel-dist=$reldist -g $gtpath $gngroupspara $gnlabelspara --name=ceas_out ${ad}${gt} $log
#end if


## Feed ceas_out.R to R on standard input, discarding all of its output.
## NOTE(review): ceas_out.R and ceas_out.pdf are presumably produced by the ceas run
## and this R step respectively (named after the ceas_out experiment name). Confirm.
R --vanilla ${lt} ceas_out.R ${ad}${gt}/dev/null
## For PNG output the PDF is converted to PNG files, the numbered ones are stacked
## vertically into a single image, and that image becomes the output dataset.
## Otherwise the PDF itself is moved to the output dataset.
if [ $imagetype == "PNG" ]; then
convert ceas_out.pdf ceas_out.png
convert ceas_out-*.png -append ceas_out_joint.png
mv ceas_out_joint.png $output
else
mv ceas_out.pdf $output
fi

    </configfile>
  </configfiles>
 <tests>
  <!-- CEAS_1: all options at their default values, PDF output.
       The genome build is attached to the input datasets through dbkey because the
       tool declares no "genome" input, and the job log is compared against the "log"
       output instead of a second expectation on "output". -->
  <test maxseconds="3600" name="CEAS_1">
    <param name="wfile" value="wiggle.wig" dbkey="hg18" />
    <param name="bfile" value="bedfile.bed" dbkey="hg18" />
    <param name="span" value="3000" />
    <param name="pfres" value="50" />
    <param name="lowersize" value="1000" />
    <param name="middlesize" value="2000" />
    <param name="uppersize" value="3000" />
    <param name="lowerbisize" value="2500" />
    <param name="upperbisize" value="5000" />
    <param name="reldist" value="3000" />
    <param name="imagetype" value="PDF" />
    <param name="enable" value="no" />
    <output name="output" file="ceas_1/ceas_1.pdf" />
    <output name="log" file="ceas_1/ceas_1.log" lines_diff="200" />
  </test>
  <!-- CEAS_2: span 1000 and profiling resolution 250, PDF output.
       The genome build is attached to the input datasets through dbkey because the
       tool declares no "genome" input, and the job log is compared against the "log"
       output instead of a second expectation on "output". -->
  <test maxseconds="3600" name="CEAS_2">
    <param name="wfile" value="wiggle.wig" dbkey="hg18" />
    <param name="bfile" value="bedfile.bed" dbkey="hg18" />
    <param name="span" value="1000" />
    <param name="pfres" value="250" />
    <param name="lowersize" value="1000" />
    <param name="middlesize" value="2000" />
    <param name="uppersize" value="3000" />
    <param name="lowerbisize" value="2500" />
    <param name="upperbisize" value="5000" />
    <param name="reldist" value="3000" />
    <param name="imagetype" value="PDF" />
    <param name="enable" value="no" />
    <output name="output" file="ceas_2/ceas_2.pdf" />
    <output name="log" file="ceas_2/ceas_2.log" lines_diff="200" />
  </test>
  <!-- CEAS_3: profiling resolution 150 and bi-promoter ranges 5000/10000, PDF output.
       The genome build is attached to the input datasets through dbkey because the
       tool declares no "genome" input, and the job log is compared against the "log"
       output instead of a second expectation on "output". -->
  <test maxseconds="3600" name="CEAS_3">
    <param name="wfile" value="wiggle.wig" dbkey="hg18" />
    <param name="bfile" value="bedfile.bed" dbkey="hg18" />
    <param name="span" value="3000" />
    <param name="pfres" value="150" />
    <param name="lowersize" value="1000" />
    <param name="middlesize" value="2000" />
    <param name="uppersize" value="3000" />
    <param name="lowerbisize" value="5000" />
    <param name="upperbisize" value="10000" />
    <param name="reldist" value="3000" />
    <param name="imagetype" value="PDF" />
    <param name="enable" value="no" />
    <output name="output" file="ceas_3/ceas_3.pdf" />
    <output name="log" file="ceas_3/ceas_3.log" lines_diff="200" />
  </test>
  <!-- CEAS_4: profiling resolution 500 and bi-promoter ranges 5000/10000, PDF output.
       The genome build is attached to the input datasets through dbkey because the
       tool declares no "genome" input, and the job log is compared against the "log"
       output instead of a second expectation on "output". -->
  <test maxseconds="3600" name="CEAS_4">
    <param name="wfile" value="wiggle.wig" dbkey="hg18" />
    <param name="bfile" value="bedfile.bed" dbkey="hg18" />
    <param name="span" value="3000" />
    <param name="pfres" value="500" />
    <param name="lowersize" value="1000" />
    <param name="middlesize" value="2000" />
    <param name="uppersize" value="3000" />
    <param name="lowerbisize" value="5000" />
    <param name="upperbisize" value="10000" />
    <param name="reldist" value="3000" />
    <param name="imagetype" value="PDF" />
    <param name="enable" value="no" />
    <output name="output" file="ceas_4/ceas_4.pdf" />
    <output name="log" file="ceas_4/ceas_4.log" lines_diff="200" />
  </test>
  <!-- CEAS_5: span 6000, profiling resolution 500 and bi-promoter ranges 5000/10000,
       PDF output.
       The genome build is attached to the input datasets through dbkey because the
       tool declares no "genome" input, and the job log is compared against the "log"
       output instead of a second expectation on "output". -->
  <test maxseconds="3600" name="CEAS_5">
    <param name="wfile" value="wiggle.wig" dbkey="hg18" />
    <param name="bfile" value="bedfile.bed" dbkey="hg18" />
    <param name="span" value="6000" />
    <param name="pfres" value="500" />
    <param name="lowersize" value="1000" />
    <param name="middlesize" value="2000" />
    <param name="uppersize" value="3000" />
    <param name="lowerbisize" value="5000" />
    <param name="upperbisize" value="10000" />
    <param name="reldist" value="3000" />
    <param name="imagetype" value="PDF" />
    <param name="enable" value="no" />
    <output name="output" file="ceas_5/ceas_5.pdf" />
    <output name="log" file="ceas_5/ceas_5.log" lines_diff="200" />
  </test>
</tests> 
  <help>
This tool annotates the given intervals and scores with genome
features such as gene body. It's the major module in CEAS package
which is written by Hyunjin Gene Shin, published in Bioinformatics
(pubmed id:19689956).

.. class:: warningmark

**NEED IMPROVEMENT**

-----

**Parameters**

- **WIGGLE file** contains the scores for the experiment in a wiggle
  format file. Normally, it's produced by the peak calling tool. It's
  optional.
- **BED file** contains the peak locations for the experiment in a BED
  format file.
- **Span** from TSS and TTS in the gene-centered annotation. ChIP
  regions within this range from TSS and TTS are considered when
  calculating the coverage rates in promoter and downstream.
- **Profiling resolution** is the WIGGLE profiling resolution.
- **Promoter/downstream intervals** for ChIP region annotation are
  comma-separated three values or a single value can be given. If a
  single value is given, it will be segmented into three equal
  fractions (ie, 3000 is equivalent to 1000,2000,3000)
- **BiPromoter ranges** is for ChIP region annotation. It's
  comma-separated two values or a single value can be given. If a
  single value is given, it will be segmented into two equal fractions
  (ie, 5000 is equivalent to 2500,5000) 
- **Relative distance** is the relative distance to TSS/TTS in WIGGLE file
  profiling
- **Genome Annotation Version** is not a separate option: the annotations
  are chosen according to the genome build (dbkey) of the input data set.
  The annotations are downloaded from UCSC genome site.
- **Image type** specify the output image format, either in PNG or in
  PDF format.
- If **Specify gene list in the signal profiling** is set, you can specify
  different gene groups for CEAS to put them together in the profile
  figure. You need to select several **Gene List** files from history which
  contains the RefSeq ids or Gene Symbols for each row, and
  **Gene List Label** for each gene list file.

-----

**Outputs**

- **PNG/PDF file** is the result for CEAS analysis, containing 5 pages.
- **LOG file** is the job log. If you see errors, please attach it to
  the bug report.

-----

**script parameter list of CEAS 0.9.8**

Options:
  --version             show program's version number and exit
  -h, --help            Show this help message and exit.
  -b BED, --bed=BED     BED file of ChIP regions.
  -w WIG, --wig=WIG     WIG file for either wig profiling or genome background
                        annotation. WARNING: --bg flag must be set for genome
                        background re-annotation.
  -e EBED, --ebed=EBED  BED file of extra regions of interest (eg, non-coding
                        regions)
  -g GDB, --gt=GDB      Gene annotation table (eg, a refGene table in sqlite3
                        db format provided through the CEAS web,
                        http://liulab.dfci.harvard.edu/CEAS/download.html).
  --name=NAME           Experiment name. This will be used to name the output
                        files. If an experiment name is not given, the stem of
                        the input BED file name will be used instead (eg, if
                        'peaks.bed', 'peaks' will be used as a name.)
  --sizes=SIZES         Promoter (also downstream) sizes for ChIP region
                        annotation. Comma-separated three values or a single
                        value can be given. If a single value is given, it
                        will be segmented into three equal fractions (ie, 3000
                        is equivalent to 1000,2000,3000), DEFAULT:
                        1000,2000,3000. WARNING: Values > 10000bp are
                        automatically set to 10000bp.
  --bisizes=BISIZES     Bidirectional-promoter sizes for ChIP region
                        annotation Comma-separated two values or a single
                        value can be given. If a single value is given, it
                        will be segmented into two equal fractions (ie, 5000
                        is equivalent to 2500,5000) DEFAULT: 2500,5000bp.
                        WARNING: Values > 20000bp are automatically set to
                        20000bp.
  --bg                  Run genome BG annotation again. WARNING: This flag is
                        effective only if a WIG file is given through -w
                        (--wig). Otherwise, ignored.
  --span=SPAN           Span from TSS and TTS in the gene-centered annotation.
                        ChIP regions within this range from TSS and TTS are
                        considered when calculating the coverage rates in
                        promoter and downstream, DEFAULT=3000bp
  --pf-res=PF_RES       Wig profiling resolution, DEFAULT: 50bp. WARNING:
                        Value smaller than the wig interval (resolution) may
                        cause aliasing error.
  --rel-dist=REL_DIST   Relative distance to TSS/TTS in wig profiling,
                        DEFAULT: 3000bp
  --gn-groups=GN_GROUPS
                        Gene-groups of particular interest in wig profiling.
                        Each gene group file must have gene names in the 1st
                        column. The file names are separated by commas w/ no
                        space (eg, --gn-groups=top10.txt,bottom10.txt)
  --gn-group-names=GN_NAMES
                        The names of the gene groups in --gn-groups. The gene
                        group names are separated by commas. (eg, --gn-group-
                        names='top 10%,bottom 10%'). These group names appear
                        in the legends of the wig profiling plots. If no group
                        names given, the groups are represented as 'Group 1,
                        Group2,...Group n'.
  --gname2              Whether or not use the 'name2' column of the gene
                        annotation table when reading the gene IDs in the
                        files given through --gn-groups. This flag is
                        meaningful only with --gn-groups.

  </help>

</tool>