changeset 3:4cf2808854bc draft

Uploaded
author bonsai
date Fri, 13 Sep 2013 10:01:00 -0400
parents dbb83adec9eb
children ac6be31420fe
files crac-index-wrapper.sh crac-index.xml crac.xml crac_wrapper.sh tool_dependencies.xml
diffstat 5 files changed, 432 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/crac-index-wrapper.sh	Fri Sep 13 10:01:00 2013 -0400
@@ -0,0 +1,50 @@
+#!/bin/sh
+#
+# Galaxy wrapper for crac-index: builds an index and writes an HTML page listing its files.
+#
+# Usage: crac-index-wrapper.sh BASE_FILENAME HTML_REPORT FILES_PATH BUCKET INPUT...
+#   BASE_FILENAME  index name, without extension
+#   HTML_REPORT    file that receives the HTML page
+#   FILES_PATH     directory that receives the index files (created if missing)
+#   BUCKET         bucket size, handed to crac-index through -b
+#   INPUT...       remaining arguments, forwarded to crac-index unchanged
+
+# Recovering parameters from crac-index.xml
+###############################################################
+CRAC_INDEX_BINARY=crac-index
+[ "$#" -ge 5 ] || { echo "Usage: $0 BASE_FILENAME HTML_REPORT FILES_PATH BUCKET INPUT..." >&2; exit 2; }
+BASE_FILENAME="$1"
+HTML_REPORT="$2"
+FILES_PATH="$3"
+BUCKET="$4"
+shift 4
+
+mkdir -p "$FILES_PATH" || exit
+
+# Execution of the command line
+###############################################################
+# Every argument is passed as its own quoted word, so paths containing spaces
+# survive. Standard output of crac-index is discarded; stderr is left untouched.
+# A failing crac-index aborts the wrapper with the same exit status.
+"$CRAC_INDEX_BINARY" index "$FILES_PATH/$BASE_FILENAME" -b "$BUCKET" "$@" > /dev/null || exit
+
+# Writing the HTML page that lists the two files of the composite index
+###############################################################
+cat > "$HTML_REPORT" << MARINE
+<html>
+  <head>
+    <title>Files for Crac Index (crac_index)</title>
+  </head>
+  <body>
+    This index is composed of the following files:
+    <p/>
+    <ul>
+      <li><a href="$BASE_FILENAME.conf" type="text/plain">$BASE_FILENAME.conf</a></li>
+      <li><a href="$BASE_FILENAME.ssa" type="application/binary">$BASE_FILENAME.ssa</a></li>
+    </ul>
+  </body>
+</html>
+MARINE
+
+exit 0
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/crac-index.xml	Fri Sep 13 10:01:00 2013 -0400
@@ -0,0 +1,64 @@
+<tool id="crac-index" name="CRAC index">
+
+   <description>Create genome indexes available to be used with CRAC mapping/annotation tool </description>
+
+   <command interpreter="bash">
+     crac-index-wrapper.sh "$output_name" "$output" "$output.files_path" "$bucket" "$input_file"
+   </command> <!-- interpreter: the wrapper is run from the tool directory instead of being looked up on PATH -->
+
+   <inputs>
+      <param name="input_file" type="data" label="Source file" format="fasta" help="You must choose a fasta file containing the genome"/>
+      <param name="output_name" type="text" value ="IndexOutput" label="Output name" help="Name must be different from 'index' word, otherwise CRAC-index will fail." />
+      <param name="bucket" type="integer" value="100000000" label="Bucket size" help="The size of the bucket for the index construction (default 100000000)."/>
+   </inputs>
+  
+   <outputs>
+      <data name="output" format="crac_index" label="${output_name}.crac-index" />
+   </outputs>
+
+   <help>
+
+**What it does**
+
+Crac-index generates an indexed genome from a fasta file. This is especially useful for the Crac mapping/annotation tool.
+
+----------------------
+
+**Input Formats**
+
+Crac-index takes as input files any fasta or multi-fasta files.
+
+----------------------
+
+**Outputs**
+
+Crac-index on Galaxy produces a composite output named crac-index, which is made of an ssa file and a conf file. Both are required to use your index.
+
+----------------------
+
+**Crac-index settings**
+
+
+Usage : ./crac-index [options] command output_file input_file
+
+  command must be :
+    index: create an index on the specified input file(s).
+
+  options can be :
+
+  -b bucket_size	 the size of the bucket for the index construction
+                  	 (default 100000000)
+  -d diff-cover 	 parameter for the index construction (default 1024)
+  -v              	 verbose mode
+
+  Examples: 
+		./crac-index index myIndex sequence1.fa sequence2.fa sequence3.fa
+			You can specify FASTA or MultiFASTA file(s).
+			In this example, two files will be created:
+			- myIndex.ssa (index storing the compressed sequences)
+			- myIndex.conf (information on sequence names and length)
+
+   </help>
+
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/crac.xml	Fri Sep 13 10:01:00 2013 -0400
@@ -0,0 +1,265 @@
+<?xml version="1.0" encoding="utf-8"?>
+<tool id="crac" name="CRAC" version="1.0">
+  <requirements>
+    <requirement type='package' version="1.3.0">crac</requirement>
+  </requirements>
+   <description>Analyzing RNAs in high-throughput sequencing data</description>
+   <command interpreter="bash"> crac_wrapper.sh
+              #if $Genome.which_genome == "prebuilt"
+   <!--1-->      "$Genome.prebuilt_genome.fields.path"
+              #else
+   <!--1-->      "$Genome.index_input.extra_files_path"
+              #end if
+   <!--2-->   $output_name.extra_files_path		<!-- Must stay second: crac_wrapper.sh drops the first two arguments, so options such as --gz go after it -->
+              #if str($condi_compressed) == "yes"
+                --gz
+              #end if
+              -r $input -k $kmer_length --read-length $read_length --sam $output_name
+              #if $condi_deep_snp.deepSNP == "yes"
+                 --deep-snv --nb-nucleotides-snv-comparison $condi_deep_snp.nb_nucleotides_snp_comparison
+              #end if
+              #if $choixSettings.settings == "experimental"
+                --max-splice-length $choixSettings.max_splice_length
+                --max-bio-indel $choixSettings.max_bio_indel
+                --max-bases-retrieved $choixSettings.max_bases_retrieved
+                --min-duplication $choixSettings.min_duplication
+                --max-duplication $choixSettings.max_duplication
+                --min-percent-single-loc $choixSettings.min_percent_single_loc
+                --min-percent-duplication-loc $choixSettings.min_percent_duplication_loc
+                --max-bases-randomly-matched $choixSettings.max_bases_randomly_matched
+                --max-extension-length $choixSettings.max_extension_length
+                --min-support-no-cover $choixSettings.min_support_no_cover
+                --min-break-length $choixSettings.min_break_length
+              #end if
+              #if str($detailed_sam) == "yes"
+                --detailed-sam
+              #end if
+   </command>
+
+   <inputs>
+      <!-- Normal Setting -->
+
+      <conditional name="Genome">			<!-- Conditional 3 (Which genome) -->
+      <param name="which_genome" type="select" label="Do you want to use a pre-built reference genome or a Crac-index generated genome from your history?" help="Pre-built reference genomes are generated by Crac-index.">
+         <option value="prebuilt"> Use a pre-built reference genome </option>
+         <option value="history"> Use a Crac-index generated genome from my history</option>
+      </param>
+
+      <when value="prebuilt">
+         <param name="prebuilt_genome" type="select" label="Select a reference genome" help="if your genome of interest is not listed - contact authors">
+            <options from_data_table="crac_indexes">
+            <filter type="sort_by" column="2" />
+            <validator type="no_options" message="No indexes are available" />
+            </options> 
+         </param>
+      </when>
+
+      <when value="history">
+         <param name="index_input" format="crac_index" type="data" label="Reference Genome" help="Select an indexed Genome from your history"/>
+      </when>
+      </conditional>
+
+
+      <param name="input" format="txt,raw,fastq" type="data" label="Reads File" help="Select a file"/>   
+      <param name="kmer_length" type="integer" min="12" max="32" value="21" label="k-mer length"> 
+         <help>k-mer length must be carefully chosen. A k-mer of that length must map to a unique location in the genome with a high probability. Recommended value for the human genome: 22</help>  
+      </param> 
+      <param name="read_length" type="integer" label="Read length" value="0" help="Set read length when all reads have the same length to dramatically increase computation speed. Default value (no read length considered) : 0. Note : If read length is set, shorter reads will be ignored, longer reads will be cut."/>
+      <param name="detailed_sam" type="select" label="Do you want a detailed sam output file ?" help="Detailed sam output file gives you information on the SNPs, Splice junctions, Sequencing errors, Chimeras, ..." >
+         <option value="no"> No, I do not want detailed sam output file </option>
+         <option value="yes"> Yes, I want detailed sam output file </option>
+         <!-- Plain select, not a conditional: it takes no <when> children.
+              The command template tests str($detailed_sam) directly. -->
+      </param>
+      <param name="condi_compressed" type="select" display="radio" label="Compress output files?">
+        <option value="yes">Yes</option>
+        <option value="no">No</option>
+      </param>
+      <conditional name="condi_deep_snp">		<!-- Conditional 1 -->
+		<param type="select" name="deepSNP" label="Search hard for SNPs?">
+			<option value="no" selected="true"> No, do not search hard for SNVs </option>
+			<option value="yes"> Yes, search hard for SNVs (takes more time) </option>
+ 		</param>
+		<when value="yes">
+                   <param name="nb_nucleotides_snp_comparison" type="integer" value="8" label="Number of nucleotides for SNV comparison" help="Default value for human genome : 8. A smaller value will find more SNVs, but will be less accurate."/>
+                </when>
+		<when value="no"/> 			<!-- Suppress warnings -->
+      </conditional>					<!-- End Conditional 1 -->   
+   
+    <!-- Experimental Setting-->
+    <conditional name="choixSettings">			<!-- Conditional 2 (setting choice) -->
+        <param name="settings" type="select" label="Advanced CRAC settings to use" help="If you want full control to optimize your experience, use Advanced Settings. Be careful, these settings are experimental and one single change can make Crac fail">
+            <option value="normal" selected="true"> Normal settings </option>
+            <option value="experimental"> Advanced Settings </option>
+        </param>
+        <when value="normal"/>				<!-- Suppress warnings -->
+        <when value="experimental">			<!-- Tuning parameters offered with the advanced settings -->
+           <param name="max_splice_length" type="integer" value="300000" label="Maximum splice length" help="Splices larger than this value, will not be considered as splices, but (if possible) as chimeras. Default value for human genome : 300,000 bp." />
+           <param name="max_bio_indel" type="integer" value="15" label="Maximum indel length. Larger indels will be considered as splice junctions" help="Default value for human genome : 15 bp." />
+           <param name="min_duplication" type="integer" value="2" label="Minimum duplication occurrence" help=" Minimum number of k-mer occurrences in the genome to be considered as duplicated. Default value for human genome : 2." />
+           <param name="max_duplication" type="integer" value="9" label="Maximum duplication occurrence" help=" Maximum number of k-mer occurrences to be considered as duplicated. Default value for human genome : 9." />
+           <param name="min_percent_single_loc" type="float" value="0.15" label="Minimum unique location percentage" help=" Minimal percentage of k-mers that must be unique in the genome, to consider the read as unique. Default value for human genome : 0.15." />
+           <param name="min_percent_duplication_loc" type="float" value="0.20" label="Minimum duplicated location percentage" help=" Minimal percentage of k-mers that must be duplicated in the genome, to consider the read as duplicated. Default value for human genome : 0.20." /> <!-- NOTE(review): the crac usage text in this tool's help lists DEFAULT 0.15 for this option; confirm -->
+           <!--param name="min_percent_multiple_loc" type="float" value="0.20" label="Minimum percent multiple localisation" help=" HELP. Default value for human genome : 0.20." /-->
+           <param name="max_bases_randomly_matched" type="integer" value="10" label="Maximum bases randomly matched" help=" Maximum number of bases that can be considered as randomly matched. Default value for human genome : 10." />
+           <param name="max_bases_retrieved" type="integer" value="10" label="Maximum bases retrieved" help=" Maximum number of bases retrieved from the genome when outputting deletions. Default value for human genome : 10." /> <!-- NOTE(review): the crac usage text in this tool's help lists DEFAULT 15 for this option; confirm -->
+           <param name="max_extension_length" type="integer" value="10" label="Maximum extension length" help=" Maximal number of nucleotides visited to extend a break and to make sure that the location is consistent. Default value for human genome : 10." />
+           <param name="min_support_no_cover" type="float" value="1.30" label="Minimum support no cover" help=" Average coverage along the read to consider it as not covered. Default value for human genome : 1.30." />
+           <param name="min_break_length" type="float" value="0.5" label="Minimum break length" help=" Breaks shorter than this ratio times the k-mer length will be considered as too short and will be merged if necessary. Default value for human genome : 0.5." />
+        </when>						<!-- End "when experimental" -->
+    </conditional>					<!-- End Conditional 2 -->
+   </inputs>
+
+   <outputs>
+	<data name="output_name" format="sam" label="${tool.name} on ${on_string}: mapped reads" />
+   </outputs>
+
+   <tests>
+      <test>
+      </test>
+   </tests>
+
+   <help>
+**What it does**
+
+CRAC proposes a novel way of analyzing reads that integrates genomic locations
+and local coverage, and delivers all above mentioned predictions in a single
+step. CRAC uses a double k-mer profiling approach to detect candidate
+mutations, indels, splice or fusion junctions in each single read.
+
+.. _CRAC: http://crac.gforge.inria.fr/
+
+If you use this tool, please cite: 
+  - Philippe N., Salson M., Commes T., Rivals E., `"CRAC: an integrated approach to the analysis of RNA-seq reads"`__, Genome Biology (2013), 14:R30, doi: 10.1186/gb-2013-14-3-r30.
+
+.. __: http://genomebiology.com/2013/14/3/R30/
+
+------
+
+**Input formats**
+
+CRAC accepts files in FASTA, FASTQ or any text format (txt, raw, ...). 
+
+------
+
+**Output**
+
+The output is in SAM format. If you choose the detailed SAM output, CRAC adds several flags that give more information. You can see the details here: http://crac.gforge.inria.fr/index.php?id=sam-documentation
+
+
+------
+
+**Crac settings**
+
+Main options are displayed at the top of the page. If you are an experienced user, you can choose to display
+the full set of Crac settings.  Most of the options in Crac have been implemented here.
+
+------
+crac 1.3.0      Compiled on Sep 13 2013.
+
+   -h, --help           <none>          print this help and exit
+   -f, --full-help      <none>          print a complete help and exit
+   -v                   <none>          print version and exit
+
+Mandatory arguments
+   -i                   <FILE>          set genome index file (without the extension filename)
+   -r                   <FILE> [FILE2]  set read file. Specify FILE2 in case of paired-end reads
+   -k                   <INT>           set k-mer length
+   -o, --sam            <FILE>          set SAM output filename or print on STDOUT with "-o -" argument
+
+Optional arguments
+  * Protocol
+   --stranded           <none>          set the read mapping with for a strand specific library (DEFAULT non-strand specific)
+
+  * Efficiency
+   --nb-threads         <INT>           set the number of worker threads (DEFAULT 1)
+   --read-length, -m    <INT>           set read length in case of all reads have the same length to optimize
+                                        CPU and memory times
+   --treat-multiple     <none>          consider alignments with multiple locations (>max-duplication) rather than considering a no-alignment in the SAM file
+   --max-locs           <INT>           set the maximum number of locations on the reference index (DEFAULT 300)
+
+  * Accuracy
+   --no-ambiguity       <none>          discard biological events (splice, snv, indel, chimera) which have several matches on the reference index
+
+
+Optional output arguments
+   --all                              <FILE>     set output base filename for all causes following
+   --gz                               <none>     all output files specified after this argument are gzipped
+
+  * Summary and statistics
+   --summary                          <FILE>     set output summary file
+  * Mapping
+   --single                           <FILE>     set output single file
+   --duplicate                        <FILE>     set output duplication file
+   --multiple                         <FILE>     set output multiple file
+   --none                             <FILE>     set output none file
+   --normal                           <FILE>     set output normal file
+   --almost-normal                    <FILE>     set output almost normal file
+
+  * Biological causes
+   --snv                              <FILE>     set output SNV file
+   --indel                            <FILE>     set output short indel file
+   --splice                           <FILE>     set output splice junction file
+   --weak-splice                      <FILE>     set output coverless splice junction file
+   --chimera                          <FILE>     set output chimera junction file
+   --paired-end-chimera               <FILE>     set output for paired-end chimera file
+   --biological                       <FILE>     set output bio-undetermined file
+
+  * Sequence errors
+   --errors                           <FILE>     set output sequence errors file
+
+  * Repetition
+   --repeat                           <FILE>     set output repetition file
+
+  * Other causes
+   --undetermined                     <FILE>     set output undetermined file
+   --nothing                          <FILE>     set output nothing file
+
+Optional process for specific research
+   --deep-snv                         <none>     will search hard to find SNPs
+   --stringent-chimera                <none>     will search chimeras with more accuracy (but less sensitivity)
+
+Optional process launcher (once must be selected)
+  * Exact matching tool
+   --emt                              <none>     launch CRAC-emt for exact mapping of short reads
+
+  * Server tool (for debugging) 
+   --server                           <none>     launch CRAC server,the output arguments will
+                                                 not be taken into account
+   --input-name-server                <STRING>   DEFAULT classify.fifo
+   --output-name-server               <STRING>   DEFAULT classify.out.fifo
+
+Additional settings for users
+  * Sam output file
+   --detailed-sam                     <none>     more informations are added in SAM output file
+
+  * Mapping
+   --min-percent-single-loc           <FLOAT>    DEFAULT 0.15
+   --min-duplication                  <INT>      DEFAULT 2
+   --max-duplication                  <INT>      DEFAULT 9
+   --min-percent-duplication-loc      <FLOAT>    DEFAULT 0.15
+   --min-percent-multiple-loc         <FLOAT>    DEFAULT 0.50
+   --min-repetition                   <INT>      DEFAULT 20
+   --min-percent-repetition-loc       <FLOAT>    DEFAULT 0.20
+  * Biological causes
+   --max-splice-length                <INT>      DEFAULT 300000
+   --max-paired-end-length            <INT>      DEFAULT 300000
+   --max-bio-indel                    <INT>      DEFAULT 15
+   --max-bases-retrieved              <INT>      DEFAULT 15
+  * Undetermined
+   --min-support-no-cover             <FLOAT>    DEFAULT 1.30
+
+Additional settings for advanced users
+  * Break verification and fusion (merging mirage breaks)
+   --min-break-length                 <FLOAT> DEFAULT 0.50
+   --max-bases-randomly-matched       <INT>   DEFAULT 10
+   --max-extension-length             <INT>   DEFAULT 10
+
+  * Threading
+   --nb-tags-info-stored              <INT>   DEFAULT 1000
+
+  * Deep SNV search option
+   --nb-nucleotides-snv-comparison    <INT>   DEFAULT 8
+   </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/crac_wrapper.sh	Fri Sep 13 10:01:00 2013 -0400
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+# Recovering special parameters from crac.xml
+###############################################################
+CRAC_BINARY=crac
+INDEX_INPUT="$1"
+
+# Getting the indexed genome value
+###############################################################
+# Getting the indexed Genome name without the extension
+if [ -d "$INDEX_INPUT" ]; then				# If $INDEX_INPUT is a directory (that is to say an index from the history)
+  cpt=0
+  for fichier in $INDEX_INPUT/*.ssa
+    do
+      if [ $((++cpt)) -gt 1 ]; then 			#More than 1 '.ssa' file is not expected
+        echo "Warning:Multiple indexes found [$INDEX]" >&2
+      fi
+    INDEX=${fichier%%.ssa}				#Getting the index from history
+    done
+  else
+    INDEX="$INDEX_INPUT"				#Getting the prebuilt index
+fi
+if [ ! -f "$INDEX.ssa" -a ! -f "$INDEX.conf" ]; then	#Both '.ssa' and '.conf' files are required
+  echo "Error:Index not found [$INDEX]" >&2
+  exit 1
+fi
+
+# Execution of the command line (Submiting job to the cluster)
+###############################################################
+shift 2					#Avoiding index_input and output_name.extra_files_path
+
+CRAC_CMD_LINE=""$CRAC_BINARY" -i "$INDEX" "$@""
+
+out=`$CRAC_CMD_LINE`
+
+exit 0
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Fri Sep 13 10:01:00 2013 -0400
@@ -0,0 +1,16 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="crac" version="1.3.0">
+        <install version="1.0">
+            <actions> <!-- Build in the download directory, install into $INSTALL_DIR, then expose its bin/ on PATH -->
+                <action type="download_by_url">https://gforge.inria.fr/frs/download.php/32471/crac-1.3.0.tar.gz</action>
+                <action type="shell_command">./configure --prefix=$INSTALL_DIR</action>
+                <action type="shell_command">make &amp;&amp; make check &amp;&amp; make install</action>
+                <action type="set_environment"><environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable></action>
+            </actions>
+        </install>
+        <readme>
+CRAC requires g++ 4.3 or later. 
+	</readme>
+    </package>
+</tool_dependency>