Mercurial > repos > bonsai > crac
changeset 3:4cf2808854bc draft
Uploaded
author | bonsai |
---|---|
date | Fri, 13 Sep 2013 10:01:00 -0400 |
parents | dbb83adec9eb |
children | ac6be31420fe |
files | crac-index-wrapper.sh crac-index.xml crac.xml crac_wrapper.sh tool_dependencies.xml |
diffstat | 5 files changed, 432 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/bin/sh
# File: crac-index-wrapper.sh
#
# Galaxy wrapper around "crac-index index".
#
# Usage:
#   crac-index-wrapper.sh BASE_FILENAME HTML_REPORT FILES_PATH BUCKET INPUT...
#
#   BASE_FILENAME  base name of the index files to create
#   HTML_REPORT    path of the HTML file listing the generated index files
#   FILES_PATH     directory that receives the index files (created if missing)
#   BUCKET         value passed to crac-index with -b
#   INPUT...       remaining arguments, forwarded untouched to crac-index
#
# Exit status: 0 on success, 2 on a usage error, otherwise the status of the
# first command that failed.

# Recovering parameters from crac-index.xml
###############################################################
CRAC_INDEX_BINARY=crac-index

if [ "$#" -lt 4 ]; then
    echo "Usage: $0 BASE_FILENAME HTML_REPORT FILES_PATH BUCKET INPUT..." >&2
    exit 2
fi

BASE_FILENAME="$1"
HTML_REPORT="$2"
FILES_PATH="$3"
BUCKET="$4"

# Drop the four wrapper-specific arguments; "$@" now holds the input file(s).
shift 4

mkdir -p "$FILES_PATH" || exit $?

# Execution of the command line
###############################################################
# The command is run directly (not stored in a string and re-split) so that
# paths containing spaces survive, and its exit status is checked so that a
# failed indexing run is not reported as a success.
"$CRAC_INDEX_BINARY" index "$FILES_PATH/$BASE_FILENAME" -b "$BUCKET" "$@"
status=$?
if [ "$status" -ne 0 ]; then
    echo "Error: $CRAC_INDEX_BINARY exited with status $status" >&2
    exit "$status"
fi

# HTML report listing the files of the composite dataset
###############################################################
# The delimiter is unquoted on purpose: $BASE_FILENAME must be expanded.
cat > "$HTML_REPORT" << MARINE
<html>
  <head>
    <title>Files for Crac Index (crac_index)</title>
  </head>
  <body>
    This index is composed of the following files:
    <p/>
    <ul>
      <li><a href="$BASE_FILENAME.conf" type="text/plain">$BASE_FILENAME.conf</a></li>
      <li><a href="$BASE_FILENAME.ssa" type="application/binary">$BASE_FILENAME.ssa</a></li>
    </ul>
  </body>
</html>
MARINE
status=$?
if [ "$status" -ne 0 ]; then
    echo "Error: could not write the HTML report [$HTML_REPORT]" >&2
    exit "$status"
fi

exit 0
<?xml version="1.0" encoding="utf-8"?>
<!-- File: crac-index.xml : Galaxy tool definition wrapping crac-index. -->
<tool id="crac-index" name="CRAC index">

    <description>Create genome indexes available to be used with CRAC mapping/annotation tool</description>

    <!-- NOTE(review): assumes the crac package is what provides the crac-index binary; confirm. -->
    <requirements>
        <requirement type="package" version="1.3.0">crac</requirement>
    </requirements>

    <!--
        Wrapper arguments, in order: index base name, HTML report path,
        directory of the composite dataset, bucket size, FASTA input.
        The interpreter attribute makes Galaxy run the wrapper shipped in the
        tool directory, as crac.xml does for crac_wrapper.sh.
    -->
    <command interpreter="bash">crac-index-wrapper.sh "$output_name" "$output" "$output.files_path" "$bucket" "$input_file"</command>

    <inputs>
        <param name="input_file" type="data" label="Source file" format="fasta" help="You must choose a fasta file containing the genome"/>
        <param name="output_name" type="text" value="IndexOutput" label="Output name" help="Name must be different from 'index' word, otherwise CRAC-index will fail."/>
        <param name="bucket" type="integer" value="100000000" label="Bucket size" help="The size of the bucket for the index construction (default 100000000)."/>
    </inputs>

    <outputs>
        <data name="output" format="crac_index" label="${output_name}.crac-index"/>
    </outputs>

    <help>

**What it does**

Crac-index generates an indexed genome from a fasta file. This is especially useful for the Crac mapping/annotation tool.

----------------------

**Input Formats**

Crac-index takes as input files any fasta or multi-fasta files.

----------------------

**Outputs**

Crac-index on Galaxy produces a composite output named crac-index, which is made of a ssa file and a conf file. Both are required to use your index.

----------------------

**Crac-index settings**

::

    Usage : ./crac-index [options] command output_file input_file

      command must be :
        index: create an index on the specified input file(s).

      options can be :

        -b bucket_size    the size of the bucket for the index construction
                          (default 100000000)
        -d diff-cover     parameter for the index construction (default 1024)
        -v                verbose mode

      Examples:
        ./crac-index index myIndex sequence1.fa sequence2.fa sequence3.fa
        You can specify FASTA or MultiFASTA file(s).

        In this example, two files will be created:
          - myIndex.ssa (index storing the compressed sequences)
          - myIndex.conf (information on sequence names and length)

    </help>

</tool>
<?xml version="1.0" encoding="utf-8"?>
<!-- File: crac.xml : Galaxy tool definition for the CRAC read mapper. -->
<tool id="crac" name="CRAC" version="1.0">
    <requirements>
        <requirement type="package" version="1.3.0">crac</requirement>
    </requirements>
    <description>Analyzing RNAs in high-throughput sequencing data</description>

    <!--
        crac_wrapper.sh consumes its first two positional arguments and removes
        them with "shift 2":
          1. the index: either the path of a pre-built index or the
             extra_files_path of an index dataset taken from the history;
          2. the extra_files_path of the output dataset.
        Everything after argument 2 is forwarded to crac unchanged, so every
        crac option (the gz flag included) has to be emitted after it. The gz
        flag also has to stay in front of the sam option: the crac help below
        states that it only affects outputs named after it.
    -->
    <command interpreter="bash">crac_wrapper.sh
        #if str($Genome.which_genome) == "prebuilt"
            "$Genome.prebuilt_genome.fields.path"
        #else
            "$Genome.index_input.extra_files_path"
        #end if
        "$output_name.extra_files_path"
        #if str($condi_compressed) == "yes"
            --gz
        #end if
        -r "$input" -k $kmer_length --read-length $read_length --sam "$output_name"
        #if str($condi_deep_snp.deepSNP) == "yes"
            --deep-snv --nb-nucleotides-snv-comparison $condi_deep_snp.nb_nucleotides_snp_comparison
        #end if
        #if str($choixSettings.settings) == "experimental"
            --max-splice-length $choixSettings.max_splice_length
            --max-bio-indel $choixSettings.max_bio_indel
            --min-duplication $choixSettings.min_duplication
            --max-duplication $choixSettings.max_duplication
            --min-percent-single-loc $choixSettings.min_percent_single_loc
            --min-percent-duplication-loc $choixSettings.min_percent_duplication_loc
            --max-bases-randomly-matched $choixSettings.max_bases_randomly_matched
            --max-bases-retrieved $choixSettings.max_bases_retrieved
            --max-extension-length $choixSettings.max_extension_length
            --min-support-no-cover $choixSettings.min_support_no_cover
            --min-break-length $choixSettings.min_break_length
        #end if
        #if str($detailed_sam) == "yes"
            --detailed-sam
        #end if
    </command>

    <inputs>
        <!-- Normal settings -->

        <conditional name="Genome"> <!-- Conditional 3 (which genome) -->
            <param name="which_genome" type="select"
                   label="Do you want to use a pre-built reference genome or a Crac-index generated genome from your history?"
                   help="Pre-built reference genomes are generated by Crac-index.">
                <option value="prebuilt">Use a pre-built reference genome</option>
                <option value="history">Use a Crac-index generated genome from my history</option>
            </param>

            <when value="prebuilt">
                <param name="prebuilt_genome" type="select" label="Select a reference genome" help="if your genome of interest is not listed - contact authors">
                    <options from_data_table="crac_indexes">
                        <filter type="sort_by" column="2"/>
                        <validator type="no_options" message="No indexes are available"/>
                    </options>
                </param>
            </when>

            <when value="history">
                <param name="index_input" format="crac_index" type="data" label="Reference Genome" help="Select an indexed Genome from your history"/>
            </when>
        </conditional>

        <param name="input" format="txt,raw,fastq" type="data" label="Reads File" help="Select a file"/>
        <param name="kmer_length" type="integer" min="12" max="32" value="21" label="k-mer length"
               help="k-mer length must be carefully chosen. A k-mer of that length must map to a unique location in the genome with a high probability. Recommended value for the human genome: 22"/>
        <param name="read_length" type="integer" label="Read length" value="0"
               help="Set read length when all reads have the same length to dramatically increase computation speed. Default value (no read length considered) : 0. Note : If read length is set, shorter reads will be ignored, longer reads will be cut."/>
        <!-- A plain select: "when" elements only belong inside a conditional, so none are listed here. -->
        <param name="detailed_sam" type="select" label="Do you want a detailed sam output file ?"
               help="Detailed sam output file gives you information on the SNPs, Splice junctions, Sequencing errors, Chimeras, ...">
            <option value="no">No, I do not want detailed sam output file</option>
            <option value="yes">Yes, I want detailed sam output file</option>
        </param>
        <param name="condi_compressed" type="select" display="radio" label="Compress output files?">
            <option value="yes">Yes</option>
            <option value="no">No</option>
        </param>

        <conditional name="condi_deep_snp"> <!-- Conditional 1 -->
            <param type="select" name="deepSNP" label="Search hard for SNPs?">
                <option value="no" selected="true">No, do not search hard for SNVs</option>
                <option value="yes">Yes, search hard for SNVs (takes more time)</option>
            </param>
            <when value="yes">
                <param name="nb_nucleotides_snp_comparison" type="integer" value="8" label="Number of nucleotides for SNV comparison" help="Default value for human genome : 8. A smaller value will find more SNVs, but will be less accurate."/>
            </when>
            <when value="no"/> <!-- Suppress warnings -->
        </conditional> <!-- End Conditional 1 -->

        <!-- Experimental settings -->
        <conditional name="choixSettings"> <!-- Conditional 2 (setting choice) -->
            <param name="settings" type="select" label="Advanced CRAC settings to use" help="If you want full control to optimize your experience, use Advanced Settings. Be careful, these settings are experimental and one single change can make Crac fail">
                <option value="normal" selected="true">Normal settings</option>
                <option value="experimental">Advanced Settings</option>
            </param>
            <when value="normal"/> <!-- Suppress warnings -->
            <when value="experimental">
                <param name="max_splice_length" type="integer" value="300000" label="Maximum splice length" help="Splices larger than this value, will not be considered as splices, but (if possible) as chimeras. Default value for human genome : 300,000 bp."/>
                <param name="max_bio_indel" type="integer" value="15" label="Maximum indel length" help="Larger indels will be considered as splice junctions. Default value for human genome : 15 bp."/>
                <param name="min_duplication" type="integer" value="2" label="Minimum duplication occurrence" help="Minimum number of k-mer occurrences in the genome to be considered as duplicated. Default value for human genome : 2."/>
                <param name="max_duplication" type="integer" value="9" label="Maximum duplication occurrence" help="Maximum number of k-mer occurrences to be considered as duplicated. Default value for human genome : 9."/>
                <param name="min_percent_single_loc" type="float" value="0.15" label="Minimum unique location percentage" help="Minimal percentage of k-mers that must be unique in the genome, to consider the read as unique. Default value for human genome : 0.15."/>
                <!-- NOTE(review): the crac help below lists 0.15 as the default of this option; confirm that 0.20 is intended. -->
                <param name="min_percent_duplication_loc" type="float" value="0.20" label="Minimum duplicated location percentage" help="Minimal percentage of k-mers that must be duplicated in the genome, to consider the read as duplicated. Default value for human genome : 0.20."/>
                <!--param name="min_percent_multiple_loc" type="float" value="0.20" label="Minimum percent multiple localisation" help=" HELP. Default value for human genome : 0.20." /-->
                <param name="max_bases_randomly_matched" type="integer" value="10" label="Maximum bases randomly matched" help="Maximum number of bases that can be considered as randomly matched. Default value for human genome : 10."/>
                <!--
                    This value used to be collected but never passed to crac, so crac
                    always ran with its own default. It is now forwarded; the form
                    default is set to 15, the default listed in the crac help below,
                    so that an untouched form behaves exactly as before.
                -->
                <param name="max_bases_retrieved" type="integer" value="15" label="Maximum bases retrieved" help="Maximum number of bases retrieved from the genome when outputting deletions. Default value : 15."/>
                <param name="max_extension_length" type="integer" value="10" label="Maximum extension length" help="Maximal number of nucleotides visited to extend a break and to make sure that the location is consistent. Default value for human genome : 10."/>
                <param name="min_support_no_cover" type="float" value="1.30" label="Minimum support no cover" help="Average coverage along the read to consider it as not covered. Default value for human genome : 1.30."/>
                <param name="min_break_length" type="float" value="0.5" label="Minimum break length" help="Breaks shorter than this ratio times the k-mer length will be considered as too short and will be merged if necessary. Default value for human genome : 0.5."/>
            </when> <!-- End "when experimental" -->
        </conditional> <!-- End Conditional 2 -->
    </inputs>

    <outputs>
        <data name="output_name" format="sam" label="${tool.name} on ${on_string}: mapped reads"/>
    </outputs>

    <tests>
        <test>
        </test>
    </tests>

    <!-- CDATA: the option list below is full of placeholders written with angle brackets. -->
    <help><![CDATA[
**What it does**

CRAC proposes a novel way of analyzing reads that integrates genomic locations
and local coverage, and delivers all above mentioned predictions in a single
step. CRAC uses a double k-mer profiling approach to detect candidate
mutations, indels, splice or fusion junctions in each single read.

.. _CRAC: http://crac.gforge.inria.fr/

If you use this tool, please cite:

 - Philippe N., Salson M., Commes T., Rivals E., `"CRAC: an integrated approach to the analysis of RNA-seq reads"`__, Genome Biology (2013), 14:R30, doi: 10.1186/gb-2013-14-3-r30.

.. __: http://genomebiology.com/2013/14/3/R30/

------

**Input formats**

CRAC accepts files in FASTA, FASTQ or any text format (txt, raw, ...).

------

**Output**

The output is in SAM format. If you choose the detailed SAM output, CRAC adds several flags that give more information. You can see the details here: http://crac.gforge.inria.fr/index.php?id=sam-documentation

------

**Crac settings**

Main options are displayed at the top of the page. If you are an experienced user, you can choose to display
the full set of Crac settings. Most of the options in Crac have been implemented here.

------

Command-line options of crac 1.3.0 (compiled on Sep 13 2013)::

    -h, --help <none>                        print this help and exit
    -f, --full-help <none>                   print a complete help and exit
    -v <none>                                print version and exit

    Mandatory arguments
      -i <FILE>                              set genome index file (without the extension filename)
      -r <FILE> [FILE2]                      set read file. Specify FILE2 in case of paired-end reads
      -k <INT>                               set k-mer length
      -o, --sam <FILE>                       set SAM output filename or print on STDOUT with "-o -" argument

    Optional arguments
      * Protocol
      --stranded <none>                      set the read mapping with for a strand specific library (DEFAULT non-strand specific)

      * Efficiency
      --nb-threads <INT>                     set the number of worker threads (DEFAULT 1)
      --read-length, -m <INT>                set read length in case of all reads have the same length to optimize
                                             CPU and memory times
      --treat-multiple <none>                consider alignments with multiple locations (>max-duplication) rather than considering a no-alignment in the SAM file
      --max-locs <INT>                       set the maximum number of locations on the reference index (DEFAULT 300)

      * Accuracy
      --no-ambiguity <none>                  discard biological events (splice, snv, indel, chimera) which have several matches on the reference index


    Optional output arguments
      --all <FILE>                           set output base filename for all causes following
      --gz <none>                            all output files specified after this argument are gzipped

      * Summary and statistics
      --summary <FILE>                       set output summary file
      * Mapping
      --single <FILE>                        set output single file
      --duplicate <FILE>                     set output duplication file
      --multiple <FILE>                      set output multiple file
      --none <FILE>                          set output none file
      --normal <FILE>                        set output normal file
      --almost-normal <FILE>                 set output almost normal file

      * Biological causes
      --snv <FILE>                           set output SNV file
      --indel <FILE>                         set output short indel file
      --splice <FILE>                        set output splice junction file
      --weak-splice <FILE>                   set output coverless splice junction file
      --chimera <FILE>                       set output chimera junction file
      --paired-end-chimera <FILE>            set output for paired-end chimera file
      --biological <FILE>                    set output bio-undetermined file

      * Sequence errors
      --errors <FILE>                        set output sequence errors file

      * Repetition
      --repeat <FILE>                        set output repetition file

      * Other causes
      --undetermined <FILE>                  set output undetermined file
      --nothing <FILE>                       set output nothing file

    Optional process for specific research
      --deep-snv <none>                      will search hard to find SNPs
      --stringent-chimera <none>             will search chimeras with more accuracy (but less sensitivity)

    Optional process launcher (once must be selected)
      * Exact matching tool
      --emt <none>                           launch CRAC-emt for exact mapping of short reads

      * Server tool (for debugging)
      --server <none>                        launch CRAC server,the output arguments will
                                             not be taken into account
      --input-name-server <STRING>           DEFAULT classify.fifo
      --output-name-server <STRING>          DEFAULT classify.out.fifo

    Additional settings for users
      * Sam output file
      --detailed-sam <none>                  more informations are added in SAM output file

      * Mapping
      --min-percent-single-loc <FLOAT>       DEFAULT 0.15
      --min-duplication <INT>                DEFAULT 2
      --max-duplication <INT>                DEFAULT 9
      --min-percent-duplication-loc <FLOAT>  DEFAULT 0.15
      --min-percent-multiple-loc <FLOAT>     DEFAULT 0.50
      --min-repetition <INT>                 DEFAULT 20
      --min-percent-repetition-loc <FLOAT>   DEFAULT 0.20
      * Biological causes
      --max-splice-length <INT>              DEFAULT 300000
      --max-paired-end-length <INT>          DEFAULT 300000
      --max-bio-indel <INT>                  DEFAULT 15
      --max-bases-retrieved <INT>            DEFAULT 15
      * Undetermined
      --min-support-no-cover <FLOAT>         DEFAULT 1.30

    Additional settings for advanced users
      * Break verification and fusion (merging mirage breaks)
      --min-break-length <FLOAT>             DEFAULT 0.50
      --max-bases-randomly-matched <INT>     DEFAULT 10
      --max-extension-length <INT>           DEFAULT 10

      * Threading
      --nb-tags-info-stored <INT>            DEFAULT 1000

      * Deep SNV search option
      --nb-nucleotides-snv-comparison <INT>  DEFAULT 8
    ]]></help>

</tool>
#!/bin/sh
# File: crac_wrapper.sh
#
# Galaxy wrapper around the "crac" binary.
#
# Usage:
#   crac_wrapper.sh INDEX_INPUT EXTRA_FILES_PATH [crac options...]
#
#   INDEX_INPUT       either a directory containing one <name>.ssa/<name>.conf
#                     pair (index taken from the history) or the path of a
#                     pre-built index without its extension
#   EXTRA_FILES_PATH  second argument sent by crac.xml; it is dropped and not
#                     used by this script
#   crac options      forwarded untouched to crac after "-i <index>"
#
# Exit status: 1 when the index is incomplete, 2 on a usage error, otherwise
# the exit status of crac.

# Recovering special parameters from crac.xml
###############################################################
CRAC_BINARY=crac

if [ "$#" -lt 2 ]; then
    echo "Usage: $0 INDEX_INPUT EXTRA_FILES_PATH [crac options...]" >&2
    exit 2
fi

INDEX_INPUT="$1"

# Getting the indexed genome value
###############################################################
# Getting the indexed genome name without the extension
if [ -d "$INDEX_INPUT" ]; then
    # $INDEX_INPUT is a directory, that is to say an index from the history.
    cpt=0
    for fichier in "$INDEX_INPUT"/*.ssa
    do
        # Plain POSIX arithmetic: "++" is not available in every /bin/sh.
        cpt=$((cpt + 1))
        if [ "$cpt" -gt 1 ]; then # More than one '.ssa' file is not expected
            echo "Warning:Multiple indexes found [$INDEX]" >&2
        fi
        # When the glob matches nothing the pattern itself is kept here and
        # the file checks below report the missing index.
        INDEX=${fichier%.ssa}
    done
else
    INDEX="$INDEX_INPUT" # Getting the prebuilt index
fi

# Both the '.ssa' and the '.conf' file are required: fail if either is missing.
if [ ! -f "$INDEX.ssa" ] || [ ! -f "$INDEX.conf" ]; then
    echo "Error:Index not found [$INDEX]" >&2
    exit 1
fi

# Execution of the command line
###############################################################
shift 2 # Dropping index_input and output_name.extra_files_path

# Run crac directly so that arguments containing spaces are preserved, and
# hand its exit status back to the caller instead of always reporting success.
"$CRAC_BINARY" -i "$INDEX" "$@"
exit $?
<?xml version="1.0"?>
<!-- File: tool_dependencies.xml : Tool Shed recipe that builds and installs crac 1.3.0. -->
<tool_dependency>
    <package name="crac" version="1.3.0">
        <install version="1.0">
            <actions>
                <action type="download_by_url">https://gforge.inria.fr/frs/download.php/32471/crac-1.3.0.tar.gz</action>
                <!-- Install under the directory Galaxy reserves for this package. -->
                <action type="shell_command">./configure --prefix=$INSTALL_DIR</action>
                <action type="shell_command">make</action>
                <action type="shell_command">make check</action>
                <action type="shell_command">make install</action>
                <!-- Expose the installed binaries to tools that require this package. -->
                <action type="set_environment">
                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
                </action>
            </actions>
        </install>
        <readme>
CRAC requires g++ 4.3 or later.
        </readme>
    </package>
</tool_dependency>