Mercurial > repos > devteam > picard
diff picard_CollectRnaSeqMetrics.xml @ 3:52fdfc45590a draft
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/picard commit 00a7926c285bc4a339bd7deebf40b28f39c7d947-dirty
author | devteam |
---|---|
date | Thu, 16 Jul 2015 15:32:31 -0400 |
parents | |
children | 2589e6207cb4 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/picard_CollectRnaSeqMetrics.xml Thu Jul 16 15:32:31 2015 -0400 @@ -0,0 +1,201 @@ +<tool name="CollectRnaSeqMetrics" id="picard_CollectRnaSeqMetrics" version="@TOOL_VERSION@.0"> + <description> collect metrics about the alignment of RNA to various functional classes of loci in the genome</description> + <macros> + <import>picard_macros.xml</import> + </macros> + <expand macro="requirements"> + <requirement type="package" version="3.1.2">R</requirement> + </expand> + <command> + + ## Set up input files + + ## Reference sequences + + #set $reference_fasta_filename = "localref.fa" + + #if str( $reference_source.reference_source_selector ) == "history": + ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" && + #else: + #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path ) + #end if + + ## refFlat data + ## The awk line below converts a file obtained from UCSC as specified in the tool help to refFlat format + + grep -v '^#' ${refFlat} | awk '{print $11"\t"$1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}' > refFlat.tab && + + ## Start picard command + + @java_options@ + java -jar \$JAVA_JAR_PATH/picard.jar + CollectRnaSeqMetrics + REF_FLAT=refFlat.tab + + #if str( $ribosomal_intervals ) != "None": + RIBOSOMAL_INTERVALS="${ribosomal_intervals}" + #end if + + STRAND_SPECIFICITY="${strand_specificity}" + MINIMUM_LENGTH="${minimum_length}" + CHART_OUTPUT="${pdfFile}" + + #for $sequence_to_ignore in $ignore_list: + IGNORE_SEQUENCE="${sequence_to_ignore.sequence}" + #end for + + RRNA_FRAGMENT_PERCENTAGE="${rrna_fragment_percentage}" + METRIC_ACCUMULATION_LEVEL="${metric_accumulation_level}" + INPUT="${inputFile}" + OUTPUT="${outFile}" + REFERENCE_SEQUENCE="${reference_fasta_filename}" + ASSUME_SORTED="${assume_sorted}" + + QUIET=true + VERBOSITY=ERROR + VALIDATION_STRINGENCY=${validation_stringency} + + </command> + + <inputs> + <param format="sam,bam" type="data" name="inputFile" label="Select SAM/BAM dataset or dataset collection" help="If empty, upload or import a SAM/BAM dataset" /> + <conditional name="reference_source"> + <param name="reference_source_selector" type="select" label="Load reference genome from"> + <option value="cached">Local cache</option> + <option value="history">History</option> + </param> + <when value="cached"> + <param name="ref_file" type="select" label="Using reference genome" help="REFERENCE_SEQUENCE"> + <options from_data_table="all_fasta"></options> + <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/> + </param> + </when> + <when value="history"> + <param name="ref_file" type="data" format="fasta" label="Use the folloing dataset as the reference sequence" help="REFERENCE_SEQUENCE; You can upload a FASTA sequence to the history and use it as reference" /> + </when> + </conditional> + <param format="tabular" name="refFlat" type="data" label="Gene annotations in refFlat form" help="See "Obtaining gene annotations in refFlat format" below for help" /> + <param name="ribosomal_intervals" format="picard_interval_list" type="data" optional="True" label="Location of rRNA sequences in genome, in interval_list format" help="RIBOSOMAL_INTERVALS; If not specified no bases will be identified as being ribosomal. The list of intervals can be geberated from BED or Interval datasets using Galaxy BedToIntervalList tool"/> + <param name="strand_specificity" type="select" label="What is the RNA-seq library strand specificity" help="STRAND_SPECIFICITY; For unpaired reads, use FIRST_READ_TRANSCRIPTION_STRAND if the reads are expected to be on the transcription strand."> + <option value="NONE" select="True">None</option> + <option value="FIRST_READ_TRANSCRIPTION_STRAND">First read transcription strand</option> + <option value="SECOND_READ_TRANSCRIPTION_STRAND">Second read transcription strand</option> + </param> + <param name="minimum_length" type="integer" value="500" label="When calculating coverage based values use only use transcripts of this length or greater" help="MINIMUM_LENGTH; default=500"/> + <repeat name="ignore_list" title="Sequences to ignore" min="0" help="You can provide multiple sequences by clicking the button below"> + <param name="sequence" type="text" size="80" label="Ignore reads matching this sequence"/> + </repeat> + <param name="rrna_fragment_percentage" type="float" value="0.8" label="This percentage of the length of a fragment must overlap one of the ribosomal intervals for a read or read pair to be considered rRNA." help="RRNA_FRAGMENT_PERCENTAGE; default=0.8"/> + <param name="metric_accumulation_level" type="select" label="The level(s) at which to accumulate metrics" multiple="true" help="METRIC_ACCUMULATION_LEVEL"> + <option value="ALL_READS" selected="True">All reads</option> + <option value="SAMPLE">Sample</option> + <option value="LIBRARY">Library</option> + <option value="READ_GROUP">Read group</option> + </param> + <param name="assume_sorted" type="boolean" label="Assume the input file is already sorted" checked="true" truevalue="true" falsevalue="false" help="ASSUME_SORTED"/> + + <expand macro="VS" /> + + </inputs> + <outputs> + <data format="pdf" name="pdfFile" label="${tool.name} on ${on_string}: Chart PDF"/> + <data format="tabular" name="outFile" label="${tool.name} on ${on_string}: Summary stats"/> + </outputs> + + <stdio> + <exit_code range="1:" level="fatal"/> + </stdio> + <tests> + <test> + <param name="reference_source_selector" value="history"/> + <param name="ref_file" value="picard_CollectRnaSeqMetrics_ref.fa" ftype="fasta"/> + <param name="inputFile" value="picard_CollectRnaSeqMetrics.bam" ftype="bam"/> + <param name="assume_sorted" value="true" /> + <param name="refFlat" value="picard_CollectRnaSeqMetrics.refFlat" /> + <param name="metric_accumulation_level" value="ALL_READS" /> + <param name="minimum_length" value="500" /> + <param name="strand_specificity" value="NONE" /> + <param name="rrna_fragment_percentage" value="0.8" /> + <output name="outFile" file="picard_CollectRnaSeqMetrics_test1.tab" ftype="tabular" lines_diff="4"/> + </test> + + </tests> + <help> + +.. class:: infomark + +**Purpose** + +Collects metrics about the alignment of RNA to various functional classes of loci in the genome: coding, intronic, UTR, intergenic, ribosomal. + +@dataset_collections@ + +----- + +.. class:: warningmark + +**Obtaining gene annotations in refFlat format** + +This tool requires gene annotations in refFlat_ format. These data can be obtained from UCSC table browser directly through Galaxy by following these steps: + + 1. Click on **Get Data** in the upper part of left pane of Galaxy interface + 2. Click on **UCSC Main** link + 3. Set your genome and dataset of interest. It **must** be the same genome build against which you have mapped the reads contained in the BAM file you are analyzing + 4. In the **output format** field choose **selected fields from primary and related tables** + 5. Click **get output** button + 6. In the first table presented at the top of the page select (using checkboxes) first 11 fields: + name + chrom + strand + txStart + txEnd + cdsStart + cdsEnd + exonCount + exonStarts + exonEnds + proteinId + 7. Click **done with selection** + 8. Click **Send query to Galaxy** + 9. A new dataset will appear in the current Galaxy history + 10. Use this dataset as the input for **Gene annotations in refFlat form** dropdown of this tool + +.. _refFlat: http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat + +@description@ + + REF_FLAT=File Gene annotations in refFlat form. Format described here: + http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat Required. + + RIBOSOMAL_INTERVALS=File Location of rRNA sequences in genome, in interval_list format. If not specified no bases + will be identified as being ribosomal. Format described here: + http://picard.sourceforge.net/javadoc/net/sf/picard/util/IntervalList.html and can be + generated from BED datasetes using Galaxy's wrapper for picard_BedToIntervalList tool + + STRAND_SPECIFICITY=StrandSpecificity + STRAND=StrandSpecificity For strand-specific library prep. For unpaired reads, use FIRST_READ_TRANSCRIPTION_STRAND + if the reads are expected to be on the transcription strand. Required. Possible values: + {NONE, FIRST_READ_TRANSCRIPTION_STRAND, SECOND_READ_TRANSCRIPTION_STRAND} + + MINIMUM_LENGTH=Integer When calculating coverage based values (e.g. CV of coverage) only use transcripts of this + length or greater. Default value: 500. + + IGNORE_SEQUENCE=String If a read maps to a sequence specified with this option, all the bases in the read are + counted as ignored bases. + + RRNA_FRAGMENT_PERCENTAGE=Double + This percentage of the length of a fragment must overlap one of the ribosomal intervals + for a read or read pair by this must in order to be considered rRNA. Default value: 0.8. + + METRIC_ACCUMULATION_LEVEL=MetricAccumulationLevel + LEVEL=MetricAccumulationLevel The level(s) at which to accumulate metrics. Possible values: {ALL_READS, SAMPLE, + LIBRARY, READ_GROUP} This option may be specified 0 or more times. + + ASSUME_SORTED=Boolean + AS=Boolean If true (default), then the sort order in the header file will be ignored. Default + value: true. Possible values: {true, false} + +@more_info@ + + </help> +</tool>