view data_manager/rnastar_index_builder.xml @ 0:84f6e67cbae5 draft

Initial commit - problems testing on my ram starved laptop - need to test on a bigger ram machine
author fubar
date Mon, 29 Sep 2014 20:25:10 -0400
parents
children ebadd2c92958
line wrap: on
line source

<tool id="rnastar_index_builder_data_manager" name="rnastar index" tool_type="manage_data" version="0.0.1">
    <!-- Short description that accompanies the tool name "rnastar index". -->
    <description>builder</description>

    <!-- Tool dependency: the rnastar package, pinned to version 2.4.0d. -->
    <requirements>
        <requirement version="2.4.0d" type="package">rnastar</requirement>
    </requirements>
    <command interpreter="python">
rnastar_index_builder.py
--out_file "${out_file}"
--fasta_filename "${all_fasta_source.fields.path}"
--fasta_dbkey "${all_fasta_source.fields.dbkey}"
--fasta_description "${all_fasta_source.fields.name}"
--data_table_name "rnastar_indexes"
--out_index_path "$out_file.extra_files_path"
--runThreadN 1
## The script name above must stay the first token of this template.
## Splice junction arguments follow; exactly one branch applies, selected by
## the "modelformat" choice of the "genemodel" conditional.
## NOTE(review): the sjdbGTFchrPrefix input offered for gff3 models is not
## forwarded to the script here - confirm whether that is intentional.
#if $genemodel.modelformat == "gff3"
--sjdbOverhang "${genemodel.sjdbOverhang}"
--sjdbGTFfile "${genemodel.sjdbGTFfile}"
--sjdbGTFtagExonParentTranscript "${genemodel.sjdbGTFtagExonParentTranscript}"
--sjdbGTFfeatureExon "${genemodel.sjdbGTFfeatureExon}"
#elif $genemodel.modelformat == "bed"
--sjdbFileChrStartEnd "${genemodel.sjdbFileChrStartEnd}"
--sjdbOverhang "${genemodel.sjdbOverhang}"
#elif $genemodel.modelformat == "None"
## No gene model supplied: an overhang of 0 is passed.
--sjdbOverhang 0
#end if
    </command>
    <stdio>
        <!-- Without an exit_code rule a non-zero exit status is not treated as an
             error, so a crashed index build would still be reported as a success.
             Mark any positive exit status as fatal. -->
        <exit_code range="1:" level="fatal" description="Index builder exited with a non-zero status"/>
        <!-- Any text on stdout or stderr is reported as a warning. -->
        <regex match=".*" source="both" level="warning" description="stdout/err chatter:"/>
    </stdio>

    <inputs>
        <!-- Genome to index, chosen from the entries of the all_fasta data table. -->
        <param name="all_fasta_source" type="select" label="Source FASTA Sequence">
            <options from_data_table="all_fasta"/>
        </param>

        <!-- NOTE(review): sequence_name and sequence_id are not referenced by this
             tool's command template - confirm whether the builder script should
             receive them. -->
        <param name="sequence_name" type="text" value="" label="Informative name for sequence index"/>
        <param name="sequence_id" type="text" value="" label="ID for sequence index"/>

        <!-- Optional gene model used for splice junction indexing. The selected
             format decides which of the parameters below are shown. -->
        <conditional name="genemodel">
            <param name="modelformat" type="select"
                   label="Choose the format of gene model data from your history - bed or gff3"
                   help="This will be the source of splice junction indexing if required">
                <option value="gff3" selected="true">gff3,gtf</option>
                <option value="bed">BED - tabular chr,start,end,strand</option>
                <option value="None">None - no splice junction index</option>
            </param>

            <when value="gff3">
                <param name="sjdbGTFfile" type="data" format="gff3,gff"
                       label="Gene model - must be gff3 or compatible and must match the input genome"
                       help="Required if you want to index splice junctions during index generation."/>

                <!-- NOTE(review): sjdbGTFchrPrefix is not referenced by this tool's
                     command template, so the value entered here currently has no
                     effect - confirm whether it should be forwarded. -->
                <param name="sjdbGTFchrPrefix" type="text" value="chr" label="String prefix for GTF chromosomes"
                       help='GTF prefix for chromosome names (e.g. "chr" to use ENSEMBL annotations with UCSC genomes)'>
                    <sanitizer invalid_char="">
                        <valid initial="string.printable"/>
                    </sanitizer>
                </param>

                <!-- Default is "exon": STAR matches this against the feature type in
                     column 3 of the GTF, not against an attribute such as exon_id. -->
                <param name="sjdbGTFfeatureExon" type="text" value="exon" label="GTF feature to use as exon marker"
                       help="GTF feature type in GTF file to be used as exons for building transcripts - use what's in your GTF">
                    <sanitizer invalid_char="">
                        <valid initial="string.printable"/>
                    </sanitizer>
                </param>

                <param name="sjdbGTFtagExonParentTranscript" type="text" value="transcript_id"
                       label="GTF feature to define for each exon's parents"
                       help="GTF tag name to be used as exons' parents for building transcripts - use what's in your GTF">
                    <sanitizer invalid_char="">
                        <valid initial="string.printable"/>
                    </sanitizer>
                </param>

                <!-- min="0" enforces the int>=0 constraint stated in the help text. -->
                <param name="sjdbOverhang" type="integer" value="100" min="0"
                       label="Splice junction overhang. If=0, splice junction database NOT used"
                       help="int>=0: length of the donor/acceptor sequence on each side, (mate_length - 1)"/>
            </when>

            <when value="bed">
                <param name="sjdbFileChrStartEnd" type="data" format="bed"
                       label="Introns as a tabular bed (chr,start,end,strand) file matching the input genome"
                       help="Required if you want to index splice junctions during index generation."/>

                <!-- min="0" enforces the int>=0 constraint stated in the help text. -->
                <param name="sjdbOverhang" type="integer" value="100" min="0"
                       label="Splice junction overhang. If=0, splice junction database NOT used"
                       help="int>=0: length of the donor/acceptor sequence on each side, (mate_length - 1)"/>
            </when>

            <!-- No gene model: nothing further to ask for. -->
            <when value="None"/>
        </conditional>
    </inputs>
    <outputs>
        <!-- Single dataset of format data_manager_json; the command template refers
             to it as out_file and to its extra_files_path. -->
        <data format="data_manager_json" name="out_file"/>
    </outputs>
    <help>

.. class:: infomark

*What it does*

This is a Galaxy data manager that builds genome indexes for the RNA STAR gap-aware RNA aligner.

Please read the fine manual - that and the google group are the places to learn about the options above.

*Note on sjdbOverhang*

From https://groups.google.com/forum/#!topic/rna-star/h9oh10UlvhI::

  James is right, using large enough --sjdbOverhang is safer and should not generally cause any problems with reads of varying length. If your reads are very short, &lt;50b, then I would strongly recommend using optimum --sjdbOverhang=mateLength-1
  By mate length I mean the length of one of the ends of the read, i.e. it's 100 for 2x100b PE or 1x100b SE. For longer reads you can simply use generic --sjdbOverhang 100.
  It is a bit confusing because of the way I named this parameter. --sjdbOverhang Noverhang is only used at the genome generation step  for constructing the reference sequence out of the annotations.
  Basically, the Noverhang exonic bases from the donor site and Noverhang exonic bases from the acceptor site are spliced together for each of the junctions, and these spliced sequences are added to the genome sequence.

  At the mapping stage, the reads are aligned to both genomic and splice sequences simultaneously. If a read maps to one of the spliced sequences and crosses the "junction" in the middle of it, the coordinates of two spliced pieces are translated back to genomic space and added to the collection of mapped pieces, which are then all "stitched" together to form the final alignment. Since in the process of "maximal mapped length" search the read is split into pieces of no longer than --seedSearchStartLmax (=50 by default) bases, even if the read (mate) is longer than --sjdbOverhang, it can still be mapped to the spliced reference, as long as --sjdbOverhang > --seedSearchStartLmax.

  Cheers
  Alex

*Note on gene model requirements for splice junctions*

From https://groups.google.com/forum/#!msg/rna-star/3Y_aaTuzBrE/lUylTB8h5vMJ::

    When you generate a genome with annotations, you need to specify --sjdbOverhang value, which ideally should be equal to (oneMateLength-1), or you could use a generic value of ~100.

    Your gtf lines look fine to me. STAR needs 3 features from a GTF file:
    1. Chromosome names in col.1 that agree with chromosome names in genome .fasta files. If you have "chr2L" names in the genome .fasta files, and "2L" in the .gtf file, then you need to use --sjdbGTFchrPrefix chr option.
    2. 'exon' in col.3 for the exons of all transcripts (this name can be changed with --sjdbGTFfeatureExon)
    3. 'transcript_id' attribute that assigns each exon to a transcript (this name can be changed with --sjdbGTFtagExonParentTranscript)

    Cheers
    Alex

**Notice:** If you leave name, description, or id blank, they will be generated automatically.

    </help>
</tool>