changeset 0:e23440b3332a draft

Uploaded
author ieguinoa
date Tue, 25 May 2021 15:14:56 +0000
parents
children 67c59c6576db
files data_manager/macros.xml data_manager/rna_star_index_builder.py data_manager/rna_star_index_builder.xml data_manager_conf.xml test-data/all_fasta.loc test-data/phiX174.fasta test-data/rnastar_index2_versioned.loc test-data/test_star_01.data_manager_json tool-data/all_fasta.loc.sample tool-data/rnastar_index2x_versioned.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test
diffstat 11 files changed, 673 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/macros.xml	Tue May 25 15:14:56 2021 +0000
@@ -0,0 +1,248 @@
+<macros>
+    <!-- REMEMBER to bump the version of rna_star_index_builder_data_manager
+    whenever you make changes to the following two version tokens!
+    The data manager uses a symlink to this macro file to keep the STAR and
+    the index versions in sync, but you should manually adjust the +galaxy
+    version number. -->
+    <!-- STAR version to be used -->
+    <token name="@VERSION@">2.7.8a</token>
+    <!-- STAR index version compatible with this version of STAR
+    This is the STAR version that introduced the index structure expected
+    by the current version.
+    It can be found for any specific version of STAR with:
+    STAR -h | grep versionGenome
+    or by looking for the versionGenome parameter in source/parametersDefault
+    of STAR's source code -->
+    <token name="@IDX_VERSION@">2.7.4a</token>
+    <token name="@IDX_DATA_TABLE@">rnastar_index2x_versioned</token>
+
+    <!-- Shared requirements macro: pins the star package to the @VERSION@
+    token and samtools to 1.9. The yield element lets an expanding tool
+    append further requirement elements of its own. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@VERSION@">star</requirement>
+            <requirement type="package" version="1.9">samtools</requirement>
+            <yield />
+        </requirements>
+    </xml>
+
+    <!-- EDAM topic and operation annotations, packaged as a macro so a tool
+    file can pull them in with a single expand element. -->
+    <xml name="edam">
+        <edam_topics>
+            <edam_topic>topic_3170</edam_topic>
+            <edam_topic>topic_3308</edam_topic>
+        </edam_topics>
+        <edam_operations>
+            <edam_operation>operation_0292</edam_operation>
+        </edam_operations>
+    </xml>
+
+    <!-- Select parameter listing prebuilt indices from the @IDX_DATA_TABLE@
+    data table. A row is offered only when its column 4 equals the
+    with_gene_model token supplied by the expanding tool (default "0") and
+    its column 5 equals @IDX_VERSION@; the surviving rows are sorted by
+    column 2. Columns 4 and 5 are presumably with_gene_model and version,
+    as in the column order of data_manager_conf.xml (TODO confirm against
+    the data table definition). -->
+    <xml name="index_selection" token_with_gene_model="0">
+        <param argument="--genomeDir" name="genomeDir" type="select"
+        label="Select reference genome"
+        help="If your genome of interest is not listed, contact the Galaxy team">
+            <options from_data_table="@IDX_DATA_TABLE@">
+                <filter type="static_value" column="4" value="@WITH_GENE_MODEL@" />
+                <filter type="static_value" column="5" value="@IDX_VERSION@" />
+                <filter type="sort_by" column="2" />
+                <validator type="no_options" message="No indexes are available for the selected input dataset" />
+            </options>
+        </param>
+    </xml>
+
+    <!-- STAR option fragment that makes STAR read its input files through
+    zcat; @READSHANDLING@ emits it for gzip-compressed FASTQ inputs. -->
+    <token name="@FASTQ_GZ_OPTION@">
+        --readFilesCommand zcat
+    </token>
+    <!-- Citation block (a single DOI) for expansion in tool files. -->
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1093/bioinformatics/bts635</citation>
+        </citations>
+    </xml>
+    <!-- Splice junction database parameters: the gene model file and the
+    overhang length. The expanding tool decides through the optional token
+    (default "true") whether the gene model file may be left unset. -->
+    <xml name="@SJDBOPTIONS@" token_optional="true">
+        <param argument="--sjdbGTFfile"
+               type="data"
+               format="gff3,gtf"
+               optional="@OPTIONAL@"
+               label="Gene model (gff3,gtf) file for splice junctions"
+               help="Exon junction information for mapping splices"/>
+        <param argument="--sjdbOverhang"
+               type="integer"
+               min="1"
+               value="100"
+               label="Length of the genomic sequence around annotated junctions"
+               help="Used in constructing the splice junctions database. Ideal value is ReadLength-1"/>
+    </xml>
+    <!-- Output actions that set the dbkey metadata of an output dataset.
+    With a cached index (geneSource is "indexed") the dbkey is taken from
+    column 1 of the @IDX_DATA_TABLE@ row whose column 0 matches the selected
+    genomeDir value; rows whose column 0 starts with "#" are dropped first.
+    With a history reference the dbkey is copied from the FASTA dataset. -->
+    <xml name="dbKeyActions">
+        <actions>
+            <conditional name="refGenomeSource.geneSource">
+                <when value="indexed">
+                    <action type="metadata" name="dbkey">
+                        <option type="from_data_table" name="@IDX_DATA_TABLE@" column="1" offset="0">
+                            <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
+                            <filter type="param_value" ref="refGenomeSource.GTFconditional.genomeDir" column="0"/>
+                        </option>
+                    </action>
+                </when>
+                <when value="history">
+                    <action type="metadata" name="dbkey">
+                        <option type="from_param" name="refGenomeSource.genomeFastaFiles" param_attribute="dbkey" />
+                    </action>
+                </when>
+            </conditional>
+        </actions>
+    </xml>
+    <!-- Command fragment that builds a throw-away STAR index in the
+    directory tempstargenomedir when the reference genome comes from the
+    history; it expands to nothing for cached indices. Gene model options
+    are read from refGenomeSource.GTFconditional when that conditional
+    exists and directly from refGenomeSource otherwise. The fragment ends
+    with a shell AND operator so the mapping command can follow it
+    directly. -->
+    <token name="@TEMPINDEX@"><![CDATA[
+    ## Create temporary index for custom reference
+    #if str($refGenomeSource.geneSource) == 'history':
+        mkdir -p tempstargenomedir &&
+        STAR
+            --runMode genomeGenerate
+            --genomeDir 'tempstargenomedir'
+            --genomeFastaFiles '${refGenomeSource.genomeFastaFiles}'
+            ## Handle difference between indices with/without annotations
+            #if 'GTFconditional' in $refGenomeSource:
+                ## GTFconditional exists only in STAR, but not STARsolo
+                #if str($refGenomeSource.GTFconditional.GTFselect) == 'with-gtf':
+                    --sjdbOverhang '${refGenomeSource.GTFconditional.sjdbOverhang}'
+                    --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}'
+                    #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3':
+                        --sjdbGTFtagExonParentTranscript Parent
+                    #end if
+                #end if
+            #else:
+                ## ref genome selection is less complex for STARsolo cause
+                ## with-gtf is mandatory there
+                --sjdbOverhang '${refGenomeSource.sjdbOverhang}'
+                --sjdbGTFfile '${refGenomeSource.sjdbGTFfile}'
+                #if str($refGenomeSource.sjdbGTFfile.ext) == 'gff3':
+                    --sjdbGTFtagExonParentTranscript Parent
+                #end if
+            #end if
+            #if str($refGenomeSource.genomeSAindexNbases):
+                --genomeSAindexNbases ${refGenomeSource.genomeSAindexNbases}
+            #end if
+            --runThreadN \${GALAXY_SLOTS:-4}
+        &&
+    #end if
+    ]]></token>
+    <!-- Command fragment supplying the thread count, the genome loading
+    mode and the genome directory for mapping: tempstargenomedir for a
+    history reference, otherwise the path field of the selected data table
+    row. For a cached index selected as "without-gtf", a gene model file
+    given at mapping time is passed on together with its overhang value.
+    Reading aid: the last "end if" closes the outermost history/cached test
+    even though it is indented like the one before it. -->
+    <token name="@REFGENOMEHANDLING@" ><![CDATA[
+    --runThreadN \${GALAXY_SLOTS:-4}
+    --genomeLoad NoSharedMemory
+    --genomeDir
+    #if str($refGenomeSource.geneSource) == 'history':
+        tempstargenomedir
+    #else:
+        '${refGenomeSource.GTFconditional.genomeDir.fields.path}'
+        ## Handle difference between indices with/without annotations
+        #if str($refGenomeSource.GTFconditional.GTFselect) == 'without-gtf':
+            #if $refGenomeSource.GTFconditional.sjdbGTFfile:
+                --sjdbOverhang $refGenomeSource.GTFconditional.sjdbOverhang
+                --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}'
+                #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3':
+                    --sjdbGTFtagExonParentTranscript Parent
+                #end if
+            #end if
+        #end if
+        #end if
+        ]]></token>
+    <!-- Command fragment assembling the read inputs from the sc section.
+    In "repeat" mode the datasets of input1 and input2 are zipped pairwise,
+    each pair is asserted to share a datatype, and both sides are joined
+    into comma-separated lists; in "list_paired" mode the forward and
+    reverse members of the collection are used. The second read set is
+    passed to STAR before the first. The zcat option is added based on the
+    type of r1 only, which in "repeat" mode is the loop variable left over
+    from the last zipped pair. -->
+    <token name="@READSHANDLING@" ><![CDATA[
+    ## Check that the input pairs are of the same type
+    ## otherwise STARsolo will run for a long time and then error out.
+    ## We consume either repeats of two inputs R1 + R2
+    ## or a collection of paired reads.
+    #if str($sc.input_types.use) == "repeat":
+        #set $reads1 = []
+        #set $reads2 = []
+        #for $r1, $r2 in zip($sc.input_types.input1, $sc.input_types.input2):
+            #assert $r1.datatype == $r2.datatype
+            #silent $reads1.append(str($r1))
+            #silent $reads2.append(str($r2))
+        #end for
+        #set $reads1 = ','.join($reads1)
+        #set $reads2 = ','.join($reads2)
+    #elif str($sc.input_types.use) == "list_paired":
+        #set $r1 = $sc.input_types.input_collection.forward
+        #set $r2 = $sc.input_types.input_collection.reverse
+        #set $reads1 = $r1
+        #set $reads2 = $r2
+    #end if
+    ## cDNA sequence(s) [R2] always go first, then barcode(s) [R1]
+    ## see: Section 3.2 of STAR manual for multiple inputs, and Section 13 for STARsolo inputs
+    --readFilesIn $reads2 $reads1
+    --soloCBmatchWLtype $sc.soloCBmatchWLtype
+    #if $r1.is_of_type('fastq.gz', 'fastqsanger.gz'):
+        @FASTQ_GZ_OPTION@
+    #end if
+    ]]></token>
+    <!-- Parameters for indexing a reference FASTA taken from the history:
+    the FASTA dataset itself and the suffix array pre-indexing length. -->
+    <xml name="ref_selection">
+        <param argument="--genomeFastaFiles"
+               type="data"
+               format="fasta"
+               label="Select a reference genome" />
+        <param argument="--genomeSAindexNbases"
+               type="integer"
+               min="2"
+               max="16"
+               value="14"
+               label="Length of the SA pre-indexing string"
+               help="Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1)"/>
+    </xml>
+    <!-- Error detection rules: each pattern is searched in both stdout and
+    stderr and is reported at level "fatal". The yield element lets an
+    expanding tool add further rules. -->
+    <xml name="stdio" >
+        <stdio>
+            <regex match="FATAL error" source="both" level="fatal"/>
+            <regex match="EXITING: FATAL INPUT ERROR:" source="both" level="fatal"/>
+            <regex match="EXITING: fatal error trying to allocate genome arrays, exception thrown: std::bad_alloc" source="both" level="fatal"/>
+            <regex match="\[sam_read1\] missing header\? Abort!" source="both" level="fatal"/>
+            <yield />
+        </stdio>
+    </xml>
+    <!-- Read input chooser: either two multi-dataset parameters (input1 for
+    barcode reads, input2 for cDNA reads) or a single paired collection.
+    The names use, input1, input2 and input_collection are the ones read by
+    the @READSHANDLING@ token. -->
+    <xml name="input_selection">
+        <conditional name="input_types" >
+            <param name="use" type="select" label="Input Type" >
+                <option value="repeat" >Separate barcode and cDNA reads</option>
+                <option value="list_paired" >Paired collection of barcode and cDNA reads</option>
+            </param>
+            <when value="repeat">
+                <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input1" type="data"  multiple="true"
+                label="RNA-Seq FASTQ/FASTA file, Barcode reads" />
+                <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input2" type="data"  multiple="true"
+                label="RNA-Seq FASTQ/FASTA file, cDNA reads"/>
+            </when>
+            <when value="list_paired">
+                <param name="input_collection" collection_type="paired" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="Collection of Pairs" />
+            </when>
+        </conditional>
+    </xml>
+    <!-- Read input chooser offering a flat list collection of single-end
+    files or a list of pairs (list:paired) for paired-end files. -->
+    <xml name="input_selection_smart_seq">
+        <conditional name="input_types_smart_seq" >
+            <param name="use" type="select" label="Input Type" >
+                <option value="list_single_end" >Single-end FASTQ collection</option>
+                <option value="list_paired_end" >Paired FASTQ collection</option>
+            </param>
+            <when value="list_single_end">
+                <param name="single_end_collection" collection_type="list" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="List of single-end FASTQ files" />
+            </when>
+            <when value="list_paired_end">
+                <param name="paired_end_collection" collection_type="list:paired" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="List of paired-end FASTQ files" />
+            </when>
+        </conditional>
+    </xml>
+    <!-- Option list for a UMI deduplication select; 1MM_All is preselected. -->
+    <xml name="umidedup_options">
+        <option value="1MM_All" selected="true">Collapse all UMIs with 1 mismatch distance to each other</option>
+        <option value="1MM_Directional_UMItools" >Directional method from the UMI-tool</option>
+        <option value="1MM_Directional" >Directional with stringent UMI deduplication</option>
+    </xml>
+    <!-- Option list mapping the numeric anchor codes 0 to 3 to read start,
+    read end, adapter start and adapter end. -->
+    <xml name="anchor_types">
+        <option value="0">Read start</option>
+        <option value="1">Read end</option>
+        <option value="2">Adapter start</option>
+        <option value="3">Adapter end</option>
+    </xml>
+    <!-- Barcode-to-whitelist matching options without a preselected entry:
+    Exact and 1MM. -->
+    <xml name="cb_match_wl_common">
+        <option value="Exact" >Exact</option>
+        <option value="1MM" >Single match</option>
+    </xml>
+    <!-- Additional barcode-to-whitelist matching options labelled after
+    CellRanger behaviour; 1MM_multi is preselected. -->
+    <xml name="cb_match_wl_cellranger">
+        <option value="1MM_multi" selected="true" >Multiple matches (CellRanger 2)</option>
+        <option value="1MM_multi_pseudocounts" >Multiple matches (CellRanger 3)</option>
+        <option value="1MM_multi_Nbase_pseudocounts" >Multimatching to WL is allowed for CBs with N-bases (CellRanger 3)</option>
+    </xml>
+    <!-- Parameters for adapter-anchored barcodes. The sanitizer limits the
+    adapter text to digits, "-" and the letters A, T, C, G and N; "-" is
+    the default value. NOTE(review): the clipAdapterType select has no
+    label attribute; confirm the form falls back to the argument name. -->
+    <xml name="solo_adapter_params">
+        <param argument="--soloAdapterSequence" type="text" value="-" label="Adapter sequence to anchor barcodes." >
+            <sanitizer>
+                <valid initial="string.digits">
+                    <add value="-"/>
+                    <add value="A"/>
+                    <add value="T"/>
+                    <add value="C"/>
+                    <add value="G"/>
+                    <add value="N"/>
+                </valid>
+            </sanitizer>
+        </param>
+        <param argument="--soloAdapterMismatchesNmax" type="integer" min="1" value="1" label="Maximum number of mismatches allowed in adapter sequence" />
+        <param argument="--clipAdapterType" type="select" >
+            <option value="Hamming" selected="true" >Adapter clipping based on Hamming distance</option>
+            <option value="CellRanger4" >5p and 3p adapter clipping similar to CellRanger4</option>
+            <option value="None" >No adapter clipping</option>
+        </param>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/rna_star_index_builder.py	Tue May 25 15:14:56 2021 +0000
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+
+import argparse
+import json
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--config-file')
+    parser.add_argument('--value')
+    parser.add_argument('--dbkey')
+    parser.add_argument('--name')
+    parser.add_argument('--subdir')
+    parser.add_argument('--data-table')
+    parser.add_argument('--with-gene-model', action='store_true')
+    parser.add_argument('--index-version')
+
+    args = parser.parse_args()
+
+    if args.dbkey in [None, '', '?']:
+        raise Exception(
+            '"%s" is not a valid dbkey. You must specify a valid dbkey.'
+            % (args.dbkey)
+        )
+
+    with_gene_model = "0"
+    if args.with_gene_model:
+        with_gene_model = "1"
+
+    data_manager_dict = {
+        'data_tables': {
+            args.data_table: [
+                {
+                    "value": args.value,
+                    "dbkey": args.dbkey,
+                    "name": args.name,
+                    "path": args.subdir,
+                    "with_gene_model": with_gene_model,
+                    "version": args.index_version
+                }
+            ]
+        }
+    }
+    with open(args.config_file, 'w') as fh:
+        json.dump(data_manager_dict, fh, sort_keys=True)
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/rna_star_index_builder.xml	Tue May 25 15:14:56 2021 +0000
@@ -0,0 +1,183 @@
+<tool id="rna_star_index_builder_data_manager_custom" name="rnastar index versioned" tool_type="manage_data" version="@IDX_VERSION@" profile="19.05">
+    <description>builder</description>
+
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+
+    <expand macro="requirements">
+        <requirement type="package" version="3.7">python</requirement>
+    </expand>
+
+    <!-- Two steps chained in one shell command: (1) STAR genomeGenerate
+    builds the index inside the extra_files_path of out_file, (2) the
+    helper script writes the data manager JSON describing that index.
+    The RAM limit handed to STAR is GALAXY_MEMORY_MB converted to bytes,
+    or 31000000000 when that variable is unset. -->
+    <command><![CDATA[
+if [ -z "\$GALAXY_MEMORY_MB" ] ; then
+    GALAXY_MEMORY_BYTES=31000000000 ;
+else
+    GALAXY_MEMORY_BYTES=\$((GALAXY_MEMORY_MB * 1000000)) ;
+fi ;
+
+#import os
+#set $target_directory = str($out_file.extra_files_path)
+#set $subdir = os.path.basename($target_directory)
+
+mkdir '${target_directory}' &&
+
+STAR
+--runMode genomeGenerate
+--genomeFastaFiles '${all_fasta_source.fields.path}'
+--genomeDir '${target_directory}'
+--limitGenomeGenerateRAM \${GALAXY_MEMORY_BYTES}
+#if str($GTFconditional.GTFselect) == "withGTF":
+    ## Test the selector parameter of the nested conditional, not the
+    ## conditional container itself, which never equals a string.
+    #if str($GTFconditional.GTF_source_conditional.GTF_source_select) == "builtin_gff":
+        ## Data table columns are reached through .fields, as for the FASTA
+        ## entry above (assumes the all_gff table has a "path" column).
+        --sjdbGTFfile '${GTFconditional.GTF_source_conditional.all_gff_source.fields.path}'
+    #else
+        --sjdbGTFfile '${GTFconditional.GTF_source_conditional.sjdbGTFfile}'
+    #end if
+    --sjdbOverhang ${GTFconditional.sjdbOverhang}
+#end if
+#if str($advanced_options.advanced_options_selector) == "advanced":
+    --genomeSAindexNbases ${advanced_options.genomeSAindexNbases}
+    --genomeChrBinNbits ${advanced_options.genomeChrBinNbits}
+    --genomeSAsparseD ${advanced_options.genomeSAsparseD}
+#end if
+--runThreadN \${GALAXY_SLOTS:-2} &&
+
+python '${__tool_directory__}/rna_star_index_builder.py'
+--config-file '${out_file}'
+--value '${all_fasta_source.fields.value}'
+--dbkey '${all_fasta_source.fields.dbkey}'
+--index-version '@IDX_VERSION@'
+#if $name:
+    --name '$name'
+#else
+    --name '${all_fasta_source.fields.name}'
+#end if
+#if str($GTFconditional.GTFselect) == "withGTF":
+    --with-gene-model
+#end if
+--data-table @IDX_DATA_TABLE@
+--subdir '${subdir}'
+    ]]></command>
+    <inputs>
+        <!-- Reference selection from the all_fasta data table, plus an
+        optional display name; when the name is left empty the command
+        falls back to the name field of the selected FASTA entry. -->
+        <param name="all_fasta_source" type="select" label="Source FASTA Sequence">
+            <options from_data_table="all_fasta"/>
+        </param>
+        <param name="name" type="text" value="" label="Informative name for sequence index"
+            help="By using different settings, you may have several indices per reference genome. Give an appropriate description to the index to distinguish between indices"/>
+        <!-- Chooses whether the index is built with a gene model. With a
+        gene model, the annotation comes either from the all_gff data table
+        or from a history dataset, and the overhang length is required. -->
+        <conditional name="GTFconditional">
+            <param name="GTFselect" type="select" label="Reference genome with or without an annotation" help="Must the index have been created WITH a GTF file (if not you can specify one afterward).">
+                <option value="withoutGTF">use genome reference without builtin gene-model</option>
+                <option value="withGTF">use genome reference with builtin gene-model</option>
+            </param>
+            <when value="withGTF">
+                <conditional name="GTF_source_conditional">
+                    <param name="GTF_source_select" type="select" label="Select source of annotation GTF/GFF" >
+                        <option value="builtin_gff">Use a builtin GTF/GFF</option>
+                        <option value="external_gff">Use an external GTF/GFF</option>
+                    </param>
+                    <when value="builtin_gff">
+                        <param name="all_gff_source" type="select" label="Source GTF/GFF annotation">
+                            <options from_data_table="all_gff"/>
+                        </param>
+                    </when>
+                    <when value="external_gff">
+                        <param argument="--sjdbGTFfile" type="data" format="gff3,gtf" label="Gene model (gff3,gtf) file for splice junctions" optional="false" help="Exon junction information for mapping splices"/>
+                    </when>
+                </conditional>
+                <param argument="--sjdbOverhang" type="integer" min="1" value="100" label="Length of the genomic sequence around annotated junctions" help="Used in constructing the splice junctions database. Ideal value is ReadLength-1"/>
+            </when>
+            <when value="withoutGTF" />
+        </conditional>
+        <!-- Optional tuning of the index layout; the three values are only
+        passed to STAR when "advanced" is selected. -->
+        <conditional name="advanced_options">
+            <param name="advanced_options_selector" type="select" label="Advanced options">
+                <option value="default" selected="true">Use default options</option>
+                <option value="advanced">Set advanced options</option>
+            </param>
+            <when value="default" />
+            <when value="advanced">
+                <param argument="--genomeSAindexNbases" type="integer" min="1" value="14"
+                    label="Length (bases) of the SA pre-indexing string"
+                    help="Typically between 10 and 15. Longer strings will use much more memory, but allow
+                        faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled
+                        down to min(14, log2(GenomeLength)/2 - 1). For example, for 1 megaBase genome, this is
+                        equal to 9, for 100 kiloBase genome, this is equal to 7."/>
+                <param argument="--genomeChrBinNbits" type="integer" min="1" value="18"
+                    label="Log2(chrBin), where chrBin is the size of the bins for genome storage"
+                    help="Each chromosome will occupy an integer number of bins. For a genome with large number
+                        of contigs, it is recommended to scale this parameter as min(18,
+                        log2[max(GenomeLength/NumberOfReferences,ReadLength)]). For example, for 3 gigaBase
+                        genome with 100,000 chromosomes/scaffolds, this is equal to 15."/>
+                <param argument="--genomeSAsparseD" type="integer" min="1" value="1" label="Suffix array sparsity"
+                    help="The distance between indices: use bigger numbers to decrease needed RAM at the cost of
+                        mapping speed reduction"/>
+            </when>
+        </conditional>
+    </inputs>
+
+    <!-- Single data_manager_json output. The command builds the STAR index
+    inside this dataset's extra_files_path and writes the JSON description
+    to the dataset itself. -->
+    <outputs>
+        <data name="out_file" format="data_manager_json"/>
+    </outputs>
+
+    <!-- Builds an index for the phiX174 test genome without a gene model
+    and matches the resulting JSON against a regular expression file. Only
+    inputs the tool actually declares are set; the name is left empty so
+    the expected JSON carries the FASTA entry's own name. -->
+    <tests>
+        <test>
+            <param name="all_fasta_source" value="phiX174"/>
+            <conditional name="GTFconditional">
+                <param name="GTFselect" value="withoutGTF"/>
+            </conditional>
+            <output name="out_file" file="test_star_01.data_manager_json" compare="re_match"/>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+.. class:: infomark
+
+*What it does*
+
+This is a Galaxy data manager tool for the gap-aware RNA aligner STAR.
+
+This version of the tool builds STAR indices of the format first introduced
+with STAR version @IDX_VERSION@.
+
+Please read the fine manual - that and the google group are the places to learn about the options above.
+
+*Memory requirements*
+
+To run efficiently, RNA-STAR requires enough free memory to
+hold the SA-indexed reference genome in RAM. For Human Genome hg19 this
+index is about 27GB and running RNA-STAR requires approximately ~30GB of RAM.
+For custom genomes, the rule of thumb is to multiply the size of the
+reference FASTA file by 9 to estimate the required amount of RAM.
+
+*Note on sjdbOverhang*
+
+From https://groups.google.com/forum/#!topic/rna-star/h9oh10UlvhI::
+
+  James is right, using large enough --sjdbOverhang is safer and should not generally cause any problems with reads of varying length. If your reads are very short, <50b, then I would strongly recommend using optimum --sjdbOverhang=mateLength-1
+  By mate length I mean the length of one of the ends of the read, i.e. it's 100 for 2x100b PE or 1x100b SE. For longer reads you can simply use generic --sjdbOverhang 100.
+  It is a bit confusing because of the way I named this parameter. --sjdbOverhang Noverhang is only used at the genome generation step  for constructing the reference sequence out of the annotations.
+  Basically, the Noverhang exonic bases from the donor site and Noverhang exonic bases from the acceptor site are spliced together for each of the junctions, and these spliced sequences are added to the genome sequence.
+
+  At the mapping stage, the reads are aligned to both genomic and splice sequences simultaneously. If a read maps to one of spliced sequences and crosses the "junction" in the middle of it, the coordinates of two pspliced pieces are translated back to genomic space and added to the collection of mapped pieces, which are then all "stitched" together to form the final alignment. Since in the process of "maximal mapped length" search the read is split into pieces of no longer than --seedSearchStartLmax (=50 by default) bases, even if the read (mate) is longer than --sjdbOverhang, it can still be mapped to the spliced reference, as long as --sjdbOverhang > --seedSearchStartLmax.
+
+  Cheers
+  Alex
+
+*Note on gene model requirements for splice junctions*
+
+From https://groups.google.com/forum/#!msg/rna-star/3Y_aaTuzBrE/lUylTB8h5vMJ::
+
+    When you generate a genome with annotations, you need to specify --sjdbOverhang value, which ideally should be equal to (oneMateLength-1), or you could use a generic value of ~100.
+
+    Your gtf lines look fine to me. STAR needs 3 features from a GTF file:
+    1. Chromosome names in col.1 that agree with chromosome names in genome .fasta files. If you have "chr2L" names in the genome .fasta files, and "2L" in the .gtf file, then you need to use --sjdbGTFchrPrefix chr option.
+    2. 'exon' in col.3 for the exons of all transcripts (this name can be changed with --sjdbGTFfeatureExon)
+    3. 'transcript_id' attribute that assigns each exon to a transcript (--this name can be changed with --sjdbGTFtagExonParentTranscript)
+
+    Cheers
+    Alex
+
+**Notice:** If you leave the index name blank, the display name of the selected FASTA entry is used instead.
+    ]]></help>
+    <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Tue May 25 15:14:56 2021 +0000
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+<!-- Registers the STAR index builder tool as a data manager and describes
+how the row it reports is installed into the rnastar_index2x_versioned data
+table: the index directory is moved below GALAXY_DATA_MANAGER_DATA_PATH and
+the stored path value is rewritten to that location as an absolute path. -->
+<data_managers>
+    <data_manager tool_file="data_manager/rna_star_index_builder.xml" id="rna_star_index_builder_data_manager_custom" version="0.0.7">
+        <data_table name="rnastar_index2x_versioned">
+            <output>
+                <column name="value" />
+                <column name="dbkey" />
+                <column name="name" />
+                <column name="path" output_ref="out_file" >
+                    <move type="directory" relativize_symlinks="True">
+                        <!-- A <source>${path}</source> element is deliberately
+                            left out: without a source, as here for
+                            type=directory, the move refers to the default base,
+                            which is out_file.extra_files_path.
+                        -->
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">rnastar/${version}/${dbkey}/${value}/${path}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/rnastar/${version}/${dbkey}/${value}/${path}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+                <column name="with_gene_model" />
+                <column name="version" />
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/all_fasta.loc	Tue May 25 15:14:56 2021 +0000
@@ -0,0 +1,19 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id>	<dbkey>		<display_name>	<file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3	apiMel3	Honeybee (Apis mellifera): apiMel3		/path/to/genome/apiMel3/apiMel3.fa
+#hg19canon	hg19		Human (Homo sapiens): hg19 Canonical		/path/to/genome/hg19/hg19canon.fa
+#hg19full	hg19		Human (Homo sapiens): hg19 Full			/path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#
+phiX174	phiX174	phiX174	${__HERE__}/phiX174.fasta
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/phiX174.fasta	Tue May 25 15:14:56 2021 +0000
@@ -0,0 +1,79 @@
+>phiX174
+GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTT
+GATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAA
+ATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTG
+TCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTA
+GATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATC
+TGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTT
+TCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTT
+CGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCT
+TGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCG
+TCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTAC
+GGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTA
+CGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCAGAAGGAG
+TGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACT
+AAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGC
+CCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCA
+TCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC
+TCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTA
+CTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAA
+GGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTT
+GGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACA
+ACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGC
+TCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTT
+TCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGC
+ATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCGTGATGTTATTTCTTCATTTGGAGGTAAAAC
+CTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTT
+GATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGC
+CGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGAC
+TAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTG
+TATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGT
+TTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGA
+AGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGAT
+TATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTT
+ATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAAC
+GCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGC
+TTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGT
+TCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTA
+TATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTG
+TCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGC
+CTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTG
+AATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGC
+CGGGCAATAATGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGT
+TTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTG
+CTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAA
+AGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCT
+GGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTG
+GTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGA
+TAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTAT
+CTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGG
+TTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGA
+GATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGAC
+CAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTA
+TGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCA
+AACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGAC
+TTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTT
+CTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGA
+TACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCG
+TCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTT
+CTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTAT
+TGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGC
+ATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATG
+TTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGA
+ATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGG
+GACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCC
+CTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATT
+GCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTACTATTCAGCGTTTGATGAATGCAATGCGACAG
+GCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTT
+ATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCG
+CAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGC
+CGTCTTCATTTCCATGCGGTGCATTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTC
+GTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCAT
+CGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAG
+CCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATA
+TGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACT
+TCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTG
+TCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGC
+AGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACC
+TGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCA
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_star_01.data_manager_json	Tue May 25 15:14:56 2021 +0000
@@ -0,0 +1,1 @@
+{"data_tables": {"rnastar_index2x_versioned": \[{"dbkey": "phiX174", "name": "phiX174", "path": ".*", "value": "phiX174", "version": "2.7.4a", "with_gene_model": "0"}\]}}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/all_fasta.loc.sample	Tue May 25 15:14:56 2021 +0000
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id>	<dbkey>	<display_name>	<file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3	apiMel3	Honeybee (Apis mellifera): apiMel3	/path/to/genome/apiMel3/apiMel3.fa
+#hg19canon	hg19	Human (Homo sapiens): hg19 Canonical	/path/to/genome/hg19/hg19canon.fa
+#hg19full	hg19	Human (Homo sapiens): hg19 Full	/path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/rnastar_index2x_versioned.loc.sample	Tue May 25 15:14:56 2021 +0000
@@ -0,0 +1,27 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of rna-star indexed sequences data files.
+#You will need to create these data files and then create a
+#rnastar_index2x_versioned.loc file similar to this one (store it in this
+#directory) that points to the directories in which those files are stored.
+#The rnastar_index2x_versioned.loc file has this format (longer white space
+#characters are TAB characters):
+#
+#<unique_build_id>   <dbkey>   <display_name>   <file_base_path>	<with_gene_model>	<version>
+#
+#The <with_gene_model> column should be 1 or 0, indicating whether the index
+#was built with annotations (i.e., --sjdbGTFfile and --sjdbOverhang were used)
+#or not.
+#
+#The <version> column indicates the STAR version that introduced the format of
+#the index, i.e., the oldest STAR version that could make use of the index.
+#
+#Note that STAR indices can become quite large. Consequently, it is only
+#advisable to create indices with annotations if it's known ahead of time that
+#(A) the annotations won't be frequently updated and (B) the read lengths used
+#will also rarely vary. If either of these is not the case, it's advisable to
+#create indices without annotations and then specify an annotation file and
+#maximum read length (minus 1) when running STAR.
+#
+#hg19   hg19    hg19 full   /mnt/galaxyIndices/genomes/hg19/rnastar	0	2.7.1a
+#hg19Ensembl   hg19Ensembl    hg19 full with Ensembl annotation   /mnt/galaxyIndices/genomes/hg19Ensembl/rnastar	1	2.7.1a
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Tue May 25 15:14:56 2021 +0000
@@ -0,0 +1,12 @@
+<tables>
+    <!-- all_fasta table: one entry per fasta file, loaded from tool-data/all_fasta.loc -->
+    <table name="all_fasta" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/all_fasta.loc" />
+    </table>
+    <!-- rnastar_index2x_versioned table: STAR index locations plus with_gene_model and version columns, loaded from tool-data/rnastar_index2x_versioned.loc -->
+    <table name="rnastar_index2x_versioned" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, name, path, with_gene_model, version</columns>
+        <file path="tool-data/rnastar_index2x_versioned.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Tue May 25 15:14:56 2021 +0000
@@ -0,0 +1,12 @@
+<tables>
+    <!-- Test-only table of fasta files, loaded from test-data/all_fasta.loc -->
+    <table name="all_fasta" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="${__HERE__}/test-data/all_fasta.loc" />
+    </table>
+    <!-- Test-only table of STAR indexes; the name must be rnastar_index2x_versioned, the table named in the expected data manager output -->
+    <table name="rnastar_index2x_versioned" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, name, path, with_gene_model, version</columns>
+        <file path="${__HERE__}/test-data/rnastar_index2_versioned.loc" />
+    </table>
+</tables>