Mercurial > repos > iuc > data_manager_star_index_builder

--- a/data_manager/macros.xml	Fri Sep 10 16:42:21 2021 +0000
+++ b/data_manager/macros.xml	Sun Apr 16 08:28:41 2023 +0000
@@ -1,11 +1,12 @@
 <macros>
-    <!-- REMEMBER to bump the version of rna_star_index_builder_data_manager
-    whenever you make changes to the following two version tokens!
+    <!-- REMEMBER to bump @IDX_VERSION_SUFFIX@
+    whenever you make changes to the @TOOL_VERSION@ token!
     The data manager uses a symlink to this macro file to keep the STAR and
-    the index versions in sync, but you should manually adjust the +galaxy
-    version number. -->
+    the index versions in sync, but you should manually update @IDX_VERSION_SUFFIX@ -->
     <!-- STAR version to be used -->
-    <token name="@VERSION@">2.7.8a</token>
+    <token name="@TOOL_VERSION@">2.7.10b</token>
+    <token name="@VERSION_SUFFIX@">3</token>
+    <token name="@PROFILE@">21.01</token>
     <!-- STAR index version compatible with this version of STAR
     This is the STAR version that introduced the index structure expected
     by the current version.
@@ -14,12 +15,14 @@
     or by looking for the versionGenome parameter in source/parametersDefault
     of STAR's source code -->
     <token name="@IDX_VERSION@">2.7.4a</token>
+    <token name="@IDX_VERSION_SUFFIX@">1</token>
     <token name="@IDX_DATA_TABLE@">rnastar_index2x_versioned</token>

     <xml name="requirements">
         <requirements>
-            <requirement type="package" version="@VERSION@">star</requirement>
-            <requirement type="package" version="1.9">samtools</requirement>
+            <requirement type="package" version="@TOOL_VERSION@">star</requirement>
+            <requirement type="package" version="1.16.1">samtools</requirement>
+            <requirement type="package" version="1.12">gzip</requirement>
             <yield />
         </requirements>
     </xml>
@@ -35,7 +38,7 @@
     </xml>

     <xml name="index_selection" token_with_gene_model="0">
-        <param argument="--genomeDir" name="genomeDir" type="select"
+        <param argument="--genomeDir" type="select"
         label="Select reference genome"
         help="If your genome of interest is not listed, contact the Galaxy team">
             <options from_data_table="@IDX_DATA_TABLE@">
@@ -55,37 +58,45 @@
             <citation type="doi">10.1093/bioinformatics/bts635</citation>
         </citations>
     </xml>
-    <xml name="@SJDBOPTIONS@" token_optional="true">
-         <param argument="--sjdbGTFfile" type="data" format="gff3,gtf" label="Gene model (gff3,gtf) file for splice junctions" optional="@OPTIONAL@" help="Exon junction information for mapping splices"/>
+    <xml name="SJDBOPTIONS">
+         <param argument="--sjdbGTFfile" type="data" format="gff3,gtf" label="Gene model (gff3,gtf) file for splice junctions" optional="false" help="Exon junction information for mapping splices"/>
          <param argument="--sjdbOverhang" type="integer" min="1" value="100" label="Length of the genomic sequence around annotated junctions" help="Used in constructing the splice junctions database. Ideal value is ReadLength-1"/>
     </xml>
     <xml name="dbKeyActions">
         <actions>
-            <conditional name="refGenomeSource.geneSource">
-                <when value="indexed">
-                    <action type="metadata" name="dbkey">
-                        <option type="from_data_table" name="@IDX_DATA_TABLE@" column="1" offset="0">
-                            <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
-                            <filter type="param_value" ref="refGenomeSource.GTFconditional.genomeDir" column="0"/>
-                        </option>
-                    </action>
-                </when>
-                <when value="history">
-                    <action type="metadata" name="dbkey">
-                        <option type="from_param" name="refGenomeSource.genomeFastaFiles" param_attribute="dbkey" />
-                    </action>
-                </when>
-            </conditional>
+            <expand macro="dbKeyAction"/>
         </actions>
     </xml>
+    <xml name="dbKeyAction"> <!-- Bare dbkey metadata action, switched on refGenomeSource.geneSource; expanded inside an actions element by dbKeyActions and outCountActions -->
+        <conditional name="refGenomeSource.geneSource">
+            <when value="indexed"> <!-- cached index: take column 1 of the @IDX_DATA_TABLE@ entry selected through genomeDir (column 0), ignoring entries that start with # -->
+                <action type="metadata" name="dbkey">
+                    <option type="from_data_table" name="@IDX_DATA_TABLE@" column="1" offset="0">
+                        <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
+                        <filter type="param_value" ref="refGenomeSource.GTFconditional.genomeDir" column="0"/>
+                    </option>
+                </action>
+            </when>
+            <when value="history"> <!-- reference from history: copy the dbkey attribute of the genomeFastaFiles input -->
+                <action type="metadata" name="dbkey">
+                    <option type="from_param" name="refGenomeSource.genomeFastaFiles" param_attribute="dbkey" />
+                </action>
+            </when>
+        </conditional>
+    </xml>
     <token name="@TEMPINDEX@"><![CDATA[
     ## Create temporary index for custom reference
     #if str($refGenomeSource.geneSource) == 'history':
+        #if $refGenomeSource.genomeFastaFiles.ext == "fasta"
+            ln -s '$refGenomeSource.genomeFastaFiles' refgenome.fa &&
+        #else
+            gunzip -c '$refGenomeSource.genomeFastaFiles' > refgenome.fa &&
+        #end if
         mkdir -p tempstargenomedir &&
         STAR
             --runMode genomeGenerate
             --genomeDir 'tempstargenomedir'
-            --genomeFastaFiles '${refGenomeSource.genomeFastaFiles}'
+            --genomeFastaFiles refgenome.fa
             ## Handle difference between indices with/without annotations
             #if 'GTFconditional' in $refGenomeSource:
                 ## GTFconditional exists only in STAR, but not STARsolo
@@ -109,6 +120,8 @@
                 --genomeSAindexNbases ${refGenomeSource.genomeSAindexNbases}
             #end if
             --runThreadN \${GALAXY_SLOTS:-4}
+            ## in bytes
+            --limitGenomeGenerateRAM \$((\${GALAXY_MEMORY_MB:-31000} * 1000000))
         &&
     #end if
     ]]></token>
@@ -121,17 +134,15 @@
     #else:
         '${refGenomeSource.GTFconditional.genomeDir.fields.path}'
         ## Handle difference between indices with/without annotations
-        #if str($refGenomeSource.GTFconditional.GTFselect) == 'without-gtf':
-            #if $refGenomeSource.GTFconditional.sjdbGTFfile:
-                --sjdbOverhang $refGenomeSource.GTFconditional.sjdbOverhang
-                --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}'
-                #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3':
-                    --sjdbGTFtagExonParentTranscript Parent
-                #end if
+        #if str($refGenomeSource.GTFconditional.GTFselect) == 'without-gtf-with-gtf':
+            --sjdbOverhang $refGenomeSource.GTFconditional.sjdbOverhang
+            --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}'
+            #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3':
+                --sjdbGTFtagExonParentTranscript Parent
             #end if
         #end if
-        #end if
-        ]]></token>
+    #end if
+    ]]></token>
     <token name="@READSHANDLING@" ><![CDATA[
     ## Check that the input pairs are of the same type
     ## otherwise STARsolo will run for a long time and then error out.
@@ -161,8 +172,13 @@
         @FASTQ_GZ_OPTION@
     #end if
     ]]></token>
+    <token name="@LIMITS@" ><![CDATA[
+        --limitOutSJoneRead $getVar('algo.params.junction_limits.limitOutSJoneRead', $getVar('solo.junction_limits.limitOutSJoneRead', 1000))
+        --limitOutSJcollapsed $getVar('algo.params.junction_limits.limitOutSJcollapsed', $getVar('solo.junction_limits.limitOutSJcollapsed', 1000000))
+        --limitSjdbInsertNsj $getVar('algo.params.junction_limits.limitSjdbInsertNsj', $getVar('solo.junction_limits.limitSjdbInsertNsj', 1000000))
+    ]]></token> <!-- @LIMITS@ (above): emits the three junction limit options; each value is looked up under algo.params.junction_limits first, then solo.junction_limits, and finally falls back to the literal default (1000 / 1000000 / 1000000) -->
     <xml name="ref_selection">
-        <param argument="--genomeFastaFiles" type="data" format="fasta" label="Select a reference genome" />
+        <param argument="--genomeFastaFiles" type="data" format="fasta,fasta.gz" label="Select a reference genome" />
           <param argument="--genomeSAindexNbases" type="integer" min="2" max="16" value="14" label="Length of the SA pre-indexing string" help="Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1)"/>
     </xml>
     <xml name="stdio" >
@@ -206,7 +222,7 @@
         </conditional>
     </xml>
     <xml name="umidedup_options">
-        <option value="1MM_All" selected="true">Collapse all UMIs with 1 mismatch distance to each other</option>
+        <option value="1MM_All" selected="true">Collapse all UMIs with 1 mismatch distance to each other (1MM_All)</option>
         <option value="1MM_Directional_UMItools" >Directional method from the UMI-tool</option>
         <option value="1MM_Directional" >Directional with stringent UMI deduplication</option>
     </xml>
@@ -218,12 +234,12 @@
     </xml>
     <xml name="cb_match_wl_common">
         <option value="Exact" >Exact</option>
-        <option value="1MM" >Single match</option>
+        <option value="1MM" >Single match (1MM)</option>
     </xml>
     <xml name="cb_match_wl_cellranger">
-        <option value="1MM_multi" selected="true" >Multiple matches (CellRanger 2)</option>
-        <option value="1MM_multi_pseudocounts" >Multiple matches (CellRanger 3)</option>
-        <option value="1MM_multi_Nbase_pseudocounts" >Multimatching to WL is allowed for CBs with N-bases (CellRanger 3)</option>
+        <option value="1MM_multi" selected="true" >Multiple matches (CellRanger 2, 1MM_multi)</option>
+        <option value="1MM_multi_pseudocounts" >Multiple matches (CellRanger 3, 1MM_multi_pseudocounts)</option>
+        <option value="1MM_multi_Nbase_pseudocounts" >Multimatching to WL is allowed for CBs with N-bases (CellRanger 3, 1MM_multi_Nbase_pseudocounts)</option>
     </xml>
     <xml name="solo_adapter_params">
         <param argument="--soloAdapterSequence" type="text" value="-" label="Adapter sequence to anchor barcodes." >
@@ -245,4 +261,153 @@
             <option value="None" >No adapter clipping</option>
         </param>
     </xml>
+    <xml name="common_SAM_attributes"> <!-- Reusable option list of SAM attribute tags; NH, HI, AS and nM are preselected, the remaining four are opt-in -->
+        <option value="NH" selected="true">NH (number of reported alignments/hits for the read)</option>
+        <option value="HI" selected="true">HI (query hit index)</option>
+        <option value="AS" selected="true">AS (local alignment score)</option>
+        <option value="nM" selected="true">nM (number of mismatches per (paired) alignment)</option>
+        <option value="NM">NM (edit distance of the aligned read to the reference)</option>
+        <option value="MD">MD (string for mismatching positions)</option>
+        <option value="jM">jM (intron motifs for all junctions)</option>
+        <option value="jI">jI (1-based start and end of introns for all junctions)</option>
+    </xml>
+    <xml name="limits"> <!-- Collapsed "Junction Limits" section with three integer inputs; the @LIMITS@ token reads them through the junction_limits section name, and the default values here match that token's fallbacks -->
+        <section name="junction_limits" title="Junction Limits" expanded="false">
+            <param argument="--limitOutSJoneRead" type="integer" min="1" value="1000" label="Maximum number of junctions for one read (including all multimappers)" />
+            <param argument="--limitOutSJcollapsed" type="integer" min="1" value="1000000" label="Maximum number of collapsed junctions" />
+            <param argument="--limitSjdbInsertNsj" type="integer" min="0" value="1000000" label="Maximum number of inserts to be inserted into the genome on the fly." />
+        </section>
+    </xml>
+    <xml name="outCountActions"> <!-- Output actions block: sets the column_names metadata to the four names listed below and reuses dbKeyAction to assign the dbkey -->
+        <actions>
+            <action name="column_names" type="metadata" default="GeneID,Counts_unstrand,Counts_firstStrand,Counts_secondStrand" />
+            <expand macro="dbKeyAction"/>
+        </actions>
+    </xml>
+    <xml name="outWig"> <!-- Conditional that selects the coverage output type (None, bedGraph or wiggle); both coverage branches expose the same inputs through outWigParams -->
+        <conditional name="outWig">
+            <param name="outWigType" type="select" label="Compute coverage">
+                <option value="None">No coverage</option>
+                <option value="bedGraph">Yes in bedgraph format</option>
+                <option value="wiggle">Yes in wiggle format</option>
+            </param>
+            <when value="None">
+                <!-- Hidden placeholder needed for output filtering: the filters in outWigOutputs reference outWig['outWigStrand'], so the key is defined in this branch as well -->
+                <param name="outWigStrand" type="hidden" value="false" />
+            </when>
+            <when value="bedGraph">
+                <expand macro="outWigParams"/>
+            </when>
+            <when value="wiggle">
+                <expand macro="outWigParams"/>
+            </when>
+        </conditional>
+    </xml>
+    <xml name="outWigParams"> <!-- Inputs shared by the bedGraph and wiggle branches of the outWig conditional; their values are placed on the command line by the @OUTWIG@ token -->
+        <param name="outWigTypeSecondWord" type="select" label="Input for coverage">
+            <option value="">Default (everything that mapped)</option>
+            <option value="read_5p">signal from only 5’ of the 1st read</option>
+            <option value="read2">signal from only 2nd read</option>
+        </param>
+        <param argument="--outWigStrand" type="boolean" truevalue="Stranded" falsevalue="Unstranded" checked="true" label="Generate a coverage for each strand (stranded coverage)"/>
+        <param argument="--outWigReferencesPrefix" type="text" value="-" label="prefix matching reference name" help="For example, set 'chr' if you mapped on an ensembl genome but you want to display on UCSC"/>
+        <param argument="--outWigNorm" type="boolean" truevalue="RPM" falsevalue="None" checked="true" label="Normalize coverage to million of mapped reads (RPM)"/>
+    </xml>
+    <token name="@OUTWIG@"><![CDATA[
+        #if str($outWig.outWigType) != 'None':
+            --outWigType '$outWig.outWigType' '$outWig.outWigTypeSecondWord'
+            --outWigStrand '$outWig.outWigStrand'
+            --outWigReferencesPrefix '$outWig.outWigReferencesPrefix'
+            --outWigNorm '$outWig.outWigNorm'
+        #end if
+    ]]></token> <!-- @OUTWIG@ (above): appends the four outWig options to the command line unless outWigType is None; outWigTypeSecondWord is passed as a second, possibly empty, quoted word after the type -->
+    <token name="@OUTWIGOUTPUTS@"><![CDATA[
+        #if str($outWig.outWigType) == "bedGraph":
+            && mv Signal.Unique.str1.out.bg Signal.Unique.str1.out
+            && mv Signal.UniqueMultiple.str1.out.bg Signal.UniqueMultiple.str1.out
+            #if str($outWig.outWigStrand) == "Stranded":
+                && mv Signal.Unique.str2.out.bg Signal.Unique.str2.out
+                && mv Signal.UniqueMultiple.str2.out.bg Signal.UniqueMultiple.str2.out
+            #end if
+        #elif str($outWig.outWigType) == "wiggle":
+            && mv Signal.Unique.str1.out.wig Signal.Unique.str1.out
+            && mv Signal.UniqueMultiple.str1.out.wig Signal.UniqueMultiple.str1.out
+            #if str($outWig.outWigStrand) == "Stranded":
+                && mv Signal.Unique.str2.out.wig Signal.Unique.str2.out
+                && mv Signal.UniqueMultiple.str2.out.wig Signal.UniqueMultiple.str2.out
+            #end if
+        #end if
+    ]]></token> <!-- @OUTWIGOUTPUTS@ (above): strips the .bg or .wig extension from the Signal.* coverage files so that their names match the from_work_dir paths used in outWigOutputs; the str2 files are renamed only when outWigStrand is Stranded -->
+    <xml name="outWigOutputs"> <!-- Four coverage outputs (unique and unique+multiple, for strand 1 and strand 2); each is bedgraph by default, switches to wig when outWigType is wiggle, and is hidden when outWigType is None; the strand 2 outputs additionally require outWigStrand -->
+        <data format="bedgraph" name="signal_unique_str1" label="${tool.name} on ${on_string}: Coverage Uniquely mapped strand 1" from_work_dir="Signal.Unique.str1.out">
+            <filter>outWig['outWigType'] != "None"</filter>
+            <expand macro="dbKeyActions" />
+            <change_format>
+                <when input="outWig.outWigType" value="wiggle" format="wig" />
+            </change_format>
+        </data>
+        <data format="bedgraph" name="signal_uniquemultiple_str1" label="${tool.name} on ${on_string}: Coverage Uniquely + Multiple mapped strand 1" from_work_dir="Signal.UniqueMultiple.str1.out">
+            <filter>outWig['outWigType'] != "None"</filter>
+            <expand macro="dbKeyActions" />
+            <change_format>
+                <when input="outWig.outWigType" value="wiggle" format="wig" />
+            </change_format>
+        </data>
+        <data format="bedgraph" name="signal_unique_str2" label="${tool.name} on ${on_string}: Coverage Uniquely mapped strand 2" from_work_dir="Signal.Unique.str2.out">
+            <filter>outWig['outWigType'] != "None" and outWig['outWigStrand']</filter>
+            <expand macro="dbKeyActions" />
+            <change_format>
+                <when input="outWig.outWigType" value="wiggle" format="wig" />
+            </change_format>
+        </data>
+        <data format="bedgraph" name="signal_uniquemultiple_str2" label="${tool.name} on ${on_string}: Coverage Uniquely + Multiple mapped strand 2" from_work_dir="Signal.UniqueMultiple.str2.out">
+            <filter>outWig['outWigType'] != "None" and outWig['outWigStrand']</filter>
+            <expand macro="dbKeyActions" />
+            <change_format>
+                <when input="outWig.outWigType" value="wiggle" format="wig" />
+            </change_format>
+        </data>
+    </xml>
+    <xml name="quantMode"> <!-- Conditional for per gene / per transcript quantification output; both branches that include TranscriptomeSAM expose the same quantTranscriptomeBan switch, the other two branches have no extra inputs -->
+        <conditional name="quantmode_output">
+            <param argument="--quantMode" type="select"
+            label="Per gene/transcript output"
+            help="STAR can provide analysis results not only with respect to the reference genome, but also with respect to genes and transcripts described by a gene model. Note: This functionality requires either the selection above of a cached index with a gene model, or a gene model provided alongside the index/reference genome in GTF or GFF3 format!">
+                <option value="-">No per gene or transcript output</option>
+                <option value="GeneCounts">Per gene read counts (GeneCounts)</option>
+                <option value="TranscriptomeSAM">Transcript-based BAM output (TranscriptomeSAM)</option>
+                <option value="TranscriptomeSAM GeneCounts">Both per gene read counts and transcript-based BAM output (TranscriptomeSAM GeneCounts)</option>
+            </param>
+            <when value="-" />
+            <when value="GeneCounts" />
+            <when value="TranscriptomeSAM"> <!-- the same param is repeated in the combined branch below; keep the two copies in sync -->
+                <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend"
+                label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?"
+                help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled." />
+            </when>
+            <when value="TranscriptomeSAM GeneCounts">
+                <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend"
+                label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?"
+                help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled." />
+            </when>
+        </conditional>
+    </xml>
+    <xml name="quantModeNoGTF"> <!-- Variant of quantMode with the same conditional and param names but "-" as the only choice, so no per gene or transcript output can be requested -->
+        <conditional name="quantmode_output">
+            <param argument="--quantMode" type="select"
+            label="Per gene/transcript output">
+                <option value="-">No per gene or transcript output as no GTF was provided</option>
+            </param>
+            <when value="-" />
+        </conditional>
+    </xml>
+    <xml name="outSAMmapqUnique">
+        <!-- Integer input for the MAPQ assigned to uniquely mapped reads; defaults to 60 instead of STAR's 255. MAPQ 255 is the default in STAR (coming from TopHat behaviour and compatibility with Cufflinks) but it is a problematic value:
+        - according to the SAM/BAM specs it means "undefined";
+        - using 255 as the max MAPQ causes problems with modern downstream tools like mutect2: https://sites.duke.edu/workblog/2021/08/18/star-rnaseq-gatk-mutect2/ and 60 has become an unofficial replacement for 255. -->
+        <param argument="--outSAMmapqUnique" type="integer" value="60" min="0" max="255"
+        label="MAPQ value for unique mappers"
+        help="STAR bases the mapping quality scores of alignment records in its BAM output on the number of alternative mappings for the read. If a read maps to multiple locations on the reference genome, the following MAPQ scoring scheme is
used: >=5 mappings => MAPQ=0; 3-4 mappings => MAPQ=1; 2 mappings => MAPQ=3. This setting lets you control the MAPQ used for reads mapped to a single location. Set to 255 for compatibility with Cufflinks (default in STAR) but keep to 60 for modern downstream tools like mutect2." />
+    </xml>
 </macros>
--- a/data_manager/rna_star_index_builder.xml	Fri Sep 10 16:42:21 2021 +0000
+++ b/data_manager/rna_star_index_builder.xml	Sun Apr 16 08:28:41 2023 +0000
@@ -1,4 +1,4 @@
-<tool id="rna_star_index_builder_data_manager" name="rnastar index versioned" tool_type="manage_data" version="@IDX_VERSION@" profile="19.05">
+<tool id="rna_star_index_builder_data_manager" name="rnastar index versioned" tool_type="manage_data" version="@IDX_VERSION@+galaxy@IDX_VERSION_SUFFIX@" profile="19.05">
     <description>builder</description>

     <macros>
--- a/data_manager_conf.xml	Fri Sep 10 16:42:21 2021 +0000
+++ b/data_manager_conf.xml	Sun Apr 16 08:28:41 2023 +0000
@@ -1,6 +1,6 @@
 <?xml version="1.0"?>
 <data_managers>
-    <data_manager tool_file="data_manager/rna_star_index_builder.xml" id="rna_star_index_builder" version="0.0.7">
+    <data_manager tool_file="data_manager/rna_star_index_builder.xml" id="rna_star_index_builder">
         <data_table name="rnastar_index2x_versioned">
             <output>
                 <column name="value" />