Mercurial > repos > iuc > data_manager_star_index_builder
changeset 11:c6f957c373d3 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_star_index_builder commit 096286097ed5cdf189a1b68c3fc34d10f4142e54
| author | iuc | 
|---|---|
| date | Sun, 16 Apr 2023 08:28:41 +0000 | 
| parents | f639ff7dea45 | 
| children | 66a8edd52132 | 
| files | data_manager/macros.xml data_manager/rna_star_index_builder.xml data_manager_conf.xml | 
| diffstat | 3 files changed, 208 insertions(+), 43 deletions(-) [+] | 
line wrap: on
 line diff
--- a/data_manager/macros.xml Fri Sep 10 16:42:21 2021 +0000 +++ b/data_manager/macros.xml Sun Apr 16 08:28:41 2023 +0000 @@ -1,11 +1,12 @@ <macros> - <!-- REMEMBER to bump the version of rna_star_index_builder_data_manager - whenever you make changes to the following two version tokens! + <!-- REMEMBER to bump the version of @IDX_VERSION_SUFFIX@ + whenever you make changes to the @TOOL_VERSION@ token! The data manager uses a symlink to this macro file to keep the STAR and - the index versions in sync, but you should manually adjust the +galaxy - version number. --> + the index versions in sync, but you should manually update @IDX_VERSION_SUFFIX@ --> <!-- STAR version to be used --> - <token name="@VERSION@">2.7.8a</token> + <token name="@TOOL_VERSION@">2.7.10b</token> + <token name="@VERSION_SUFFIX@">3</token> + <token name="@PROFILE@">21.01</token> <!-- STAR index version compatible with this version of STAR This is the STAR version that introduced the index structure expected by the current version. 
@@ -14,12 +15,14 @@ or by looking for the versionGenome parameter in source/parametersDefault of STAR's source code --> <token name="@IDX_VERSION@">2.7.4a</token> + <token name="@IDX_VERSION_SUFFIX@">1</token> <token name="@IDX_DATA_TABLE@">rnastar_index2x_versioned</token> <xml name="requirements"> <requirements> - <requirement type="package" version="@VERSION@">star</requirement> - <requirement type="package" version="1.9">samtools</requirement> + <requirement type="package" version="@TOOL_VERSION@">star</requirement> + <requirement type="package" version="1.16.1">samtools</requirement> + <requirement type="package" version="1.12">gzip</requirement> <yield /> </requirements> </xml> @@ -35,7 +38,7 @@ </xml> <xml name="index_selection" token_with_gene_model="0"> - <param argument="--genomeDir" name="genomeDir" type="select" + <param argument="--genomeDir" type="select" label="Select reference genome" help="If your genome of interest is not listed, contact the Galaxy team"> <options from_data_table="@IDX_DATA_TABLE@"> @@ -55,37 +58,45 @@ <citation type="doi">10.1093/bioinformatics/bts635</citation> </citations> </xml> - <xml name="@SJDBOPTIONS@" token_optional="true"> - <param argument="--sjdbGTFfile" type="data" format="gff3,gtf" label="Gene model (gff3,gtf) file for splice junctions" optional="@OPTIONAL@" help="Exon junction information for mapping splices"/> + <xml name="SJDBOPTIONS"> + <param argument="--sjdbGTFfile" type="data" format="gff3,gtf" label="Gene model (gff3,gtf) file for splice junctions" optional="false" help="Exon junction information for mapping splices"/> <param argument="--sjdbOverhang" type="integer" min="1" value="100" label="Length of the genomic sequence around annotated junctions" help="Used in constructing the splice junctions database. 
Ideal value is ReadLength-1"/> </xml> <xml name="dbKeyActions"> <actions> - <conditional name="refGenomeSource.geneSource"> - <when value="indexed"> - <action type="metadata" name="dbkey"> - <option type="from_data_table" name="@IDX_DATA_TABLE@" column="1" offset="0"> - <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/> - <filter type="param_value" ref="refGenomeSource.GTFconditional.genomeDir" column="0"/> - </option> - </action> - </when> - <when value="history"> - <action type="metadata" name="dbkey"> - <option type="from_param" name="refGenomeSource.genomeFastaFiles" param_attribute="dbkey" /> - </action> - </when> - </conditional> + <expand macro="dbKeyAction"/> </actions> </xml> + <xml name="dbKeyAction"> + <conditional name="refGenomeSource.geneSource"> + <when value="indexed"> + <action type="metadata" name="dbkey"> + <option type="from_data_table" name="@IDX_DATA_TABLE@" column="1" offset="0"> + <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/> + <filter type="param_value" ref="refGenomeSource.GTFconditional.genomeDir" column="0"/> + </option> + </action> + </when> + <when value="history"> + <action type="metadata" name="dbkey"> + <option type="from_param" name="refGenomeSource.genomeFastaFiles" param_attribute="dbkey" /> + </action> + </when> + </conditional> + </xml> <token name="@TEMPINDEX@"><![CDATA[ ## Create temporary index for custom reference #if str($refGenomeSource.geneSource) == 'history': + #if $refGenomeSource.genomeFastaFiles.ext == "fasta" + ln -s '$refGenomeSource.genomeFastaFiles' refgenome.fa && + #else + gunzip -c '$refGenomeSource.genomeFastaFiles' > refgenome.fa && + #end if mkdir -p tempstargenomedir && STAR --runMode genomeGenerate --genomeDir 'tempstargenomedir' - --genomeFastaFiles '${refGenomeSource.genomeFastaFiles}' + --genomeFastaFiles refgenome.fa ## Handle difference between indices with/without annotations #if 'GTFconditional' in $refGenomeSource: ## 
GTFconditional exists only in STAR, but not STARsolo @@ -109,6 +120,8 @@ --genomeSAindexNbases ${refGenomeSource.genomeSAindexNbases} #end if --runThreadN \${GALAXY_SLOTS:-4} + ## in bytes + --limitGenomeGenerateRAM \$((\${GALAXY_MEMORY_MB:-31000} * 1000000)) && #end if ]]></token> @@ -121,17 +134,15 @@ #else: '${refGenomeSource.GTFconditional.genomeDir.fields.path}' ## Handle difference between indices with/without annotations - #if str($refGenomeSource.GTFconditional.GTFselect) == 'without-gtf': - #if $refGenomeSource.GTFconditional.sjdbGTFfile: - --sjdbOverhang $refGenomeSource.GTFconditional.sjdbOverhang - --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}' - #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3': - --sjdbGTFtagExonParentTranscript Parent - #end if + #if str($refGenomeSource.GTFconditional.GTFselect) == 'without-gtf-with-gtf': + --sjdbOverhang $refGenomeSource.GTFconditional.sjdbOverhang + --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}' + #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3': + --sjdbGTFtagExonParentTranscript Parent #end if #end if - #end if - ]]></token> + #end if + ]]></token> <token name="@READSHANDLING@" ><![CDATA[ ## Check that the input pairs are of the same type ## otherwise STARsolo will run for a long time and then error out. 
@@ -161,8 +172,13 @@ @FASTQ_GZ_OPTION@ #end if ]]></token> + <token name="@LIMITS@" ><![CDATA[ + --limitOutSJoneRead $getVar('algo.params.junction_limits.limitOutSJoneRead', $getVar('solo.junction_limits.limitOutSJoneRead', 1000)) + --limitOutSJcollapsed $getVar('algo.params.junction_limits.limitOutSJcollapsed', $getVar('solo.junction_limits.limitOutSJcollapsed', 1000000)) + --limitSjdbInsertNsj $getVar('algo.params.junction_limits.limitSjdbInsertNsj', $getVar('solo.junction_limits.limitSjdbInsertNsj', 1000000)) + ]]></token> <xml name="ref_selection"> - <param argument="--genomeFastaFiles" type="data" format="fasta" label="Select a reference genome" /> + <param argument="--genomeFastaFiles" type="data" format="fasta,fasta.gz" label="Select a reference genome" /> <param argument="--genomeSAindexNbases" type="integer" min="2" max="16" value="14" label="Length of the SA pre-indexing string" help="Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. 
For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1)"/> </xml> <xml name="stdio" > @@ -206,7 +222,7 @@ </conditional> </xml> <xml name="umidedup_options"> - <option value="1MM_All" selected="true">Collapse all UMIs with 1 mismatch distance to each other</option> + <option value="1MM_All" selected="true">Collapse all UMIs with 1 mismatch distance to each other (1MM_All)</option> <option value="1MM_Directional_UMItools" >Directional method from the UMI-tool</option> <option value="1MM_Directional" >Directional with stringent UMI deduplication</option> </xml> @@ -218,12 +234,12 @@ </xml> <xml name="cb_match_wl_common"> <option value="Exact" >Exact</option> - <option value="1MM" >Single match</option> + <option value="1MM" >Single match (1MM)</option> </xml> <xml name="cb_match_wl_cellranger"> - <option value="1MM_multi" selected="true" >Multiple matches (CellRanger 2)</option> - <option value="1MM_multi_pseudocounts" >Multiple matches (CellRanger 3)</option> - <option value="1MM_multi_Nbase_pseudocounts" >Multimatching to WL is allowed for CBs with N-bases (CellRanger 3)</option> + <option value="1MM_multi" selected="true" >Multiple matches (CellRanger 2, 1MM_multi)</option> + <option value="1MM_multi_pseudocounts" >Multiple matches (CellRanger 3, 1MM_multi_pseudocounts)</option> + <option value="1MM_multi_Nbase_pseudocounts" >Multimatching to WL is allowed for CBs with N-bases (CellRanger 3, 1MM_multi_Nbase_pseudocounts)</option> </xml> <xml name="solo_adapter_params"> <param argument="--soloAdapterSequence" type="text" value="-" label="Adapter sequence to anchor barcodes." 
> @@ -245,4 +261,153 @@ <option value="None" >No adapter clipping</option> </param> </xml> + <xml name="common_SAM_attributes"> + <option value="NH" selected="true">NH (number of reported alignments/hits for the read)</option> + <option value="HI" selected="true">HI (query hit index)</option> + <option value="AS" selected="true">AS (local alignment score)</option> + <option value="nM" selected="true">nM (number of mismatches per (paired) alignment)</option> + <option value="NM">NM (edit distance of the aligned read to the reference)</option> + <option value="MD">MD (string for mismatching positions)</option> + <option value="jM">jM (intron motifs for all junctions)</option> + <option value="jI">jI (1-based start and end of introns for all junctions)</option> + </xml> + <xml name="limits"> + <section name="junction_limits" title="Junction Limits" expanded="false"> + <param argument="--limitOutSJoneRead" type="integer" min="1" value="1000" label="Maximum number of junctions for one read (including all multimappers)" /> + <param argument="--limitOutSJcollapsed" type="integer" min="1" value="1000000" label="Maximum number of collapsed junctions" /> + <param argument="--limitSjdbInsertNsj" type="integer" min="0" value="1000000" label="Maximum number of inserts to be inserted into the genome on the fly." 
/> + </section> + </xml> + <xml name="outCountActions"> + <actions> + <action name="column_names" type="metadata" default="GeneID,Counts_unstrand,Counts_firstStrand,Counts_secondStrand" /> + <expand macro="dbKeyAction"/> + </actions> + </xml> + <xml name="outWig"> + <conditional name="outWig"> + <param name="outWigType" type="select" label="Compute coverage"> + <option value="None">No coverage</option> + <option value="bedGraph">Yes in bedgraph format</option> + <option value="wiggle">Yes in wiggle format</option> + </param> + <when value="None"> + <!-- This is necessary for the filtering of output --> + <param name="outWigStrand" type="hidden" value="false" /> + </when> + <when value="bedGraph"> + <expand macro="outWigParams"/> + </when> + <when value="wiggle"> + <expand macro="outWigParams"/> + </when> + </conditional> + </xml> + <xml name="outWigParams"> + <param name="outWigTypeSecondWord" type="select" label="Input for coverage"> + <option value="">Default (everything that mapped)</option> + <option value="read_5p">signal from only 5’ of the 1st read</option> + <option value="read2">signal from only 2nd read</option> + </param> + <param argument="--outWigStrand" type="boolean" truevalue="Stranded" falsevalue="Unstranded" checked="true" label="Generate a coverage for each strand (stranded coverage)"/> + <param argument="--outWigReferencesPrefix" type="text" value="-" label="prefix matching reference name" help="For example, set 'chr' if you mapped on an ensembl genome but you want to display on UCSC"/> + <param argument="--outWigNorm" type="boolean" truevalue="RPM" falsevalue="None" checked="true" label="Normalize coverage to million of mapped reads (RPM)"/> + </xml> + <token name="@OUTWIG@"><![CDATA[ + #if str($outWig.outWigType) != 'None': + --outWigType '$outWig.outWigType' '$outWig.outWigTypeSecondWord' + --outWigStrand '$outWig.outWigStrand' + --outWigReferencesPrefix '$outWig.outWigReferencesPrefix' + --outWigNorm '$outWig.outWigNorm' + #end if + 
]]></token> + <token name="@OUTWIGOUTPUTS@"><![CDATA[ + #if str($outWig.outWigType) == "bedGraph": + && mv Signal.Unique.str1.out.bg Signal.Unique.str1.out + && mv Signal.UniqueMultiple.str1.out.bg Signal.UniqueMultiple.str1.out + #if str($outWig.outWigStrand) == "Stranded": + && mv Signal.Unique.str2.out.bg Signal.Unique.str2.out + && mv Signal.UniqueMultiple.str2.out.bg Signal.UniqueMultiple.str2.out + #end if + #elif str($outWig.outWigType) == "wiggle": + && mv Signal.Unique.str1.out.wig Signal.Unique.str1.out + && mv Signal.UniqueMultiple.str1.out.wig Signal.UniqueMultiple.str1.out + #if str($outWig.outWigStrand) == "Stranded": + && mv Signal.Unique.str2.out.wig Signal.Unique.str2.out + && mv Signal.UniqueMultiple.str2.out.wig Signal.UniqueMultiple.str2.out + #end if + #end if + ]]></token> + <xml name="outWigOutputs"> + <data format="bedgraph" name="signal_unique_str1" label="${tool.name} on ${on_string}: Coverage Uniquely mapped strand 1" from_work_dir="Signal.Unique.str1.out"> + <filter>outWig['outWigType'] != "None"</filter> + <expand macro="dbKeyActions" /> + <change_format> + <when input="outWig.outWigType" value="wiggle" format="wig" /> + </change_format> + </data> + <data format="bedgraph" name="signal_uniquemultiple_str1" label="${tool.name} on ${on_string}: Coverage Uniquely + Multiple mapped strand 1" from_work_dir="Signal.UniqueMultiple.str1.out"> + <filter>outWig['outWigType'] != "None"</filter> + <expand macro="dbKeyActions" /> + <change_format> + <when input="outWig.outWigType" value="wiggle" format="wig" /> + </change_format> + </data> + <data format="bedgraph" name="signal_unique_str2" label="${tool.name} on ${on_string}: Coverage Uniquely mapped strand 2" from_work_dir="Signal.Unique.str2.out"> + <filter>outWig['outWigType'] != "None" and outWig['outWigStrand']</filter> + <expand macro="dbKeyActions" /> + <change_format> + <when input="outWig.outWigType" value="wiggle" format="wig" /> + </change_format> + </data> + <data format="bedgraph" 
name="signal_uniquemultiple_str2" label="${tool.name} on ${on_string}: Coverage Uniquely + Multiple mapped strand 2" from_work_dir="Signal.UniqueMultiple.str2.out"> + <filter>outWig['outWigType'] != "None" and outWig['outWigStrand']</filter> + <expand macro="dbKeyActions" /> + <change_format> + <when input="outWig.outWigType" value="wiggle" format="wig" /> + </change_format> + </data> + </xml> + <xml name="quantMode"> + <conditional name="quantmode_output"> + <param argument="--quantMode" type="select" + label="Per gene/transcript output" + help="STAR can provide analysis results not only with respect to the reference genome, but also with respect to genes and transcripts described by a gene model. Note: This functionality requires either the selection above of a cached index with a gene model, or a gene model provided alongside the index/reference genome in GTF or GFF3 format!"> + <option value="-">No per gene or transcript output</option> + <option value="GeneCounts">Per gene read counts (GeneCounts)</option> + <option value="TranscriptomeSAM">Transcript-based BAM output (TranscriptomeSAM)</option> + <option value="TranscriptomeSAM GeneCounts">Both per gene read counts and transcript-based BAM output (TranscriptomeSAM GeneCounts)</option> + </param> + <when value="-" /> + <when value="GeneCounts" /> + <when value="TranscriptomeSAM"> + <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend" + label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?" + help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled." 
 /> + </when> + <when value="TranscriptomeSAM GeneCounts"> + <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend" + label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?" + help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled." /> + </when> + </conditional> + </xml> + <xml name="quantModeNoGTF"> + <conditional name="quantmode_output"> + <param argument="--quantMode" type="select" + label="Per gene/transcript output"> + <option value="-">No per gene or transcript output as no GTF was provided</option> + </param> + <when value="-" /> + </conditional> + </xml> + <xml name="outSAMmapqUnique"> + <!-- MAPQ 255 is the default in STAR (coming from tophat behaviour and compatibility for Cufflinks) but it is a problematic value + - according to SAM/BAM specs it means "undefined". + - Using 255 as the max mapq causes problems with modern downstream tools like mutect2: https://sites.duke.edu/workblog/2021/08/18/star-rnaseq-gatk-mutect2/ and 60 has become an unofficial replacement for 255. --> + <param argument="--outSAMmapqUnique" type="integer" value="60" min="0" max="255" + label="MAPQ value for unique mappers" + help="STAR bases the mapping quality scores of alignment records in its BAM output on the number of alternative mappings for the read. If a read maps to multiple locations on the reference genome, the following MAPQ scoring scheme is +used: >=5 mappings => MAPQ=0; 3-4 mappings => MAPQ=1; 2 mappings => MAPQ=3. This setting lets you control the MAPQ used for reads mapped to a single location. 
 Set to 255 for compatibility with Cufflinks (default in STAR) but keep at 60 for modern downstream tools like mutect2." /> + </xml> </macros>
--- a/data_manager/rna_star_index_builder.xml Fri Sep 10 16:42:21 2021 +0000 +++ b/data_manager/rna_star_index_builder.xml Sun Apr 16 08:28:41 2023 +0000 @@ -1,4 +1,4 @@ -<tool id="rna_star_index_builder_data_manager" name="rnastar index versioned" tool_type="manage_data" version="@IDX_VERSION@" profile="19.05"> +<tool id="rna_star_index_builder_data_manager" name="rnastar index versioned" tool_type="manage_data" version="@IDX_VERSION@+galaxy@IDX_VERSION_SUFFIX@" profile="19.05"> <description>builder</description> <macros>
--- a/data_manager_conf.xml Fri Sep 10 16:42:21 2021 +0000 +++ b/data_manager_conf.xml Sun Apr 16 08:28:41 2023 +0000 @@ -1,6 +1,6 @@ <?xml version="1.0"?> <data_managers> - <data_manager tool_file="data_manager/rna_star_index_builder.xml" id="rna_star_index_builder" version="0.0.7"> + <data_manager tool_file="data_manager/rna_star_index_builder.xml" id="rna_star_index_builder"> <data_table name="rnastar_index2x_versioned"> <output> <column name="value" />
