Mercurial > repos > iuc > data_manager_star_index_builder
changeset 12:66a8edd52132 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_star_index_builder commit 57d05675396f0b44265fb4dbc8f9f891c6073219
| author | iuc | 
|---|---|
| date | Thu, 05 Dec 2024 06:47:38 +0000 | 
| parents | c6f957c373d3 | 
| children | |
| files | data_manager/macros.xml data_manager/rna_star_index_builder.py data_manager/rna_star_index_builder.xml data_manager_conf.xml test-data/rnastar_index2_versioned.loc test-data/rnastar_index2x_versioned.loc test-data/test_star_01.data_manager_json tool_data_table_conf.xml.test | 
| diffstat | 6 files changed, 210 insertions(+), 168 deletions(-) [+] | 
line wrap: on
 line diff
--- a/data_manager/macros.xml Sun Apr 16 08:28:41 2023 +0000 +++ b/data_manager/macros.xml Thu Dec 05 06:47:38 2024 +0000 @@ -4,8 +4,8 @@ The data manager uses a symlink to this macro file to keep the STAR and the index versions in sync, but you should manually update @IDX_VERSION_SUFFIX@ --> <!-- STAR version to be used --> - <token name="@TOOL_VERSION@">2.7.10b</token> - <token name="@VERSION_SUFFIX@">3</token> + <token name="@TOOL_VERSION@">2.7.11a</token> + <token name="@VERSION_SUFFIX@">1</token> <token name="@PROFILE@">21.01</token> <!-- STAR index version compatible with this version of STAR This is the STAR version that introduced the index structure expected @@ -15,18 +15,16 @@ or by looking for the versionGenome parameter in source/parametersDefault of STAR's source code --> <token name="@IDX_VERSION@">2.7.4a</token> - <token name="@IDX_VERSION_SUFFIX@">1</token> + <token name="@IDX_VERSION_SUFFIX@">3</token> <token name="@IDX_DATA_TABLE@">rnastar_index2x_versioned</token> - <xml name="requirements"> <requirements> <requirement type="package" version="@TOOL_VERSION@">star</requirement> - <requirement type="package" version="1.16.1">samtools</requirement> - <requirement type="package" version="1.12">gzip</requirement> - <yield /> + <requirement type="package" version="1.18">samtools</requirement> + <requirement type="package" version="1.13">gzip</requirement> + <yield/> </requirements> </xml> - <xml name="edam"> <edam_topics> <edam_topic>topic_3170</edam_topic> @@ -36,20 +34,16 @@ <edam_operation>operation_0292</edam_operation> </edam_operations> </xml> - <xml name="index_selection" token_with_gene_model="0"> - <param argument="--genomeDir" type="select" - label="Select reference genome" - help="If your genome of interest is not listed, contact the Galaxy team"> + <param argument="--genomeDir" type="select" label="Select reference genome" help="If your genome of interest is not listed, contact the Galaxy team"> <options from_data_table="@IDX_DATA_TABLE@"> - <filter type="static_value" column="4" value="@WITH_GENE_MODEL@" /> - <filter type="static_value" column="5" value="@IDX_VERSION@" /> - <filter type="sort_by" column="2" /> - <validator type="no_options" message="No indexes are available for the selected input dataset" /> + <filter type="static_value" column="4" value="@WITH_GENE_MODEL@"/> + <filter type="static_value" column="5" value="@IDX_VERSION@"/> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No indexes are available for the selected input dataset"/> </options> </param> </xml> - <token name="@FASTQ_GZ_OPTION@"> --readFilesCommand zcat </token> @@ -59,8 +53,9 @@ </citations> </xml> <xml name="SJDBOPTIONS"> - <param argument="--sjdbGTFfile" type="data" format="gff3,gtf" label="Gene model (gff3,gtf) file for splice junctions" optional="false" help="Exon junction information for mapping splices"/> - <param argument="--sjdbOverhang" type="integer" min="1" value="100" label="Length of the genomic sequence around annotated junctions" help="Used in constructing the splice junctions database. Ideal value is ReadLength-1"/> + <param argument="--sjdbGTFfile" type="data" format="gff3,gtf" label="Gene model (gff3,gtf) file for splice junctions" optional="false" help="Exon junction information for mapping splices"/> + <param argument="--sjdbGTFfeatureExon" type="text" value="exon" label="Elements to use from the gene model to use for splice junctions" help="By default and for almost all cases: 'exon', referring to finding junctions at the RNA splice sites. This can optionally be changed to allow splicing at other levels, such as 'gene', 'transcript', 'CDS'."/> + <param argument="--sjdbOverhang" type="integer" min="1" value="100" label="Length of the genomic sequence around annotated junctions" help="Used in constructing the splice junctions database. Ideal value is ReadLength-1"/> </xml> <xml name="dbKeyActions"> <actions> @@ -79,7 +74,7 @@ </when> <when value="history"> <action type="metadata" name="dbkey"> - <option type="from_param" name="refGenomeSource.genomeFastaFiles" param_attribute="dbkey" /> + <option type="from_param" name="refGenomeSource.genomeFastaFiles" param_attribute="dbkey"/> </action> </when> </conditional> @@ -103,15 +98,17 @@ #if str($refGenomeSource.GTFconditional.GTFselect) == 'with-gtf': --sjdbOverhang '${refGenomeSource.GTFconditional.sjdbOverhang}' --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}' + --sjdbGTFfeatureExon '${refGenomeSource.GTFconditional.sjdbGTFfeatureExon}' #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3': --sjdbGTFtagExonParentTranscript Parent #end if #end if #else: - ## ref genome selection is less complex for STARsolo cause + ## ref genome selection is less complex for STARsolo because ## with-gtf is mandatory there --sjdbOverhang '${refGenomeSource.sjdbOverhang}' --sjdbGTFfile '${refGenomeSource.sjdbGTFfile}' + --sjdbGTFfeatureExon '${refGenomeSource.sjdbGTFfeatureExon}' #if str($refGenomeSource.sjdbGTFfile.ext) == 'gff3': --sjdbGTFtagExonParentTranscript Parent #end if @@ -119,13 +116,20 @@ #if str($refGenomeSource.genomeSAindexNbases): --genomeSAindexNbases ${refGenomeSource.genomeSAindexNbases} #end if + ## Diploid mode + #if 'diploidconditional' in $refGenomeSource: + #if str($refGenomeSource.diploidconditional.diploid) == 'Yes': + --genomeTransformVCF '${refGenomeSource.diploidconditional.genomeTransformVCF}' + --genomeTransformType Diploid + #end if + #end if --runThreadN \${GALAXY_SLOTS:-4} ## in bytes --limitGenomeGenerateRAM \$((\${GALAXY_MEMORY_MB:-31000} * 1000000)) && #end if ]]></token> - <token name="@REFGENOMEHANDLING@" ><![CDATA[ + <token name="@REFGENOMEHANDLING@"><![CDATA[ --runThreadN \${GALAXY_SLOTS:-4} --genomeLoad NoSharedMemory --genomeDir @@ -137,13 +141,14 @@ #if str($refGenomeSource.GTFconditional.GTFselect) == 'without-gtf-with-gtf': --sjdbOverhang $refGenomeSource.GTFconditional.sjdbOverhang --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}' + --sjdbGTFfeatureExon '${refGenomeSource.GTFconditional.sjdbGTFfeatureExon}' #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3': --sjdbGTFtagExonParentTranscript Parent #end if #end if #end if ]]></token> - <token name="@READSHANDLING@" ><![CDATA[ + <token name="@READSHANDLING@"><![CDATA[ ## Check that the input pairs are of the same type ## otherwise STARsolo will run for a long time and then error out. ## We consume either repeats of two inputs R1 + R2 @@ -172,59 +177,57 @@ @FASTQ_GZ_OPTION@ #end if ]]></token> - <token name="@LIMITS@" ><![CDATA[ + <token name="@LIMITS@"><![CDATA[ --limitOutSJoneRead $getVar('algo.params.junction_limits.limitOutSJoneRead', $getVar('solo.junction_limits.limitOutSJoneRead', 1000)) --limitOutSJcollapsed $getVar('algo.params.junction_limits.limitOutSJcollapsed', $getVar('solo.junction_limits.limitOutSJcollapsed', 1000000)) --limitSjdbInsertNsj $getVar('algo.params.junction_limits.limitSjdbInsertNsj', $getVar('solo.junction_limits.limitSjdbInsertNsj', 1000000)) ]]></token> <xml name="ref_selection"> - <param argument="--genomeFastaFiles" type="data" format="fasta,fasta.gz" label="Select a reference genome" /> - <param argument="--genomeSAindexNbases" type="integer" min="2" max="16" value="14" label="Length of the SA pre-indexing string" help="Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1)"/> + <param argument="--genomeFastaFiles" type="data" format="fasta,fasta.gz" label="Select a reference genome"/> + <param argument="--genomeSAindexNbases" type="integer" min="2" max="16" value="14" label="Length of the SA pre-indexing string" help="Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1)"/> </xml> - <xml name="stdio" > + <xml name="stdio"> <stdio> <regex match="FATAL error" source="both" level="fatal"/> <regex match="EXITING: FATAL INPUT ERROR:" source="both" level="fatal"/> <regex match="EXITING: fatal error trying to allocate genome arrays, exception thrown: std::bad_alloc" source="both" level="fatal"/> <regex match="\[sam_read1\] missing header\? Abort!" source="both" level="fatal"/> - <yield /> + <yield/> </stdio> </xml> <xml name="input_selection"> - <conditional name="input_types" > - <param name="use" type="select" label="Input Type" > - <option value="repeat" >Separate barcode and cDNA reads</option> - <option value="list_paired" >Paired collection of barcode and cDNA reads</option> + <conditional name="input_types"> + <param name="use" type="select" label="Input Type"> + <option value="repeat">Separate barcode and cDNA reads</option> + <option value="list_paired">Paired collection of barcode and cDNA reads</option> </param> <when value="repeat"> - <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input1" type="data" multiple="true" - label="RNA-Seq FASTQ/FASTA file, Barcode reads" /> - <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input2" type="data" multiple="true" - label="RNA-Seq FASTQ/FASTA file, cDNA reads"/> + <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input1" type="data" multiple="true" label="RNA-Seq FASTQ/FASTA file, Barcode reads"/> + <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input2" type="data" multiple="true" label="RNA-Seq FASTQ/FASTA file, cDNA reads"/> </when> <when value="list_paired"> - <param name="input_collection" collection_type="paired" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="Collection of Pairs" /> + <param name="input_collection" collection_type="paired" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="Collection of Pairs"/> </when> </conditional> </xml> <xml name="input_selection_smart_seq"> - <conditional name="input_types_smart_seq" > - <param name="use" type="select" label="Input Type" > - <option value="list_single_end" >Single-end FASTQ collection</option> - <option value="list_paired_end" >Paired FASTQ collection</option> + <conditional name="input_types_smart_seq"> + <param name="use" type="select" label="Input Type"> + <option value="list_single_end">Single-end FASTQ collection</option> + <option value="list_paired_end">Paired FASTQ collection</option> </param> <when value="list_single_end"> - <param name="single_end_collection" collection_type="list" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="List of single-end FASTQ files" /> + <param name="single_end_collection" collection_type="list" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="List of single-end FASTQ files"/> </when> <when value="list_paired_end"> - <param name="paired_end_collection" collection_type="list:paired" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="List of paired-end FASTQ files" /> + <param name="paired_end_collection" collection_type="list:paired" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="List of paired-end FASTQ files"/> </when> </conditional> </xml> <xml name="umidedup_options"> <option value="1MM_All" selected="true">Collapse all UMIs with 1 mismatch distance to each other (1MM_All)</option> - <option value="1MM_Directional_UMItools" >Directional method from the UMI-tool</option> - <option value="1MM_Directional" >Directional with stringent UMI deduplication</option> + <option value="1MM_Directional_UMItools">Directional method from the UMI-tool</option> + <option value="1MM_Directional">Directional with stringent UMI deduplication</option> </xml> <xml name="anchor_types"> <option value="0">Read start</option> @@ -233,16 +236,16 @@ <option value="3">Adapter end</option> </xml> <xml name="cb_match_wl_common"> - <option value="Exact" >Exact</option> - <option value="1MM" >Single match (1MM)</option> + <option value="Exact">Exact</option> + <option value="1MM">Single match (1MM)</option> </xml> <xml name="cb_match_wl_cellranger"> - <option value="1MM_multi" selected="true" >Multiple matches (CellRanger 2, 1MM_multi)</option> - <option value="1MM_multi_pseudocounts" >Multiple matches (CellRanger 3, 1MM_multi_pseudocounts)</option> - <option value="1MM_multi_Nbase_pseudocounts" >Multimatching to WL is allowed for CBs with N-bases (CellRanger 3, 1MM_multi_Nbase_pseudocounts)</option> + <option value="1MM_multi" selected="true">Multiple matches (CellRanger 2, 1MM_multi)</option> + <option value="1MM_multi_pseudocounts">Multiple matches (CellRanger 3, 1MM_multi_pseudocounts)</option> + <option value="1MM_multi_Nbase_pseudocounts">Multimatching to WL is allowed for CBs with N-bases (CellRanger 3, 1MM_multi_Nbase_pseudocounts)</option> </xml> <xml name="solo_adapter_params"> - <param argument="--soloAdapterSequence" type="text" value="-" label="Adapter sequence to anchor barcodes." > + <param argument="--soloAdapterSequence" type="text" value="-" label="Adapter sequence to anchor barcodes."> <sanitizer> <valid initial="string.digits"> <add value="-"/> @@ -254,11 +257,11 @@ </valid> </sanitizer> </param> - <param argument="--soloAdapterMismatchesNmax" type="integer" min="1" value="1" label="Maximum number of mismatches allowed in adapter sequence" /> - <param argument="--clipAdapterType" type="select" > - <option value="Hamming" selected="true" >Adapter clipping based on Hamming distance</option> - <option value="CellRanger4" >5p and 3p adapter clipping similar to CellRanger4</option> - <option value="None" >No adapter clipping</option> + <param argument="--soloAdapterMismatchesNmax" type="integer" min="1" value="1" label="Maximum number of mismatches allowed in adapter sequence"/> + <param argument="--clipAdapterType" type="select"> + <option value="Hamming" selected="true">Adapter clipping based on Hamming distance</option> + <option value="CellRanger4">5p and 3p adapter clipping similar to CellRanger4</option> + <option value="None">No adapter clipping</option> </param> </xml> <xml name="common_SAM_attributes"> @@ -273,14 +276,14 @@ </xml> <xml name="limits"> <section name="junction_limits" title="Junction Limits" expanded="false"> - <param argument="--limitOutSJoneRead" type="integer" min="1" value="1000" label="Maximum number of junctions for one read (including all multimappers)" /> - <param argument="--limitOutSJcollapsed" type="integer" min="1" value="1000000" label="Maximum number of collapsed junctions" /> - <param argument="--limitSjdbInsertNsj" type="integer" min="0" value="1000000" label="Maximum number of inserts to be inserted into the genome on the fly." /> + <param argument="--limitOutSJoneRead" type="integer" min="1" value="1000" label="Maximum number of junctions for one read (including all multimappers)"/> + <param argument="--limitOutSJcollapsed" type="integer" min="1" value="1000000" label="Maximum number of collapsed junctions"/> + <param argument="--limitSjdbInsertNsj" type="integer" min="0" value="1000000" label="Maximum number of inserts to be inserted into the genome on the fly."/> </section> </xml> <xml name="outCountActions"> <actions> - <action name="column_names" type="metadata" default="GeneID,Counts_unstrand,Counts_firstStrand,Counts_secondStrand" /> + <action name="column_names" type="metadata" default="GeneID,Counts_unstrand,Counts_firstStrand,Counts_secondStrand"/> <expand macro="dbKeyAction"/> </actions> </xml> @@ -293,7 +296,7 @@ </param> <when value="None"> <!-- This is necessary for the filtering of output --> - <param name="outWigStrand" type="hidden" value="false" /> + <param name="outWigStrand" type="hidden" value="false"/> </when> <when value="bedGraph"> <expand macro="outWigParams"/> @@ -341,73 +344,92 @@ <xml name="outWigOutputs"> <data format="bedgraph" name="signal_unique_str1" label="${tool.name} on ${on_string}: Coverage Uniquely mapped strand 1" from_work_dir="Signal.Unique.str1.out"> <filter>outWig['outWigType'] != "None"</filter> - <expand macro="dbKeyActions" /> + <expand macro="dbKeyActions"/> <change_format> - <when input="outWig.outWigType" value="wiggle" format="wig" /> + <when input="outWig.outWigType" value="wiggle" format="wig"/> </change_format> </data> <data format="bedgraph" name="signal_uniquemultiple_str1" label="${tool.name} on ${on_string}: Coverage Uniquely + Multiple mapped strand 1" from_work_dir="Signal.UniqueMultiple.str1.out"> <filter>outWig['outWigType'] != "None"</filter> - <expand macro="dbKeyActions" /> + <expand macro="dbKeyActions"/> <change_format> - <when input="outWig.outWigType" value="wiggle" format="wig" /> + <when input="outWig.outWigType" value="wiggle" format="wig"/> </change_format> </data> <data format="bedgraph" name="signal_unique_str2" label="${tool.name} on ${on_string}: Coverage Uniquely mapped strand 2" from_work_dir="Signal.Unique.str2.out"> <filter>outWig['outWigType'] != "None" and outWig['outWigStrand']</filter> - <expand macro="dbKeyActions" /> + <expand macro="dbKeyActions"/> <change_format> - <when input="outWig.outWigType" value="wiggle" format="wig" /> + <when input="outWig.outWigType" value="wiggle" format="wig"/> </change_format> </data> <data format="bedgraph" name="signal_uniquemultiple_str2" label="${tool.name} on ${on_string}: Coverage Uniquely + Multiple mapped strand 2" from_work_dir="Signal.UniqueMultiple.str2.out"> <filter>outWig['outWigType'] != "None" and outWig['outWigStrand']</filter> - <expand macro="dbKeyActions" /> + <expand macro="dbKeyActions"/> <change_format> - <when input="outWig.outWigType" value="wiggle" format="wig" /> + <when input="outWig.outWigType" value="wiggle" format="wig"/> </change_format> </data> </xml> <xml name="quantMode"> <conditional name="quantmode_output"> - <param argument="--quantMode" type="select" - label="Per gene/transcript output" - help="STAR can provide analysis results not only with respect to the reference genome, but also with respect to genes and transcripts described by a gene model. Note: This functionality requires either the selection above of a cached index with a gene model, or a gene model provided alongside the index/reference genome in GTF or GFF3 format!"> + <param argument="--quantMode" type="select" label="Per gene/transcript output" help="STAR can provide analysis results not only with respect to the reference genome, but also with respect to genes and transcripts described by a gene model. Note: This functionality requires either the selection above of a cached index with a gene model, or a gene model provided alongside the index/reference genome in GTF or GFF3 format!"> <option value="-">No per gene or transcript output</option> <option value="GeneCounts">Per gene read counts (GeneCounts)</option> <option value="TranscriptomeSAM">Transcript-based BAM output (TranscriptomeSAM)</option> <option value="TranscriptomeSAM GeneCounts">Both per gene read counts and transcript-based BAM output (TranscriptomeSAM GeneCounts)</option> </param> - <when value="-" /> - <when value="GeneCounts" /> + <when value="-"/> + <when value="GeneCounts"/> <when value="TranscriptomeSAM"> - <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend" - label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?" - help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled." /> + <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend" label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?" help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled."/> </when> <when value="TranscriptomeSAM GeneCounts"> - <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend" - label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?" - help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled." /> + <param argument="--quantTranscriptomeBan" type="boolean" truevalue="IndelSoftclipSingleend" falsevalue="Singleend" label="Exclude alignments with indels or soft clipping from the transcriptome BAM output?" help="You will need to exclude alignments with indels and soft-clipped bases from the transcriptome BAM output for compatibility with certain transcript quantification tools, most notably RSEM. If you are using a tool, like eXpress, that can deal with indels and soft-clipped bases, you can achieve better results by leaving this option disabled."/> </when> </conditional> </xml> <xml name="quantModeNoGTF"> <conditional name="quantmode_output"> - <param argument="--quantMode" type="select" - label="Per gene/transcript output"> + <param argument="--quantMode" type="select" label="Per gene/transcript output"> <option value="-">No per gene or transcript output as no GTF was provided</option> </param> - <when value="-" /> + <when value="-"/> </conditional> </xml> <xml name="outSAMmapqUnique"> <!-- MAPQ 255 is the default in STAR (coming from tophat behaviour and compatibility for Cufflinks) but it is a problematic value - according to SAM/BAM specs it means "undefined". - Using 255 as the max mapq causes problem with modern downstream tools like mutect2: https://sites.duke.edu/workblog/2021/08/18/star-rnaseq-gatk-mutect2/ and 60 has become an inofficial replacement for 255. --> - <param argument="--outSAMmapqUnique" type="integer" value="60" min="0" max="255" - label="MAPQ value for unique mappers" - help="STAR bases the mapping quality scores of alignment records in its BAM output on the number of alternative mappings for the read. If a read maps to multiple locations on the reference genome, the following MAPQ scoring scheme is -used: >=5 mappings => MAPQ=0; 3-4 mappings => MAPQ=1; 2 mappings => MAPQ=3. This setting lets you control the MAPQ used for reads mapped to a single location. Set to 255 for compatibility with Cufflink (default in STAR) but keep to 60 for modern downstream tools like mutect2." /> + <param argument="--outSAMmapqUnique" type="integer" value="60" min="0" max="255" label="MAPQ value for unique mappers" help="STAR bases the mapping quality scores of alignment records in its BAM output on the number of alternative mappings for the read. If a read maps to multiple locations on the reference genome, the following MAPQ scoring scheme is used: >=5 mappings => MAPQ=0; 3-4 mappings => MAPQ=1; 2 mappings => MAPQ=3. This setting lets you control the MAPQ used for reads mapped to a single location. Set to 255 for compatibility with Cufflink (default in STAR) but keep to 60 for modern downstream tools like mutect2."/> + </xml> + <xml name="wasp"> + <!-- + This is re-implementation of the original WASP algorithm by Bryce van de Geijn, Graham McVicker, + Yoav Gilad and Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, + 1061–1063 (2015) https://www.nature.com/articles/nmeth.3582. WASP filtering is activated + with "waspOutputMode SAMtag". + --> + <conditional name="wasp_conditional"> + <param argument="--waspOutputMode" type="select" label="Actiavte WASP filtering"> + <help><![CDATA[This is a reimplementation of the original WASP algorithm by Bryce van de Geijn, Graham McVicker, + Yoav Gilad and Jonathan K Pritchard. https://doi.org/10.1038/nmeth.3582. This option will add the vW tag to the SAM output. vW:i:1 means + alignment passed WASP filtering, and all other values mean it did not:<br/> + - vW:i:2 = multi-mapping read<br/> + - vW:i:3 = variant base in the read is N (non-ACGT)<br/> + - vW:i:4 = remapped read did not map <br/> + - vW:i:5 = remapped read multi-maps <br/> + - vW:i:6 = remapped read maps to a different locus <br/> + - vW:i:7 = read overlaps too many variants <br/> + ]]> + </help> + <option value="" selected="true">No WASP filtering</option> + <option value="wasp_mode">Activate WASP filtering</option> + </param> + <when value="wasp_mode"> + <param argument="--varVCFfile" type="data" format="vcf" label="VCF file with personal variants" help="Each variant is expected to have a genotype with two alleles. The VCF file needs to have the 10th column with genotype recorded as 0/1, 1/0, 1/1 (or | instead of /)"/> + </when> + <when value=""/> + </conditional> </xml> </macros>
--- a/data_manager/rna_star_index_builder.py Sun Apr 16 08:28:41 2023 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -#!/usr/bin/env python - -import argparse -import json - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--config-file') - parser.add_argument('--value') - parser.add_argument('--dbkey') - parser.add_argument('--name') - parser.add_argument('--subdir') - parser.add_argument('--data-table') - parser.add_argument('--with-gene-model', action='store_true') - parser.add_argument('--index-version') - - args = parser.parse_args() - - if args.dbkey in [None, '', '?']: - raise Exception( - '"%s" is not a valid dbkey. You must specify a valid dbkey.' - % (args.dbkey) - ) - - with_gene_model = "0" - if args.with_gene_model: - with_gene_model = "1" - - data_manager_dict = { - 'data_tables': { - args.data_table: [ - { - "value": args.value, - "dbkey": args.dbkey, - "name": args.name, - "path": args.subdir, - "with_gene_model": with_gene_model, - "version": args.index_version - } - ] - } - } - with open(args.config_file, 'w') as fh: - json.dump(data_manager_dict, fh, sort_keys=True) - - -if __name__ == "__main__": - main()
--- a/data_manager/rna_star_index_builder.xml Sun Apr 16 08:28:41 2023 +0000 +++ b/data_manager/rna_star_index_builder.xml Thu Dec 05 06:47:38 2024 +0000 @@ -1,15 +1,15 @@ -<tool id="rna_star_index_builder_data_manager" name="rnastar index versioned" tool_type="manage_data" version="@IDX_VERSION@+galaxy@IDX_VERSION_SUFFIX@" profile="19.05"> +<tool id="rna_star_index_builder_data_manager" name="RNAStar index versioned" tool_type="manage_data" version="@IDX_VERSION@+galaxy@IDX_VERSION_SUFFIX@" profile="23.0"> <description>builder</description> <macros> <import>macros.xml</import> </macros> - <expand macro="requirements"> - <requirement type="package" version="3.7">python</requirement> - </expand> + <expand macro="requirements"/> <command><![CDATA[ +. '$dmfxns' && + if [ -z "\$GALAXY_MEMORY_MB" ] ; then GALAXY_MEMORY_BYTES=31000000000 ; else @@ -22,6 +22,27 @@ mkdir '${target_directory}' && +#if $auto_sa_index_nbases or $auto_chr_bin_nbits: +nbases="\$(grep -v '^>' '${all_fasta_source.fields.path}' | tr -d '\n' | wc -c)" && +echo "Bases in reference: \$nbases" && +#end if + +#if $auto_sa_index_nbases: +saindex_nbases=\$((\$(log2 \$nbases) / 2 - 1)) && +[[ \$saindex_nbases -lt 14 ]] || saindex_nbases=14 && +#else if $advanced_options.advanced_options_selector == "advanced": +saindex_nbases=${advanced_options.genomeSAindexNbases} && +#end if + +#if $auto_chr_bin_nbits: +nseqs="\$(grep -c '>' '${all_fasta_source.fields.path}')" && +echo "Sequences in reference: \$nseqs" && +chr_bin_nbits=\$((\$(log2 \$nbases) / \$(log2 \$nseqs))) && +[[ \$chr_bin_nbits -lt 18 ]] || chr_bin_nbits=18 && +#else if $advanced_options.advanced_options_selector == "advanced": +chr_bin_nbits=${advanced_options.genomeChrBinNbits} && +#end if + STAR --runMode genomeGenerate --genomeFastaFiles '${all_fasta_source.fields.path}' @@ -31,29 +52,52 @@ --sjdbGTFfile '${GTFconditional.sjdbGTFfile}' --sjdbOverhang ${GTFconditional.sjdbOverhang} #end if +#if $advanced_options.advanced_options_selector == "advanced" or $auto_sa_index_nbases: + --genomeSAindexNbases "\$saindex_nbases" +#end if +#if $advanced_options.advanced_options_selector == "advanced" or $auto_chr_bin_nbits: + --genomeChrBinNbits "\$chr_bin_nbits" +#end if #if $advanced_options.advanced_options_selector == "advanced": - --genomeSAindexNbases ${advanced_options.genomeSAindexNbases} - --genomeChrBinNbits ${advanced_options.genomeChrBinNbits} --genomeSAsparseD ${advanced_options.genomeSAsparseD} #end if --runThreadN \${GALAXY_SLOTS:-2} && -python '${__tool_directory__}/rna_star_index_builder.py' ---config-file '${out_file}' ---value '${all_fasta_source.fields.value}' ---dbkey '${all_fasta_source.fields.dbkey}' ---index-version '@IDX_VERSION@' -#if $name: - --name '$name' -#else - --name '${all_fasta_source.fields.name}' -#end if -#if str($GTFconditional.GTFselect) == "withGTF": - --with-gene-model -#end if ---data-table @IDX_DATA_TABLE@ ---subdir '${subdir}' +cp '$dmjson' '$out_file' ]]></command> + <configfiles> + <configfile name="dmfxns"><![CDATA[ +function log2() { + local n=\$1 + local log2=0 + while [[ \$n -gt 1 ]]; do + n=\$((n >> 1)) + log2=\$((log2 + 1)) + done + [[ \$log2 -gt 0 ]] && echo \$log2 || echo 1 +} +]]></configfile> + <configfile name="dmjson"><![CDATA[#slurp +#set $fasta_file_name = str($all_fasta_source.fields.path).split('/')[-1] +#set $name = $name or $all_fasta_source.fields.name +#set $target_directory = str($out_file.extra_files_path) +#set $with_gene_model = 1 if str($GTFconditional.GTFselect) == "withGTF" else 0 +{ + "data_tables":{ + "@IDX_DATA_TABLE@":[ + { + "value": "${all_fasta_source.fields.value}", + "dbkey": "${all_fasta_source.fields.dbkey}", + "name": "${name}", + "path": "SA", + "with_gene_model": "${with_gene_model}", + "version": "@IDX_VERSION@" + } + ] + } +} +]]></configfile> + </configfiles> <inputs> <param name="all_fasta_source" type="select" label="Source FASTA Sequence"> <options from_data_table="all_fasta"/> @@ -71,6 +115,12 @@ </when> <when value="withoutGTF" /> </conditional> + <param name="auto_sa_index_nbases" type="boolean" checked="true" + label="Automatically calculate --genomeSAindexNbases" + help="The value specified for --genomeSAindexNbases in advanced options will be ignored if this option is selected"/> + <param name="auto_chr_bin_nbits" type="boolean" checked="true" + label="Automatically calculate --genomeChrBinNbits" + help="The value specified for --genomeChrBinNbits in advanced options will be ignored if this option is selected"/> <conditional name="advanced_options"> <param name="advanced_options_selector" type="select" label="Advanced options"> <option value="default" selected="true">Use default options</option> @@ -90,7 +140,8 @@ of contigs, it is recommended to scale this parameter as min(18, log2[max(GenomeLength/NumberOfReferences,ReadLength)]). For example, for 3 gigaBase genome with 100,000 chromosomes/scaffolds, this is equal to 15."/> - <param argument="--genomeSAsparseD" type="integer" min="1" value="1" label="Suffix array sparsity" + <param argument="--genomeSAsparseD" type="integer" min="1" value="1" + label="Suffix array sparsity" help="The distance between indices: use bigger numbers to decrease needed RAM at the cost of mapping speed reduction"/> </when> @@ -104,11 +155,16 @@ <tests> <test> <param name="all_fasta_source" value="phiX174"/> - <param name="sequence_name" value="phiX"/> - <param name="sequence_id" value="minimal-settings"/> - <param name="modelformat" value="None"/> - - <output name="out_file" file="test_star_01.data_manager_json" compare="re_match"/> + <output name="out_file" file="test_star_01.data_manager_json"/> + </test> + <test> + <param name="all_fasta_source" value="phiX174"/> + <param name="name" value="phiX"/> + <output name="out_file"> + <assert_contents> + <has_text text='"name": "phiX"'/> + </assert_contents> + </output> </test> </tests>
--- a/data_manager_conf.xml Sun Apr 16 08:28:41 2023 +0000 +++ b/data_manager_conf.xml Thu Dec 05 06:47:38 2024 +0000 @@ -12,9 +12,9 @@ out_file.extra_files_path is used as base by default if no source, eg for type=directory, then refers to base --> - <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">rnastar/${version}/${dbkey}/${value}/${path}</target> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">genomes/${dbkey}/rnastar_index/v${version}/${value}</target> </move> - <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/rnastar/${version}/${dbkey}/${value}/${path}</value_translation> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/genomes/${dbkey}/rnastar_index/v${version}/${value}</value_translation> <value_translation type="function">abspath</value_translation> </column> <column name="with_gene_model" />
--- a/test-data/test_star_01.data_manager_json Sun Apr 16 08:28:41 2023 +0000 +++ b/test-data/test_star_01.data_manager_json Thu Dec 05 06:47:38 2024 +0000 @@ -1,1 +1,14 @@ -{"data_tables": {"rnastar_index2x_versioned": \[{"dbkey": "phiX174", "name": "phiX174", "path": ".*", "value": "phiX174", "version": "2.7.4a", "with_gene_model": "0"}\]}} +{ + "data_tables":{ + "rnastar_index2x_versioned":[ + { + "value": "phiX174", + "dbkey": "phiX174", + "name": "phiX174", + "path": "SA", + "with_gene_model": "0", + "version": "2.7.4a" + } + ] + } +}
--- a/tool_data_table_conf.xml.test Sun Apr 16 08:28:41 2023 +0000 +++ b/tool_data_table_conf.xml.test Thu Dec 05 06:47:38 2024 +0000 @@ -5,8 +5,8 @@ <file path="${__HERE__}/test-data/all_fasta.loc" /> </table> <!-- Locations of STAR indexes --> - <table name="rnastar_index2_versioned" comment_char="#" allow_duplicate_entries="False"> + <table name="rnastar_index2x_versioned" comment_char="#" allow_duplicate_entries="False"> <columns>value, dbkey, name, path, with_gene_model, version</columns> - <file path="${__HERE__}/test-data/rnastar_index2_versioned.loc" /> + <file path="${__HERE__}/test-data/rnastar_index2x_versioned.loc" /> </table> </tables>
