Mercurial > repos > ieguinoa > data_manager_star_index_custom
diff data_manager/macros.xml @ 0:e23440b3332a draft
Uploaded
author | ieguinoa |
---|---|
date | Tue, 25 May 2021 15:14:56 +0000 |
parents | |
children | 0cd30e1fd3eb |
line wrap: on
line diff
<macros>
    <!--
        Version tokens.

        REMEMBER: whenever either of the two version tokens below changes, the
        version of rna_star_index_builder_data_manager has to be bumped as well.
        The data manager symlinks this macro file so that the STAR version and
        the index version stay in sync, but the +galaxy version number still
        has to be adjusted by hand.
    -->

    <!-- STAR version to be used -->
    <token name="@VERSION@">2.7.8a</token>

    <!--
        STAR index version compatible with the STAR version above, i.e. the
        STAR release that introduced the index structure expected by the
        current version. For any given STAR release it can be looked up with:
            STAR -h | grep versionGenome
        or by searching for the versionGenome parameter in
        source/parametersDefault of STAR's source code.
    -->
    <token name="@IDX_VERSION@">2.7.4a</token>

    <!-- Name of the data table that the index lookups below read from -->
    <token name="@IDX_DATA_TABLE@">rnastar_index2x_versioned</token>

    <!-- Package requirements; the yield placeholder marks where an expanding tool can add its own -->
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="@VERSION@">star</requirement>
            <requirement type="package" version="1.9">samtools</requirement>
            <yield />
        </requirements>
    </xml>

    <!-- EDAM topic and operation annotations -->
    <xml name="edam">
        <edam_topics>
            <edam_topic>topic_3170</edam_topic>
            <edam_topic>topic_3308</edam_topic>
        </edam_topics>
        <edam_operations>
            <edam_operation>operation_0292</edam_operation>
        </edam_operations>
    </xml>

    <!--
        Select box listing the indexes in @IDX_DATA_TABLE@. Rows are filtered on
        column 4 (against the with_gene_model token, default "0") and column 5
        (against @IDX_VERSION@), then sorted by column 2.
    -->
    <xml name="index_selection" token_with_gene_model="0">
        <param
            argument="--genomeDir"
            name="genomeDir"
            type="select"
            label="Select reference genome"
            help="If your genome of interest is not listed, contact the Galaxy team">
            <options from_data_table="@IDX_DATA_TABLE@">
                <filter type="static_value" column="4" value="@WITH_GENE_MODEL@" />
                <filter type="static_value" column="5" value="@IDX_VERSION@" />
                <filter type="sort_by" column="2" />
                <validator type="no_options" message="No indexes are available for the selected input dataset" />
            </options>
        </param>
    </xml>

    <!-- Command line fragment that makes STAR read its input through zcat -->
    <token name="@FASTQ_GZ_OPTION@">
        --readFilesCommand zcat
    </token>

    <!-- Citation for the tool (DOI) -->
    <xml name="citations">
        <citations>
            <citation type="doi">10.1093/bioinformatics/bts635</citation>
        </citations>
    </xml>

    <!--
        Splice junction database inputs: a gene model file and the overhang
        length. The optional token (default "true") controls whether the gene
        model file may be left unset.
    -->
    <xml name="@SJDBOPTIONS@" token_optional="true">
        <param
            argument="--sjdbGTFfile"
            type="data"
            format="gff3,gtf"
            label="Gene model (gff3,gtf) file for splice junctions"
            optional="@OPTIONAL@"
            help="Exon junction information for mapping splices" />
        <param
            argument="--sjdbOverhang"
            type="integer"
            min="1"
            value="100"
            label="Length of the genomic sequence around annotated junctions"
            help="Used in constructing the splice junctions database. Ideal value is ReadLength-1" />
    </xml>

    <!--
        Output actions that set the dbkey metadata: taken from the data table
        row matching the selected index when a built-in index is used, or from
        the dbkey of the supplied FASTA dataset when the genome comes from the
        history.
    -->
    <xml name="dbKeyActions">
        <actions>
            <conditional name="refGenomeSource.geneSource">
                <when value="indexed">
                    <action type="metadata" name="dbkey">
                        <option type="from_data_table" name="@IDX_DATA_TABLE@" column="1" offset="0">
                            <filter type="param_value" column="0" value="#" compare="startswith" keep="False" />
                            <filter type="param_value" ref="refGenomeSource.GTFconditional.genomeDir" column="0" />
                        </option>
                    </action>
                </when>
                <when value="history">
                    <action type="metadata" name="dbkey">
                        <option type="from_param" name="refGenomeSource.genomeFastaFiles" param_attribute="dbkey" />
                    </action>
                </when>
            </conditional>
        </actions>
    </xml>

    <!--
        Command fragment that, for a genome taken from the history, builds a
        throw-away STAR index in ./tempstargenomedir before the actual run.
        The fragment ends with a shell AND operator so that further commands
        can follow it directly.
    -->
    <token name="@TEMPINDEX@"><![CDATA[
    ## Create temporary index for custom reference
    #if str($refGenomeSource.geneSource) == 'history':
        mkdir -p tempstargenomedir &&
        STAR
        --runMode genomeGenerate
        --genomeDir 'tempstargenomedir'
        --genomeFastaFiles '${refGenomeSource.genomeFastaFiles}'
        ## Handle difference between indices with/without annotations
        #if 'GTFconditional' in $refGenomeSource:
            ## GTFconditional exists only in STAR, but not STARsolo
            #if str($refGenomeSource.GTFconditional.GTFselect) == 'with-gtf':
                --sjdbOverhang '${refGenomeSource.GTFconditional.sjdbOverhang}'
                --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}'
                #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3':
                    --sjdbGTFtagExonParentTranscript Parent
                #end if
            #end if
        #else:
            ## ref genome selection is less complex for STARsolo because
            ## with-gtf is mandatory there
            --sjdbOverhang '${refGenomeSource.sjdbOverhang}'
            --sjdbGTFfile '${refGenomeSource.sjdbGTFfile}'
            #if str($refGenomeSource.sjdbGTFfile.ext) == 'gff3':
                --sjdbGTFtagExonParentTranscript Parent
            #end if
        #end if
        #if str($refGenomeSource.genomeSAindexNbases):
            --genomeSAindexNbases ${refGenomeSource.genomeSAindexNbases}
        #end if
        --runThreadN \${GALAXY_SLOTS:-4}
        &&
    #end if
    ]]></token>

    <!--
        Command fragment with the thread count, genome loading mode and genome
        directory arguments. The directory is either the temporary index or the
        path of the selected built-in index; for a built-in index without gene
        model, a gene model file supplied by the user is passed along as well.
    -->
    <token name="@REFGENOMEHANDLING@"><![CDATA[
    --runThreadN \${GALAXY_SLOTS:-4}
    --genomeLoad NoSharedMemory
    --genomeDir
    #if str($refGenomeSource.geneSource) == 'history':
        tempstargenomedir
    #else:
        '${refGenomeSource.GTFconditional.genomeDir.fields.path}'
        ## Handle difference between indices with/without annotations
        #if str($refGenomeSource.GTFconditional.GTFselect) == 'without-gtf':
            #if $refGenomeSource.GTFconditional.sjdbGTFfile:
                --sjdbOverhang $refGenomeSource.GTFconditional.sjdbOverhang
                --sjdbGTFfile '${refGenomeSource.GTFconditional.sjdbGTFfile}'
                #if str($refGenomeSource.GTFconditional.sjdbGTFfile.ext) == 'gff3':
                    --sjdbGTFtagExonParentTranscript Parent
                #end if
            #end if
        #end if
    #end if
    ]]></token>

    <!--
        Command fragment that assembles the read file arguments from either the
        two multi-select inputs or a paired collection, and appends the gzip
        reading option when the barcode read is of a gzipped FASTQ type.
    -->
    <token name="@READSHANDLING@"><![CDATA[
    ## Check that the input pairs are of the same type
    ## otherwise STARsolo will run for a long time and then error out.
    ## We consume either repeats of two inputs R1 + R2
    ## or a collection of paired reads.
    #if str($sc.input_types.use) == "repeat":
        #set $reads1 = []
        #set $reads2 = []
        #for $r1, $r2 in zip($sc.input_types.input1, $sc.input_types.input2):
            #assert $r1.datatype == $r2.datatype
            #silent $reads1.append(str($r1))
            #silent $reads2.append(str($r2))
        #end for
        #set $reads1 = ','.join($reads1)
        #set $reads2 = ','.join($reads2)
    #elif str($sc.input_types.use) == "list_paired":
        #set $r1 = $sc.input_types.input_collection.forward
        #set $r2 = $sc.input_types.input_collection.reverse
        #set $reads1 = $r1
        #set $reads2 = $r2
    #end if
    ## cDNA sequence(s) [R2] always go first, then barcode(s) [R1]
    ## see: Section 3.2 of STAR manual for multiple inputs, and Section 13 for STARsolo inputs
    --readFilesIn $reads2 $reads1
    --soloCBmatchWLtype $sc.soloCBmatchWLtype
    #if $r1.is_of_type('fastq.gz', 'fastqsanger.gz'):
        @FASTQ_GZ_OPTION@
    #end if
    ]]></token>

    <!-- Inputs for a reference genome taken from the history: FASTA file and SA pre-indexing length -->
    <xml name="ref_selection">
        <param
            argument="--genomeFastaFiles"
            type="data"
            format="fasta"
            label="Select a reference genome" />
        <param
            argument="--genomeSAindexNbases"
            type="integer"
            min="2"
            max="16"
            value="14"
            label="Length of the SA pre-indexing string"
            help="Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1)" />
    </xml>

    <!-- Patterns on stdout/stderr that mark the job as failed; the yield placeholder allows additions -->
    <xml name="stdio">
        <stdio>
            <regex match="FATAL error" source="both" level="fatal" />
            <regex match="EXITING: FATAL INPUT ERROR:" source="both" level="fatal" />
            <regex match="EXITING: fatal error trying to allocate genome arrays, exception thrown: std::bad_alloc" source="both" level="fatal" />
            <regex match="\[sam_read1\] missing header\? Abort!" source="both" level="fatal" />
            <yield />
        </stdio>
    </xml>

    <!-- Read input chooser: separate barcode/cDNA datasets, or one paired collection -->
    <xml name="input_selection">
        <conditional name="input_types">
            <param name="use" type="select" label="Input Type">
                <option value="repeat">Separate barcode and cDNA reads</option>
                <option value="list_paired">Paired collection of barcode and cDNA reads</option>
            </param>
            <when value="repeat">
                <param
                    format="fastq,fasta,fastq.gz,fastqsanger.gz"
                    name="input1"
                    type="data"
                    multiple="true"
                    label="RNA-Seq FASTQ/FASTA file, Barcode reads" />
                <param
                    format="fastq,fasta,fastq.gz,fastqsanger.gz"
                    name="input2"
                    type="data"
                    multiple="true"
                    label="RNA-Seq FASTQ/FASTA file, cDNA reads" />
            </when>
            <when value="list_paired">
                <param
                    name="input_collection"
                    collection_type="paired"
                    type="data_collection"
                    format="fastq,fasta,fastq.gz,fastqsanger.gz"
                    label="Collection of Pairs" />
            </when>
        </conditional>
    </xml>

    <!-- Read input chooser for Smart-seq: a list of single-end files, or a list of pairs -->
    <xml name="input_selection_smart_seq">
        <conditional name="input_types_smart_seq">
            <param name="use" type="select" label="Input Type">
                <option value="list_single_end">Single-end FASTQ collection</option>
                <option value="list_paired_end">Paired FASTQ collection</option>
            </param>
            <when value="list_single_end">
                <param
                    name="single_end_collection"
                    collection_type="list"
                    type="data_collection"
                    format="fastq,fasta,fastq.gz,fastqsanger.gz"
                    label="List of single-end FASTQ files" />
            </when>
            <when value="list_paired_end">
                <param
                    name="paired_end_collection"
                    collection_type="list:paired"
                    type="data_collection"
                    format="fastq,fasta,fastq.gz,fastqsanger.gz"
                    label="List of paired-end FASTQ files" />
            </when>
        </conditional>
    </xml>

    <!-- Option list: UMI deduplication methods -->
    <xml name="umidedup_options">
        <option value="1MM_All" selected="true">Collapse all UMIs with 1 mismatch distance to each other</option>
        <option value="1MM_Directional_UMItools">Directional method from the UMI-tool</option>
        <option value="1MM_Directional">Directional with stringent UMI deduplication</option>
    </xml>

    <!-- Option list: anchor positions -->
    <xml name="anchor_types">
        <option value="0">Read start</option>
        <option value="1">Read end</option>
        <option value="2">Adapter start</option>
        <option value="3">Adapter end</option>
    </xml>

    <!-- Option list: basic cell barcode to whitelist matching modes -->
    <xml name="cb_match_wl_common">
        <option value="Exact">Exact</option>
        <option value="1MM">Single match</option>
    </xml>

    <!-- Option list: CellRanger-style cell barcode to whitelist matching modes -->
    <xml name="cb_match_wl_cellranger">
        <option value="1MM_multi" selected="true">Multiple matches (CellRanger 2)</option>
        <option value="1MM_multi_pseudocounts">Multiple matches (CellRanger 3)</option>
        <option value="1MM_multi_Nbase_pseudocounts">Multimatching to WL is allowed for CBs with N-bases (CellRanger 3)</option>
    </xml>

    <!--
        Adapter related parameters. The sanitizer on the adapter sequence only
        lets digits, "-" and the letters A, T, C, G, N through.
    -->
    <xml name="solo_adapter_params">
        <param
            argument="--soloAdapterSequence"
            type="text"
            value="-"
            label="Adapter sequence to anchor barcodes.">
            <sanitizer>
                <valid initial="string.digits">
                    <add value="-" />
                    <add value="A" />
                    <add value="T" />
                    <add value="C" />
                    <add value="G" />
                    <add value="N" />
                </valid>
            </sanitizer>
        </param>
        <param
            argument="--soloAdapterMismatchesNmax"
            type="integer"
            min="1"
            value="1"
            label="Maximum number of mismatches allowed in adapter sequence" />
        <param argument="--clipAdapterType" type="select">
            <option value="Hamming" selected="true">Adapter clipping based on Hamming distance</option>
            <option value="CellRanger4">5p and 3p adapter clipping similar to CellRanger4</option>
            <option value="None">No adapter clipping</option>
        </param>
    </xml>
</macros>