Mercurial > repos > iuc > data_manager_star_index_builder

--- a/data_manager/macros.xml	Wed Aug 05 13:36:20 2020 +0000
+++ b/data_manager/macros.xml	Fri Sep 10 16:42:21 2021 +0000
@@ -5,7 +5,7 @@
     the index versions in sync, but you should manually adjust the +galaxy
     version number. -->
     <!-- STAR version to be used -->
-    <token name="@VERSION@">2.7.5b</token>
+    <token name="@VERSION@">2.7.8a</token>
     <!-- STAR index version compatible with this version of STAR
     This is the STAR version that introduced the index structure expected
     by the current version.
@@ -24,6 +24,16 @@
         </requirements>
     </xml>

+    <xml name="edam">
+        <edam_topics>
+            <edam_topic>topic_3170</edam_topic>
+            <edam_topic>topic_3308</edam_topic>
+        </edam_topics>
+        <edam_operations>
+            <edam_operation>operation_0292</edam_operation>
+        </edam_operations>
+    </xml>
+
     <xml name="index_selection" token_with_gene_model="0">
         <param argument="--genomeDir" name="genomeDir" type="select"
         label="Select reference genome"
@@ -122,12 +132,38 @@
         #end if
         #end if
         ]]></token>
+    <token name="@READSHANDLING@" ><![CDATA[
+    ## Check that the input pairs are of the same type
+    ## otherwise STARsolo will run for a long time and then error out.
+    ## We consume either repeats of two inputs R1 + R2
+    ## or a collection of paired reads.
+    #if str($sc.input_types.use) == "repeat":
+        #set $reads1 = []
+        #set $reads2 = []
+        #for $r1, $r2 in zip($sc.input_types.input1, $sc.input_types.input2):
+            #assert $r1.datatype == $r2.datatype
+            #silent $reads1.append(str($r1))
+            #silent $reads2.append(str($r2))
+        #end for
+        #set $reads1 = ','.join($reads1)
+        #set $reads2 = ','.join($reads2)
+    #elif str($sc.input_types.use) == "list_paired":
+        #set $r1 = $sc.input_types.input_collection.forward
+        #set $r2 = $sc.input_types.input_collection.reverse
+        #set $reads1 = $r1
+        #set $reads2 = $r2
+    #end if
+    ## cDNA sequence(s) [R2] always go first, then barcode(s) [R1]
+    ## see: Section 3.2 of STAR manual for multiple inputs, and Section 13 for STARsolo inputs
+    --readFilesIn $reads2 $reads1
+    --soloCBmatchWLtype $sc.soloCBmatchWLtype
+    #if $r1.is_of_type('fastq.gz', 'fastqsanger.gz'):
+        @FASTQ_GZ_OPTION@
+    #end if
+    ]]></token>
     <xml name="ref_selection">
         <param argument="--genomeFastaFiles" type="data" format="fasta" label="Select a reference genome" />
-        <!-- Currently, this parameter is not exposed in the wrapper,
-             but used only in the tests to avoid excessive index sizes for
-             the tiny test genomes. -->
-        <param name="genomeSAindexNbases" type="hidden" value="" />
+          <param argument="--genomeSAindexNbases" type="integer" min="2" max="16" value="14" label="Length of the SA pre-indexing string" help="Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1)"/>
     </xml>
     <xml name="stdio" >
         <stdio>
@@ -138,4 +174,75 @@
             <yield />
         </stdio>
     </xml>
+    <xml name="input_selection">
+        <conditional name="input_types" >
+            <param name="use" type="select" label="Input Type" >
+                <option value="repeat" >Separate barcode and cDNA reads</option>
+                <option value="list_paired" >Paired collection of barcode and cDNA reads</option>
+            </param>
+            <when value="repeat">
+                <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input1" type="data"  multiple="true"
+                label="RNA-Seq FASTQ/FASTA file, Barcode reads" />
+                <param format="fastq,fasta,fastq.gz,fastqsanger.gz" name="input2" type="data"  multiple="true"
+                label="RNA-Seq FASTQ/FASTA file, cDNA reads"/>
+            </when>
+            <when value="list_paired">
+                <param name="input_collection" collection_type="paired" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="Collection of Pairs" />
+            </when>
+        </conditional>
+    </xml>
+    <xml name="input_selection_smart_seq">
+        <conditional name="input_types_smart_seq" >
+            <param name="use" type="select" label="Input Type" >
+                <option value="list_single_end" >Single-end FASTQ collection</option>
+                <option value="list_paired_end" >Paired FASTQ collection</option>
+            </param>
+            <when value="list_single_end">
+                <param name="single_end_collection" collection_type="list" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="List of single-end FASTQ files" />
+            </when>
+            <when value="list_paired_end">
+                <param name="paired_end_collection" collection_type="list:paired" type="data_collection" format="fastq,fasta,fastq.gz,fastqsanger.gz" label="List of paired-end FASTQ files" />
+            </when>
+        </conditional>
+    </xml>
+    <xml name="umidedup_options">
+        <option value="1MM_All" selected="true">Collapse all UMIs with 1 mismatch distance to each other</option>
+        <option value="1MM_Directional_UMItools" >Directional method from the UMI-tool</option>
+        <option value="1MM_Directional" >Directional with stringent UMI deduplication</option>
+    </xml>
+    <xml name="anchor_types">
+        <option value="0">Read start</option>
+        <option value="1">Read end</option>
+        <option value="2">Adapter start</option>
+        <option value="3">Adapter end</option>
+    </xml>
+    <xml name="cb_match_wl_common">
+        <option value="Exact" >Exact</option>
+        <option value="1MM" >Single match</option>
+    </xml>
+    <xml name="cb_match_wl_cellranger">
+        <option value="1MM_multi" selected="true" >Multiple matches (CellRanger 2)</option>
+        <option value="1MM_multi_pseudocounts" >Multiple matches (CellRanger 3)</option>
+        <option value="1MM_multi_Nbase_pseudocounts" >Multimatching to WL is allowed for CBs with N-bases (CellRanger 3)</option>
+    </xml>
+    <xml name="solo_adapter_params">
+        <param argument="--soloAdapterSequence" type="text" value="-" label="Adapter sequence to anchor barcodes." >
+            <sanitizer>
+                <valid initial="string.digits">
+                    <add value="-"/>
+                    <add value="A"/>
+                    <add value="T"/>
+                    <add value="C"/>
+                    <add value="G"/>
+                    <add value="N"/>
+                </valid>
+            </sanitizer>
+        </param>
+        <param argument="--soloAdapterMismatchesNmax" type="integer" min="1" value="1" label="Maximum number of mismatches allowed in adapter sequence" />
+        <param argument="--clipAdapterType" type="select" >
+            <option value="Hamming" selected="true" >Adapter clipping based on Hamming distance</option>
+            <option value="CellRanger4" >5p and 3p adapter clipping similar to CellRanger4</option>
+            <option value="None" >No adapter clipping</option>
+        </param>
+    </xml>
 </macros>
--- a/data_manager/rna_star_index_builder.py	Wed Aug 05 13:36:20 2020 +0000
+++ b/data_manager/rna_star_index_builder.py	Fri Sep 10 16:42:21 2021 +0000
@@ -41,7 +41,8 @@
             ]
         }
     }
-    open(args.config_file, 'w').write(json.dumps(data_manager_dict, sort_keys=True))
+    with open(args.config_file, 'w') as fh:
+        json.dump(data_manager_dict, fh, sort_keys=True)


 if __name__ == "__main__":
--- a/tool-data/rnastar_index2x_versioned.loc.sample	Wed Aug 05 13:36:20 2020 +0000
+++ b/tool-data/rnastar_index2x_versioned.loc.sample	Fri Sep 10 16:42:21 2021 +0000
@@ -2,7 +2,7 @@
 #to use a directory of rna-star indexed sequences data files.
 #You will need to create these data files and then create a
 #rnastar_index2x_versioned.loc file similar to this one (store it in this
-directory) that points to the directories in which those files are stored.
+#directory) that points to the directories in which those files are stored.
 #The rnastar_index2x_versioned.loc file has this format (longer white space
 #characters are TAB characters):
 #