diff datasets_genome.xml @ 16:9a10a6449901 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_datasets commit 1c7abf3293422fb432b5acd2ef178e7536d65f0b
author iuc
date Fri, 21 Feb 2025 18:43:58 +0000
parents dfad868c911b
children 35d32c807c23
line wrap: on
line diff
--- a/datasets_genome.xml	Thu Sep 21 23:02:45 2023 +0000
+++ b/datasets_genome.xml	Fri Feb 21 18:43:58 2025 +0000
@@ -33,6 +33,10 @@
     --chromosomes '$filters.chromosomes'
 #end if
 $filters.exclude_atypical
+#if $filters.mag:
+    --mag '$filters.mag'
+#end if
+
 @INCLUDE@
 @RELEASED_BEFORE@
 @RELEASED_AFTER@
@@ -51,7 +55,7 @@
 ## unzip and rehydrate if any data is to be downloaded (include is not None)
 #if $file_choices.include
     ## unzip
-    && 7z x -y ncbi_dataset.zip > 7z.log
+    && unzip ncbi_dataset.zip
 
     ## rehydrate
     && datasets rehydrate
@@ -106,6 +110,10 @@
             <expand macro="assembly_source"/>
             <expand macro="chromosomes"/>
             <param argument="--exclude-atypical" type="boolean" truevalue="--exclude-atypical" falsevalue="" label="Exclude atypical assemblies"/>
+            <param argument="--mag" type="select" multiple="false" optional="true" label="Filter metagenome assembled genomes (MAGs)" help="Leave unselected to apply no MAG filter; otherwise the chosen value is passed to --mag">
+                <option value="only" selected="false">Limit to MAGs</option>
+                <option value="exclude" selected="false">Exclude MAGs</option>
+            </param>
             <expand macro="released_options"/>
             <expand macro="released_options" before_or_after="after"/>
 
@@ -254,7 +262,7 @@
                 <param name="decompress" value="true"/>
             </section>
             <assert_stderr>
-                <has_text text="No assemblies found that match selection"/>
+                <has_text text="no genome assemblies were found"/>
             </assert_stderr>
             <!-- In the current state of the NCBI tool/DB, no output to check.
              But the returned results seem to change from time to time and it might
@@ -297,14 +305,14 @@
                     <assert_contents>
                         <has_text text="GCF_000007445.1"/>
                         <has_n_lines n="2"/>
-                        <has_n_columns n="14"/>
+                        <has_n_columns n="15"/>
                     </assert_contents>
                 </element>
                 <element name="GCF_000013305.1">
                     <assert_contents>
                         <has_text text="GCF_000013305.1"/>
                         <has_n_lines n="2"/>
-                        <has_n_columns n="14"/>
+                        <has_n_columns n="15"/>
                     </assert_contents>
                 </element>
             </output_collection>
@@ -332,8 +340,8 @@
             </section>
             <output name="genome_data_report">
                 <assert_contents>
-                   <has_text text="GCF_000013305.1"/>
-                   <has_text text="GCF_000007445.1"/>
+                    <has_text text="GCF_000013305.1"/>
+                    <has_text text="GCF_000007445.1"/>
                     <has_n_lines n="3"/>
                     <has_n_columns n="4"/>
                 </assert_contents>
@@ -349,7 +357,7 @@
         </test>
 
         <!-- should not fail https://github.com/ncbi/datasets/issues/194 -->
-        <test expect_num_outputs="2"> <!-- expect_failure="true"> -->
+        <test expect_num_outputs="2">
             <conditional name="query|subcommand">
                 <param name="download_by" value="accession"/>
                 <conditional name="text_or_file">
@@ -362,9 +370,17 @@
             <section name="file_choices">
                 <param name="include" value="seq-report"/>
             </section>
-            <!-- 
-            <output_collection name="sequence_report" type="list" count="4" >
-            -->
+            <output name="genome_data_report">
+                <!-- Assert that we get at least the 16 versions that were available when this test was written. -->
+                <assert_contents>
+                    <has_text text="GCF_000001405" min="16"/>
+                    <has_n_lines min="16"/>
+                    <has_n_columns n="4"/>
+                </assert_contents>
+            </output>
+            <!-- Not testing the collection output: the count will change over time,
+                so it cannot be tested with a fixed value at the moment.
+                <output_collection name="sequence_report" type="list" count="16"/> -->
         </test>
         <test expect_num_outputs="5">
             <conditional name="query|subcommand">
@@ -413,7 +429,7 @@
                 <element name="GCF_000146045.2">
                     <element name="GCF_000146045.2_R64" ftype="fasta.gz">
                         <assert_contents>
-                            <has_size value="3843460"/>
+                            <has_size value="3843460" delta="2000"/>
                         </assert_contents>
                     </element>
                 </element>
@@ -421,14 +437,14 @@
             <output_collection name="protein_fasta" type="list" count="1">
                 <element name="GCF_000146045.2" ftype="fasta.gz">
                     <assert_contents>
-                        <has_size value="1845038"/>
+                        <has_size value="1845038" delta="2000"/>
                     </assert_contents>
                 </element>
             </output_collection>
             <output_collection name="rna_fasta" type="list" count="1">
                 <element name="GCF_000146045.2" ftype="fasta.gz">
                     <assert_contents>
-                        <has_size value="2784899"/>
+                        <has_size min="2700000" max="2800000"/>
                     </assert_contents>
                 </element>
             </output_collection>
@@ -447,14 +463,13 @@
             </section>
             <output_collection name="sequence_report" type="list" count="2"/>
             <output_collection name="genome_fasta" type="list:list" count="2">
-                <expand macro="genome_fasta_assert" el1="GCF_000002945.1" el2="GCF_000002945.1_ASM294v2" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="4"/>
-                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc] (mitochondrion|chromosome .*), complete (sequence|genome)" expression_n="17"/>
+                <expand macro="genome_fasta_assert" el1="GCF_000002945.2" el2="GCF_000002945.2_ASM294v3" expression=">NC_[0-9]+\.[0-9]+ Schizosaccharomyces pombe.*" expression_n="4"/>
+                <expand macro="genome_fasta_assert" el1="GCF_000146045.2" el2="GCF_000146045.2_R64" expression=">NC_[0-9]+\.[0-9]+ Saccharomyces cerevisiae S288[Cc].*" expression_n="17"/>
             </output_collection>
         </test>
-        <!-- tax_exact_match seems not able to filter out strains
-             https://github.com/ncbi/datasets/issues/187
-             hence we set  expect_test_failure="true"-->
-        <test expect_num_outputs="1" expect_test_failure="true">
+        <!-- tax_exact_match should filter out strains; see
+             https://github.com/ncbi/datasets/issues/187 -->
+        <test expect_num_outputs="1">
             <conditional name="query|subcommand">
                 <param name="download_by" value="taxon"/>
                 <param name="taxon_positional" value="4932"/>