changeset 3:4522bc2f7cca draft

"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
author bgruening
date Tue, 25 May 2021 20:11:49 +0000
parents 5afc675c2d1c
children 7f96f59e1d5c
files bionano_scaffold.xml macros.xml remove_fake_cut_sites.py test-data/test_05_report.txt
diffstat 4 files changed, 198 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/bionano_scaffold.xml	Sun May 23 17:21:47 2021 +0000
+++ b/bionano_scaffold.xml	Tue May 25 20:11:49 2021 +0000
@@ -7,6 +7,8 @@
     <expand macro="requirements"/>
     <command detect_errors="exit_code"><![CDATA[
         #set RefAligner = '/usr/local/bin/RefAligner'
+        #set output_file_NCBI = 'hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD_NCBI.fasta'
+        #set output_file_not_scaffolded = 'hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD_NOT_SCAFFOLDED.fasta'
         ## softlinks do not work
         cp '${ngs_fasta}' ./ngs.fasta
         && cp '${bionano_cmap}' ./bionano.cmap
@@ -33,8 +35,7 @@
         -r $RefAligner
         #if $conflict_resolution
             -M '${conflict_resolution}'
-        #end if
-        #if not $conflict_resolution
+        #else
             -B $conflict_filter_genome
             -N $conflict_filter_sequence
         #end if
@@ -54,7 +55,10 @@
         -f
         $zip_file
         -o ./
-
+        && cat $output_file_NCBI $output_file_not_scaffolded > total_contigs_raw.fasta
+        #if $trim_cut_sites
+            && python '$__tool_directory__/remove_fake_cut_sites.py' 'total_contigs_raw.fasta' 'total_contigs_trimmed.fasta' 'output.log'
+        #end if        
     ]]>    </command>
     <configfiles>
         <configfile name="vgp_mode"><![CDATA[
@@ -70,7 +74,7 @@
                         <flag attr="RAmem" val0="3" val1="1"/>
                     </global>
                     <fasta2cmap>
-                        <flag attr="enzyme" val0="$configuration_options.enzyme" display="Enzyme" group="FASTA to CMAP digestion" description="Define single enzyme for in-silico FASTA to CMAP digestion. Avalible enzymes: BspQI, BbvCI, BsmI, BsrDI, BssSI, DLE1."/>
+                        <flag attr="enzyme" val0="$configuration_options.enzyme" display="Enzyme" group="FASTA to CMAP digestion" description="Define single enzyme for in-silico FASTA to CMAP digestion. Available enzymes: BspQI, BbvCI, BsmI, BsrDI, BssSI, DLE1."/>
                         <flag attr="channelNum" val0="1" display="Channel number" group="FASTA to CMAP digestion" description="Specify the channel the enzyme was used."/>
                         <flag attr="minLabels" val0="0" display="Minimum label sites" group="FASTA to CMAP digestion" description="Specify minimum number of label sites per digested contig."/>
                         <flag attr="minLength" val0="0" display="Minimum length (Kb)" group="FASTA to CMAP digestion" description="Specify minimum length in Kb of each digested contig."/>
@@ -384,6 +388,7 @@
             <option value="3">Exclude conflicting contig</option>
         </param>
         <param name="zip_file" argument="-z" type="boolean" truevalue="-z results.zip" falsevalue="" checked="false" label="Generate an output package in ZIP format" help="The hybrid scaffold output package (.zip) can be imported into Access for visualization" />
+        <param name="trim_cut_sites" type="boolean" checked="true" label="Remove BioNano cut sites" help="This option removes the spurious BioNano cut sites that are inserted into gaps in some assemblies, replacing them with Ns." />
         <!-- 
         
         Those options have been disabled because the Docker container doesn't include the required packages
@@ -415,16 +420,22 @@
         -->
     </inputs>
     <outputs>
-        <data name="ngs_contigs_scaffold_fasta" format="fasta" from_work_dir="hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD.fasta" label="${tool.name} on ${on_string}: NGScontigs hybrid scaffold (fasta)"/>
-        <data name="ngs_contigs_scaffold_agp" format="txt" from_work_dir="hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD.agp" label="${tool.name} on ${on_string}: NGScontigs hybrid scaffold (agp)"/>
-        <data name="ngs_contigs_scaffold_gap" format="txt" from_work_dir="hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD.gap" label="${tool.name} on ${on_string}: NGScontigs hybrid scaffold (gap)"/>
+        <data name="ngs_contigs" format="fasta" from_work_dir="total_contigs_raw.fasta" label="${tool.name} on ${on_string}: NGScontigs">
+            <filter>trim_cut_sites == False</filter>
+        </data>
+        <data name="ngs_contigs_trimmed" format="fasta" from_work_dir="total_contigs_trimmed.fasta" label="${tool.name} on ${on_string}: NGScontigs trimmed">
+            <filter>trim_cut_sites</filter>
+        </data>
+        <data name="ngs_contigs_trimmed_report" format="txt" from_work_dir="output.log" label="${tool.name} on ${on_string}: NGScontigs trimmed log">
+            <filter>trim_cut_sites</filter>
+        </data>
         <data name="report" format="txt" from_work_dir="hybrid_scaffolds/hybrid_scaffold_informatics_report.txt" label="${tool.name} on ${on_string}: hybrid scaffold report"/>
         <data name="results" format="zip" from_work_dir="results.zip" label="${tool.name} on ${on_string}: results">
             <filter>zip_file</filter>
         </data>
     </outputs>
     <tests>
-        <test expect_num_outputs="5">
+        <test expect_num_outputs="3">
             <param name="ngs_fasta" value="assembly.fasta.gz"/>
             <param name="bionano_cmap" value="colormap_assembly.cmap"/>
             <param name="conflict_filter_genome" value="3"/>
@@ -434,13 +445,12 @@
                 <param name="configuration_file" value="configuration.xml"/>
             </conditional>
             <param name="zip_file" value="true"/>
-            <output name="ngs_contigs_scaffold_fasta" ftype="fasta">
+            <param name="trim_cut_sites" value="false"/>
+            <output name="ngs_contigs" ftype="fasta">
                 <assert_contents>
                     <has_size value="4753369" delta="300" />
                 </assert_contents>
             </output>
-            <output name="ngs_contigs_scaffold_agp" file="test_01.agp" ftype="txt"/>
-            <output name="ngs_contigs_scaffold_gap" file="test_01.gap" ftype="txt"/>
             <output name="report" file="test_01_report.txt" ftype="txt"/>
             <output name="results" ftype="zip">
                 <assert_contents>
@@ -464,24 +474,23 @@
                 <has_text text="hybridScaffold"/>
             </assert_stdout>
         </test>
-        <test expect_num_outputs="4">
+        <test expect_num_outputs="2">
             <param name="ngs_fasta" value="assembly.fasta.gz"/>
             <param name="bionano_cmap" value="colormap_assembly.cmap"/>
             <param name="conflict_filter_genome" value="2"/>
             <param name="conflict_filter_sequence" value="2"/>
+            <param name="trim_cut_sites" value="false"/>
             <conditional name="configuration_options">
                 <param name="configuration" value="file"/>
                 <param name="configuration_file" value="configuration.xml"/>
             </conditional>
-            <output name="ngs_contigs_scaffold_fasta" ftype="fasta">
+            <output name="ngs_contigs" ftype="fasta">
                 <assert_contents>
                     <has_size value="4753369" delta="100" />
                     <has_n_lines n="2"/>
                     <has_line line=">Super-Scaffold_1"/>
                 </assert_contents>
             </output>
-            <output name="ngs_contigs_scaffold_agp" file="test_02.agp" ftype="txt"/>
-            <output name="ngs_contigs_scaffold_gap" file="test_02.gap" ftype="txt"/>
             <output name="report" file="test_02_report.txt" ftype="txt"/>
             <assert_stdout>
                 <has_text text='attr="maxmem" val0="8"'/>
@@ -500,7 +509,7 @@
                 <!--attribute_is path="hybridScaffold/global/flag[@attr='maxthreads']" attribute="val0" text="2"/-->
             </assert_stdout>
         </test>
-        <test expect_num_outputs="5">
+        <test expect_num_outputs="3">
             <param name="ngs_fasta" value="assembly.fasta.gz"/>
             <param name="bionano_cmap" value="colormap_assembly.cmap"/>
             <param name="conflict_filter_genome" value="2"/>
@@ -510,15 +519,14 @@
                 <param name="configuration_file" value="configuration.xml"/>
             </conditional>
             <param name="zip_file" value="true"/>
-            <output name="ngs_contigs_scaffold_fasta" ftype="fasta">
+            <param name="trim_cut_sites" value="false"/>
+            <output name="ngs_contigs" ftype="fasta">
                 <assert_contents>
                     <has_size value="4753369" delta="100" />
                     <has_n_lines n="2"/>
                     <has_line line=">Super-Scaffold_1"/>
                 </assert_contents>
             </output>
-            <output name="ngs_contigs_scaffold_agp" file="test_03.agp" ftype="txt"/>
-            <output name="ngs_contigs_scaffold_gap" file="test_03.gap" ftype="txt"/>
             <output name="report" file="test_03_report.txt" ftype="txt"/>
             <output name="results" ftype="zip">
                 <assert_contents>
@@ -542,7 +550,7 @@
                 <has_text text="hybridScaffold"/>
             </assert_stdout>
         </test>
-        <test expect_num_outputs="5">
+        <test expect_num_outputs="3">
             <param name="ngs_fasta" value="assembly.fasta.gz"/>
             <param name="bionano_cmap" value="colormap_assembly.cmap"/>
             <param name="conflict_filter_genome" value="2"/>
@@ -552,15 +560,14 @@
                 <param name="enzyme" value="BspQI"/>
             </conditional>
             <param name="zip_file" value="true"/>
-            <output name="ngs_contigs_scaffold_fasta" ftype="fasta">
+            <param name="trim_cut_sites" value="false"/>
+            <output name="ngs_contigs" ftype="fasta">
                 <assert_contents>
                     <has_size value="4753369" delta="100" />
                     <has_n_lines n="2"/>
                     <has_line line=">Super-Scaffold_1"/>
                 </assert_contents>
             </output>
-            <output name="ngs_contigs_scaffold_agp" file="test_04.agp" ftype="txt"/>
-            <output name="ngs_contigs_scaffold_gap" file="test_04.gap" ftype="txt"/>
             <output name="report" file="test_04_report.txt" ftype="txt"/>
             <output name="results" ftype="zip">
                 <assert_contents>
@@ -583,6 +590,43 @@
                 <has_text text="hybridScaffold"/>
             </assert_stdout>
         </test>
+        <test expect_num_outputs="3">
+            <param name="ngs_fasta" value="assembly.fasta.gz"/>
+            <param name="bionano_cmap" value="colormap_assembly.cmap"/>
+            <param name="conflict_filter_genome" value="3"/>
+            <param name="conflict_filter_sequence" value="3"/>
+            <conditional name="configuration_options">
+                <param name="configuration" value="file"/>
+                <param name="configuration_file" value="configuration.xml"/>
+            </conditional>
+            <param name="trim_cut_sites" value="true"/>
+            <output name="ngs_contigs_trimmed" ftype="fasta">
+                <assert_contents>
+                    <has_size value="4832591" delta="300" />
+                </assert_contents>
+            </output>
+            <output name="ngs_contigs_trimmed_report" ftype="txt">
+                <assert_contents>
+                    <has_size value="0" />
+                </assert_contents>
+            </output>
+            <output name="report" file="test_05_report.txt" ftype="txt"/>
+            <assert_stdout>
+                <has_text text='attr="maxmem" val0="8"'/>
+            </assert_stdout>
+            <assert_stdout>
+                <has_text text='attr="maxthreads" val0="1"'/>
+            </assert_stdout>
+            <assert_stdout>
+                <has_text text='attr="insertThreads" val0="1"'/>
+            </assert_stdout>
+            <assert_stdout>
+                <has_text text='attr="maxvirtmem" val0="8"'/>
+            </assert_stdout>
+            <assert_stdout>
+                <has_text text="hybridScaffold"/>
+            </assert_stdout>
+        </test>
     </tests>
     <help><![CDATA[
 .. class:: infomark
--- a/macros.xml	Sun May 23 17:21:47 2021 +0000
+++ b/macros.xml	Tue May 25 20:11:49 2021 +0000
@@ -1,6 +1,6 @@
 <macros>
     <token name="@TOOL_VERSION@">3.6.1</token>
-    <token name="@GALAXY_TOOL_VERSION@">galaxy0</token>
+    <token name="@GALAXY_TOOL_VERSION@">galaxy1</token>
     <token name="@BIONANO_SUPPORT_TEXT@">
 Bionano Genomics has agreed to provide the licensed Bionano Solve
 software to enable the VGP to package the software in a container.
@@ -23,7 +23,7 @@
     </xml>
     <xml name="requirements">
         <requirements>
-            <container type="docker">bionanodocker/bionano-docker-scaffold:latest</container>
+            <container type="docker">quay.io/galaxy/bionano-docker-scaffold:1.6.01-bio</container>
         </requirements>
     </xml>
     <macro name="sanitize_string" >
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/remove_fake_cut_sites.py	Tue May 25 20:11:49 2021 +0000
@@ -0,0 +1,84 @@
+import re
+import sys
+
+from Bio import SeqIO
+from Bio.Seq import Seq
+
+
+def main():
+    """Mask spurious Bionano cut sites that sit inside gaps of a FASTA file.
+
+    Command-line arguments (read from ``sys.argv``):
+        1. input FASTA path
+        2. output FASTA path (sequences are written upper-cased)
+        3. log path (change and leftover counts, only for affected records)
+
+    A cut site, in either orientation, flanked by ``N`` on both sides is
+    replaced with a run of ``N`` of the same length. Records are handled one
+    at a time, in input order.
+    """
+    fasta_file = sys.argv[1]
+    output_file = sys.argv[2]
+    log_file = sys.argv[3]
+
+    n_flank_length = 1
+    cut_sites = [
+        Seq("CTTAAG"),
+        Seq("CTTCTCG"),
+        Seq("GCTCTTC"),
+        Seq("CCTCAGC"),
+        Seq("GAATGC"),
+        Seq("GCAATG"),
+        Seq("ATCGAT"),
+        Seq("CACGAG"),
+    ]
+
+    # Build the (pattern, replacement) pairs once instead of once per record.
+    # Palindromic sites intentionally appear twice: the repeated pass masks
+    # a site whose leading N was consumed by the match just before it.
+    substitutions = []
+    for cut_site in cut_sites:
+        for oriented_site in (cut_site, cut_site.reverse_complement()):
+            flank = "N" * n_flank_length
+            substitutions.append(
+                (
+                    re.compile(flank + str(oriented_site) + flank),
+                    "N" * (n_flank_length * 2 + len(oriented_site)),
+                )
+            )
+
+    # Short non-N stretches still enclosed by N after masking.
+    leftover_pattern = re.compile("N[^N]{1,10}N")
+
+    # Context managers close all three files even if parsing fails midway.
+    with open(output_file, "w") as output_handle, open(
+        log_file, "w"
+    ) as log_handle, open(fasta_file, "r") as fasta_input_handle:
+        for record in SeqIO.parse(fasta_input_handle, "fasta"):
+            # Patterns are upper case, so matching the upper-cased sequence
+            # is already case-insensitive; re.IGNORECASE is not needed.
+            sequence = str(record.seq.upper())
+            change_count = 0
+            for pattern, replacement in substitutions:
+                sequence, changes = pattern.subn(replacement, sequence)
+                change_count += changes
+            record.seq = Seq(sequence)
+
+            if change_count > 0:
+                log_handle.write(
+                    "{} : {} changes\n".format(record.id, change_count)
+                )
+            SeqIO.write([record], output_handle, "fasta")
+
+            # Finally, count what still looks like a fake cut site.
+            possible_fake_cut_sites = leftover_pattern.findall(sequence)
+            if len(possible_fake_cut_sites) > 0:
+                log_handle.write(
+                    "{} : {} possible non-standard fake cut sites\n".format(
+                        record.id, len(possible_fake_cut_sites)
+                    )
+                )
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_05_report.txt	Tue May 25 20:11:49 2021 +0000
@@ -0,0 +1,45 @@
+Original BioNano Genome Map statistics:
+Count  = 2
+Min length (Mbp) = 0.720
+Median length (Mbp) = 2.313
+Mean length (Mbp) = 2.313
+N50 length (Mbp) = 3.906
+Max length (Mbp) = 3.906
+Total length (Mbp) = 4.625
+
+Original NGS sequences statistics:
+Count  = 1
+Min length (Mbp) = 4.753
+Median length (Mbp) = 4.753
+Mean length (Mbp) = 4.753
+N50 length (Mbp) = 4.753
+Max length (Mbp) = 4.753
+Total length (Mbp) = 4.753
+
+NGS FASTA sequence in hybrid scaffold statistics:
+Count  = 1
+Min length (Mbp) = 4.753
+Median length (Mbp) = 4.753
+Mean length (Mbp) = 4.753
+N50 length (Mbp) = 4.753
+Max length (Mbp) = 4.753
+Total length (Mbp) = 4.753
+
+Hybrid scaffold FASTA statistics:
+Count  = 1
+Min length (Mbp) = 4.753
+Median length (Mbp) = 4.753
+Mean length (Mbp) = 4.753
+N50 length (Mbp) = 4.753
+Max length (Mbp) = 4.753
+Total length (Mbp) = 4.753
+
+Hybrid scaffold FASTA plus not scaffolded NGS FASTA statistics:
+Count  = 1
+Min length (Mbp) = 4.753
+Median length (Mbp) = 4.753
+Mean length (Mbp) = 4.753
+N50 length (Mbp) = 4.753
+Max length (Mbp) = 4.753
+Total length (Mbp) = 4.753
+