Mercurial > repos > bgruening > bionano_scaffold
changeset 3:4522bc2f7cca draft
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
author | bgruening |
---|---|
date | Tue, 25 May 2021 20:11:49 +0000 |
parents | 5afc675c2d1c |
children | 7f96f59e1d5c |
files | bionano_scaffold.xml macros.xml remove_fake_cut_sites.py test-data/test_05_report.txt |
diffstat | 4 files changed, 198 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
--- a/bionano_scaffold.xml Sun May 23 17:21:47 2021 +0000 +++ b/bionano_scaffold.xml Tue May 25 20:11:49 2021 +0000 @@ -7,6 +7,8 @@ <expand macro="requirements"/> <command detect_errors="exit_code"><![CDATA[ #set RefAligner = '/usr/local/bin/RefAligner' + #set output_file_NCBI = 'hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD_NCBI.fasta' + #set output_file_not_scaffolded = 'hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD_NOT_SCAFFOLDED.fasta' ## softlinks do not work cp '${ngs_fasta}' ./ngs.fasta && cp '${bionano_cmap}' ./bionano.cmap @@ -33,8 +35,7 @@ -r $RefAligner #if $conflict_resolution -M '${conflict_resolution}' - #end if - #if not $conflict_resolution + #else -B $conflict_filter_genome -N $conflict_filter_sequence #end if @@ -54,7 +55,10 @@ -f $zip_file -o ./ - + && cat $output_file_NCBI $output_file_not_scaffolded > total_contigs_raw.fasta + #if $trim_cut_sites + && python '$__tool_directory__/remove_fake_cut_sites.py' 'total_contigs_raw.fasta' 'total_contigs_trimmed.fasta' 'output.log' + #end if ]]> </command> <configfiles> <configfile name="vgp_mode"><![CDATA[ @@ -70,7 +74,7 @@ <flag attr="RAmem" val0="3" val1="1"/> </global> <fasta2cmap> - <flag attr="enzyme" val0="$configuration_options.enzyme" display="Enzyme" group="FASTA to CMAP digestion" description="Define single enzyme for in-silico FASTA to CMAP digestion. Avalible enzymes: BspQI, BbvCI, BsmI, BsrDI, BssSI, DLE1."/> + <flag attr="enzyme" val0="$configuration_options.enzyme" display="Enzyme" group="FASTA to CMAP digestion" description="Define single enzyme for in-silico FASTA to CMAP digestion. 
Available enzymes: BspQI, BbvCI, BsmI, BsrDI, BssSI, DLE1."/> <flag attr="channelNum" val0="1" display="Channel number" group="FASTA to CMAP digestion" description="Specify the channel the enzyme was used."/> <flag attr="minLabels" val0="0" display="Minimum label sites" group="FASTA to CMAP digestion" description="Specify minimum number of label sites per digested contig."/> <flag attr="minLength" val0="0" display="Minimum length (Kb)" group="FASTA to CMAP digestion" description="Specify minimum length in Kb of each digested contig."/> @@ -384,6 +388,7 @@ <option value="3">Exclude conflicting contig</option> </param> <param name="zip_file" argument="-z" type="boolean" truevalue="-z results.zip" falsevalue="" checked="false" label="Generate an output package in ZIP format" help="The hybrid scaffold output package (.zip) can be imported into Access for visualization" /> + <param name="trim_cut_sites" type="boolean" checked="true" label="Remove BioNano cut sites" help="This option removes the spurious BioNano cut sites that are inserted into gaps in some assemblies, replacing them with Ns." 
/> <!-- Those options have been disabled because the Docker container doesn't include the required packages @@ -415,16 +420,22 @@ --> </inputs> <outputs> - <data name="ngs_contigs_scaffold_fasta" format="fasta" from_work_dir="hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD.fasta" label="${tool.name} on ${on_string}: NGScontigs hybrid scaffold (fasta)"/> - <data name="ngs_contigs_scaffold_agp" format="txt" from_work_dir="hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD.agp" label="${tool.name} on ${on_string}: NGScontigs hybrid scaffold (agp)"/> - <data name="ngs_contigs_scaffold_gap" format="txt" from_work_dir="hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD.gap" label="${tool.name} on ${on_string}: NGScontigs hybrid scaffold (gap)"/> + <data name="ngs_contigs" format="fasta" from_work_dir="total_contigs_raw.fasta" label="${tool.name} on ${on_string}: NGScontigs"> + <filter>trim_cut_sites == False</filter> + </data> + <data name="ngs_contigs_trimmed" format="fasta" from_work_dir="total_contigs_trimmed.fasta" label="${tool.name} on ${on_string}: NGScontigs trimmed"> + <filter>trim_cut_sites</filter> + </data> + <data name="ngs_contigs_trimmed_report" format="txt" from_work_dir="output.log" label="${tool.name} on ${on_string}: NGScontigs trimmed log"> + <filter>trim_cut_sites</filter> + </data> <data name="report" format="txt" from_work_dir="hybrid_scaffolds/hybrid_scaffold_informatics_report.txt" label="${tool.name} on ${on_string}: hybrid scaffold report"/> <data name="results" format="zip" from_work_dir="results.zip" label="${tool.name} on ${on_string}: results"> <filter>zip_file</filter> </data> </outputs> <tests> - <test expect_num_outputs="5"> + <test expect_num_outputs="3"> <param name="ngs_fasta" value="assembly.fasta.gz"/> <param name="bionano_cmap" value="colormap_assembly.cmap"/> <param name="conflict_filter_genome" value="3"/> @@ -434,13 +445,12 @@ <param 
name="configuration_file" value="configuration.xml"/> </conditional> <param name="zip_file" value="true"/> - <output name="ngs_contigs_scaffold_fasta" ftype="fasta"> + <param name="trim_cut_sites" value="false"/> + <output name="ngs_contigs" ftype="fasta"> <assert_contents> <has_size value="4753369" delta="300" /> </assert_contents> </output> - <output name="ngs_contigs_scaffold_agp" file="test_01.agp" ftype="txt"/> - <output name="ngs_contigs_scaffold_gap" file="test_01.gap" ftype="txt"/> <output name="report" file="test_01_report.txt" ftype="txt"/> <output name="results" ftype="zip"> <assert_contents> @@ -464,24 +474,23 @@ <has_text text="hybridScaffold"/> </assert_stdout> </test> - <test expect_num_outputs="4"> + <test expect_num_outputs="2"> <param name="ngs_fasta" value="assembly.fasta.gz"/> <param name="bionano_cmap" value="colormap_assembly.cmap"/> <param name="conflict_filter_genome" value="2"/> <param name="conflict_filter_sequence" value="2"/> + <param name="trim_cut_sites" value="false"/> <conditional name="configuration_options"> <param name="configuration" value="file"/> <param name="configuration_file" value="configuration.xml"/> </conditional> - <output name="ngs_contigs_scaffold_fasta" ftype="fasta"> + <output name="ngs_contigs" ftype="fasta"> <assert_contents> <has_size value="4753369" delta="100" /> <has_n_lines n="2"/> <has_line line=">Super-Scaffold_1"/> </assert_contents> </output> - <output name="ngs_contigs_scaffold_agp" file="test_02.agp" ftype="txt"/> - <output name="ngs_contigs_scaffold_gap" file="test_02.gap" ftype="txt"/> <output name="report" file="test_02_report.txt" ftype="txt"/> <assert_stdout> <has_text text='attr="maxmem" val0="8"'/> @@ -500,7 +509,7 @@ <!--attribute_is path="hybridScaffold/global/flag[@attr='maxthreads']" attribute="val0" text="2"/--> </assert_stdout> </test> - <test expect_num_outputs="5"> + <test expect_num_outputs="3"> <param name="ngs_fasta" value="assembly.fasta.gz"/> <param name="bionano_cmap" 
value="colormap_assembly.cmap"/> <param name="conflict_filter_genome" value="2"/> @@ -510,15 +519,14 @@ <param name="configuration_file" value="configuration.xml"/> </conditional> <param name="zip_file" value="true"/> - <output name="ngs_contigs_scaffold_fasta" ftype="fasta"> + <param name="trim_cut_sites" value="false"/> + <output name="ngs_contigs" ftype="fasta"> <assert_contents> <has_size value="4753369" delta="100" /> <has_n_lines n="2"/> <has_line line=">Super-Scaffold_1"/> </assert_contents> </output> - <output name="ngs_contigs_scaffold_agp" file="test_03.agp" ftype="txt"/> - <output name="ngs_contigs_scaffold_gap" file="test_03.gap" ftype="txt"/> <output name="report" file="test_03_report.txt" ftype="txt"/> <output name="results" ftype="zip"> <assert_contents> @@ -542,7 +550,7 @@ <has_text text="hybridScaffold"/> </assert_stdout> </test> - <test expect_num_outputs="5"> + <test expect_num_outputs="3"> <param name="ngs_fasta" value="assembly.fasta.gz"/> <param name="bionano_cmap" value="colormap_assembly.cmap"/> <param name="conflict_filter_genome" value="2"/> @@ -552,15 +560,14 @@ <param name="enzyme" value="BspQI"/> </conditional> <param name="zip_file" value="true"/> - <output name="ngs_contigs_scaffold_fasta" ftype="fasta"> + <param name="trim_cut_sites" value="false"/> + <output name="ngs_contigs" ftype="fasta"> <assert_contents> <has_size value="4753369" delta="100" /> <has_n_lines n="2"/> <has_line line=">Super-Scaffold_1"/> </assert_contents> </output> - <output name="ngs_contigs_scaffold_agp" file="test_04.agp" ftype="txt"/> - <output name="ngs_contigs_scaffold_gap" file="test_04.gap" ftype="txt"/> <output name="report" file="test_04_report.txt" ftype="txt"/> <output name="results" ftype="zip"> <assert_contents> @@ -583,6 +590,43 @@ <has_text text="hybridScaffold"/> </assert_stdout> </test> + <test expect_num_outputs="3"> + <param name="ngs_fasta" value="assembly.fasta.gz"/> + <param name="bionano_cmap" value="colormap_assembly.cmap"/> + <param 
name="conflict_filter_genome" value="3"/> + <param name="conflict_filter_sequence" value="3"/> + <conditional name="configuration_options"> + <param name="configuration" value="file"/> + <param name="configuration_file" value="configuration.xml"/> + </conditional> + <param name="trim_cut_sites" value="true"/> + <output name="ngs_contigs_trimmed" ftype="fasta"> + <assert_contents> + <has_size value="4832591" delta="300" /> + </assert_contents> + </output> + <output name="ngs_contigs_trimmed_report" ftype="txt"> + <assert_contents> + <has_size value="0" /> + </assert_contents> + </output> + <output name="report" file="test_05_report.txt" ftype="txt"/> + <assert_stdout> + <has_text text='attr="maxmem" val0="8"'/> + </assert_stdout> + <assert_stdout> + <has_text text='attr="maxthreads" val0="1"'/> + </assert_stdout> + <assert_stdout> + <has_text text='attr="insertThreads" val0="1"'/> + </assert_stdout> + <assert_stdout> + <has_text text='attr="maxvirtmem" val0="8"'/> + </assert_stdout> + <assert_stdout> + <has_text text="hybridScaffold"/> + </assert_stdout> + </test> </tests> <help><![CDATA[ .. class:: infomark
--- a/macros.xml Sun May 23 17:21:47 2021 +0000 +++ b/macros.xml Tue May 25 20:11:49 2021 +0000 @@ -1,6 +1,6 @@ <macros> <token name="@TOOL_VERSION@">3.6.1</token> - <token name="@GALAXY_TOOL_VERSION@">galaxy0</token> + <token name="@GALAXY_TOOL_VERSION@">galaxy1</token> <token name="@BIONANO_SUPPORT_TEXT@"> Bionano Genomics has agreed to provide the licensed Bionano Solve software to enable the VGP to package the software in a container. @@ -23,7 +23,7 @@ </xml> <xml name="requirements"> <requirements> - <container type="docker">bionanodocker/bionano-docker-scaffold:latest</container> + <container type="docker">quay.io/galaxy/bionano-docker-scaffold:1.6.01-bio</container> </requirements> </xml> <macro name="sanitize_string" >
# Provenance (unified-diff header of this file in the changeset view):
#   --- /dev/null                   Thu Jan 01 00:00:00 1970 +0000
#   +++ b/remove_fake_cut_sites.py  Tue May 25 20:11:49 2021 +0000
"""Replace N-flanked cut-site motifs in a FASTA file with runs of N.

Usage::

    remove_fake_cut_sites.py <input.fasta> <output.fasta> <log.txt>

Every record of the input FASTA is upper-cased, each occurrence of a known
cut-site motif (in either orientation) that is flanked by ``N`` on both sides
is overwritten with ``N``, and the record is written to the output FASTA.
The log file receives one line per record that was changed and one line per
record that still contains short non-``N`` stretches enclosed by ``N``.
"""
import re
import sys

from Bio import SeqIO
from Bio.Seq import Seq

# Motifs that are masked when found between N flanks.  Each motif is searched
# in the listed orientation and as its reverse complement.
CUT_SITES = (
    "CTTAAG",
    "CTTCTCG",
    "GCTCTTC",
    "CCTCAGC",
    "GAATGC",
    "GCAATG",
    "ATCGAT",
    "CACGAG",
)

# Number of N characters required on each side of a motif.
N_FLANK_LENGTH = 1

# A short (1-10 character) non-N stretch enclosed by N on both sides; whatever
# still matches after masking is reported as a possible non-standard site.
POSSIBLE_FAKE_CUT_SITE = re.compile("N[^N]{1,10}N")


def _build_masking_rules():
    """Return the ordered list of ``(compiled_pattern, replacement)`` pairs.

    The order is: for every motif in ``CUT_SITES``, first the motif itself,
    then its reverse complement.  Palindromic motifs therefore appear twice,
    exactly as in the per-record loop this replaces.

    The trailing flank is matched with a look-ahead instead of being
    consumed.  ``re.subn`` only reports non-overlapping matches, so when two
    sites share a single N (``N<site>N<site>N``) a pattern that consumes the
    trailing N can never match the second site.  Because the sequence is
    upper-cased before substitution, leaving the trailing N untouched yields
    the same text as overwriting it with N.
    """
    flank = "N" * N_FLANK_LENGTH
    rules = []
    for site in CUT_SITES:
        forward = Seq(site)
        for oriented in (forward, forward.reverse_complement()):
            motif = str(oriented)
            pattern = re.compile(
                flank + motif + "(?=" + flank + ")",
                flags=re.IGNORECASE,
            )
            # Leading flank + motif become N; the trailing flank already is N.
            replacement = "N" * (N_FLANK_LENGTH + len(motif))
            rules.append((pattern, replacement))
    return rules


def _mask_fake_cut_sites(sequence, rules):
    """Apply every masking rule in order; return ``(masked, n_changes)``."""
    total_changes = 0
    for pattern, replacement in rules:
        sequence, changes = pattern.subn(replacement, sequence)
        total_changes += changes
    return sequence, total_changes


def main():
    """Mask N-flanked cut sites in a FASTA file.

    Reads three positional command-line arguments: the input FASTA path, the
    output FASTA path and the log path.  Exits with a usage message when
    fewer than three are given.  Output sequences are always upper-case.
    """
    if len(sys.argv) < 4:
        sys.exit(
            "usage: remove_fake_cut_sites.py "
            "<input.fasta> <output.fasta> <log.txt>"
        )
    fasta_file = sys.argv[1]
    output_file = sys.argv[2]
    log_file = sys.argv[3]

    rules = _build_masking_rules()

    # Context managers guarantee that all three files are closed even when
    # parsing or writing fails part-way through.
    with open(output_file, "w") as output_handle, \
            open(log_file, "w") as log_handle:
        with open(fasta_file, "r") as fasta_input_handle:
            for record in SeqIO.parse(fasta_input_handle, "fasta"):
                masked, change_count = _mask_fake_cut_sites(
                    str(record.seq.upper()), rules
                )
                record.seq = Seq(masked)

                if change_count > 0:
                    log_handle.write(
                        "{} : {} changes\n".format(record.id, change_count)
                    )
                SeqIO.write([record], output_handle, "fasta")

                # Finally, count what still looks like an N-enclosed site.
                possible_fake_cut_sites = POSSIBLE_FAKE_CUT_SITE.findall(masked)
                if possible_fake_cut_sites:
                    log_handle.write(
                        "{} : {} possible non-standard fake cut sites\n".format(
                            record.id, len(possible_fake_cut_sites)
                        )
                    )


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_05_report.txt Tue May 25 20:11:49 2021 +0000 @@ -0,0 +1,45 @@ +Original BioNano Genome Map statistics: +Count = 2 +Min length (Mbp) = 0.720 +Median length (Mbp) = 2.313 +Mean length (Mbp) = 2.313 +N50 length (Mbp) = 3.906 +Max length (Mbp) = 3.906 +Total length (Mbp) = 4.625 + +Original NGS sequences statistics: +Count = 1 +Min length (Mbp) = 4.753 +Median length (Mbp) = 4.753 +Mean length (Mbp) = 4.753 +N50 length (Mbp) = 4.753 +Max length (Mbp) = 4.753 +Total length (Mbp) = 4.753 + +NGS FASTA sequence in hybrid scaffold statistics: +Count = 1 +Min length (Mbp) = 4.753 +Median length (Mbp) = 4.753 +Mean length (Mbp) = 4.753 +N50 length (Mbp) = 4.753 +Max length (Mbp) = 4.753 +Total length (Mbp) = 4.753 + +Hybrid scaffold FASTA statistics: +Count = 1 +Min length (Mbp) = 4.753 +Median length (Mbp) = 4.753 +Mean length (Mbp) = 4.753 +N50 length (Mbp) = 4.753 +Max length (Mbp) = 4.753 +Total length (Mbp) = 4.753 + +Hybrid scaffold FASTA plus not scaffolded NGS FASTA statistics: +Count = 1 +Min length (Mbp) = 4.753 +Median length (Mbp) = 4.753 +Mean length (Mbp) = 4.753 +N50 length (Mbp) = 4.753 +Max length (Mbp) = 4.753 +Total length (Mbp) = 4.753 +