Mercurial > repos > nml > refseq_masher
comparison matches.xml @ 0:dbc58c4b2851 draft default tip
planemo upload commit 80c22275be05e29208e991019309dfffa9704f39
| author | nml |
|---|---|
| date | Thu, 15 Feb 2018 12:08:11 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dbc58c4b2851 |
|---|---|
| 1 <tool id="refseq_masher_matches" name="RefSeq Masher Matches" version="0.1.1"> | |
| 2 <description> | |
| 3 Find closest matching NCBI RefSeq Genomes to your sequences | |
| 4 </description> | |
| 5 <requirements> | |
| 6 <requirement type="package" version="0.1.1">refseq_masher</requirement> | |
| 7 </requirements> | |
| 8 <command detect_errors="exit_code"> | |
| 9 <![CDATA[ | |
| 10 ## Symlink each input into the job directory under a name built from its dataset or collection name, then run "refseq_masher matches" on those links. | |
| 11 #import re | |
| 12 ## Names are user-controlled, so every character outside A-Z a-z 0-9 _ . - becomes "_" before reaching the shell. Collection members are checked for gzip via their datatype extension because the dataset path itself carries no .gz suffix. | |
| 13 #if $input.type == 'fasta' | |
| 14 #set $input_files = '"{}"'.format($re.sub(r'[^A-Za-z0-9_.-]', '_', $input.fasta.name)) | |
| 15 ln -s "$input.fasta" $input_files && | |
| 16 #elif $input.type == 'paired' | |
| 17 #set $_forward_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.forward.name) else '.fastq' | |
| 18 #set $_forward = '"{}_1{}"'.format($re.sub(r'[^A-Za-z0-9_.-]', '_', $re.sub(r'_[12]\..+$', '', $input.forward.name)), $_forward_ext) | |
| 19 #set $_reverse_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.reverse.name) else '.fastq' | |
| 20 #set $_reverse = '"{}_2{}"'.format($re.sub(r'[^A-Za-z0-9_.-]', '_', $re.sub(r'_[12]\..+$', '', $input.reverse.name)), $_reverse_ext) | |
| 21 #set $input_files = '{} {}'.format($_forward, $_reverse) | |
| 22 ln -s "$input.forward" $_forward && | |
| 23 ln -s "$input.reverse" $_reverse && | |
| 24 #elif $input.type == 'single' | |
| 25 #set $input_files = '"{}"'.format($re.sub(r'[^A-Za-z0-9_.-]', '_', $input.single.name)) | |
| 26 ln -s "$input.single" $input_files && | |
| 27 #elif $input.type == 'paired_collection' | |
| 28 #set $_forward_ext = '.fastq.gz' if $input.paired_collection.forward.ext.endswith('.gz') else '.fastq' | |
| 29 #set $_forward = '"{}_1{}"'.format($re.sub(r'[^A-Za-z0-9_.-]', '_', $input.paired_collection.name), $_forward_ext) | |
| 30 #set $_reverse_ext = '.fastq.gz' if $input.paired_collection.reverse.ext.endswith('.gz') else '.fastq' | |
| 31 #set $_reverse = '"{}_2{}"'.format($re.sub(r'[^A-Za-z0-9_.-]', '_', $input.paired_collection.name), $_reverse_ext) | |
| 32 #set $input_files = '{} {}'.format($_forward, $_reverse) | |
| 33 ln -s "$input.paired_collection.forward" $_forward && | |
| 34 ln -s "$input.paired_collection.reverse" $_reverse && | |
| 35 #end if | |
| 36 #set $top_n_opt = '--top-n-results {}'.format($top_n_results) if str($top_n_results) not in ('', 'None') else '' | |
| 37 refseq_masher | |
| 38 $adv.verbosity | |
| 39 matches | |
| 40 --output refseq_masher-matches.${adv.output_type} | |
| 41 --output-type $adv.output_type | |
| 42 $top_n_opt | |
| 43 #if $adv.min_kmer_threshold | |
| 44 --min-kmer-threshold $adv.min_kmer_threshold | |
| 45 #end if | |
| 46 -T "\${TMPDIR:-/tmp}" | |
| 47 $input_files | |
| 48 ]]> | |
| 49 </command> | |
| 50 <inputs> | |
| 51 <conditional name="input"> <!-- Selects the kind of sequence input; the command template branches on input.type and reads only the params of the matching "when" block. --> | |
| 52 <param name="type" type="select" label="Sequence input type"> | |
| 53 <option value="fasta">Genome FASTA</option> | |
| 54 <option value="paired">Paired-end FASTQs</option> | |
| 55 <option value="single">Single-end FASTQ</option> | |
| 56 <option value="paired_collection">Paired-end FASTQ collection</option> | |
| 57 </param> | |
| 58 <when value="fasta"> <!-- One required FASTA dataset. --> | |
| 59 <param name="fasta" | |
| 60 type="data" format="fasta" | |
| 61 optional="false" | |
| 62 label="Genome FASTA file" | |
| 63 /> | |
| 64 </when> | |
| 65 <when value="paired"> <!-- Two required FASTQ datasets, chosen separately as forward and reverse reads. --> | |
| 66 <param name="forward" | |
| 67 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" | |
| 68 optional="false" | |
| 69 label="Forward FASTQ file" | |
| 70 /> | |
| 71 <param name="reverse" | |
| 72 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" | |
| 73 optional="false" | |
| 74 label="Reverse FASTQ file" | |
| 75 help="File format must match the Forward FASTQ file" | |
| 76 /> | |
| 77 </when> | |
| 78 <when value="single"> <!-- One required FASTQ dataset of unpaired reads. --> | |
| 79 <param name="single" | |
| 80 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" | |
| 81 optional="false" | |
| 82 label="Single-end FASTQ file" | |
| 83 /> | |
| 84 </when> | |
| 85 <when value="paired_collection"> <!-- One required paired collection; unlike the other FASTQ branches, its format list also admits fastq.gz and txt. --> | |
| 86 <param name="paired_collection" | |
| 87 type="data_collection" format="fastq,fastqsanger,fastqillumina,fastqsolexa,fastq.gz,txt" | |
| 88 collection_type="paired" | |
| 89 optional="false" | |
| 90 label="Paired-end FASTQ collection" | |
| 91 /> | |
| 92 </when> | |
| 93 </conditional> | |
| 94 <param name="top_n_results" | |
| 95 type="integer" | |
| 96 min="0" | |
| 97 value="20" | |
| 98 optional="true" | |
| 99 label="Top N matches to report (set to 0 to report all)" | |
| 100 /> <!-- Supplies the value for the top-n-results option of the matches subcommand. The form default is 20, the label documents 0 as "report all", and optional="true" means the field may be left empty. --> | |
| 101 <section name="adv" title="Advanced Options" expanded="false"> <!-- Group of tuning options, collapsed by default; the command template reads them as adv.NAME. --> | |
| 102 <param name="min_kmer_threshold" | |
| 103 type="integer" | |
| 104 min="1" | |
| 105 value="8" | |
| 106 optional="true" | |
| 107 label="Mash sketch of reads: Minimum copies of each k-mer required to pass noise filter for reads (default=8)" | |
| 108 /> <!-- Only placed on the command line when a value is set; the template wraps the min-kmer-threshold option in an #if. --> | |
| 109 <param name="output_type" | |
| 110 type="select" | |
| 111 label="Output type" | |
| 112 multiple="false"> | |
| 113 <option value="tab" selected="true"> | |
| 114 Tabular (tab-delimited values) | |
| 115 </option> | |
| 116 <option value="csv"> | |
| 117 CSV (Comma Separated Values) | |
| 118 </option> | |
| 119 </param> <!-- The selected value is used twice by the command: as the output-type option and as the extension of the output file name, which the output filters and from_work_dir paths depend on. --> | |
| 120 <param name="verbosity" | |
| 121 type="select" | |
| 122 label="Logging verbosity"> | |
| 123 <option value="">Error messages only</option> | |
| 124 <option value="-v">Show warning messages</option> | |
| 125 <option value="-vv" selected="true">Show info messages</option> | |
| 126 <option value="-vvv">Show debug messages</option> | |
| 127 </param> <!-- Each option value is the literal flag the template inserts right after "refseq_masher"; the empty value adds no flag. --> | |
| 128 </section> | |
| 129 </inputs> | |
| 130 <outputs> <!-- Two mutually exclusive outputs: each filter keeps its dataset only when adv.output_type matches, and from_work_dir collects the file the command wrote under that extension. --> | |
| 131 <data name="output_path_csv" | |
| 132 format="csv" | |
| 133 label="RefSeq Masher matches table" | |
| 134 from_work_dir="refseq_masher-matches.csv"> | |
| 135 <filter>adv['output_type'] == 'csv'</filter> | |
| 136 </data> | |
| 137 <data name="output_path_tab" | |
| 138 format="tabular" | |
| 139 label="RefSeq Masher matches table" | |
| 140 from_work_dir="refseq_masher-matches.tab"> | |
| 141 <filter>adv['output_type'] == 'tab'</filter> | |
| 142 </data> | |
| 143 </outputs> | |
| 144 <tests> <!-- Regression tests: each compares the tabular output against an expected file with lines_diff="0", i.e. no differing lines tolerated. Only the fasta and single input types are exercised here. --> | |
| 145 <test> <!-- FASTA input, reporting only the top match. --> | |
| 146 <conditional name="input"> | |
| 147 <param name="type" value="fasta"/> | |
| 148 <param name="fasta" value="Se-Enteritidis.fasta"/> | |
| 149 </conditional> | |
| 150 <param name="top_n_results" value="1"/> | |
| 151 <section name="adv"> | |
| 152 <param name="output_type" value="tab"/> | |
| 153 </section> | |
| 154 <output name="output_path_tab" | |
| 155 value="Se-Enteritidis-refseq_masher-matches.tab" | |
| 156 ftype="tabular" | |
| 157 lines_diff="0"> | |
| 158 </output> | |
| 159 </test> | |
| 160 <test> <!-- Single-end FASTQ input, reporting only the top match, with min_kmer_threshold set to 2 instead of the form default of 8. --> | |
| 161 <conditional name="input"> | |
| 162 <param name="type" value="single"/> | |
| 163 <param name="single" value="SRR1203042_1-head4000.fastq"/> | |
| 164 </conditional> | |
| 165 <param name="top_n_results" value="1"/> | |
| 166 <section name="adv"> | |
| 167 <param name="output_type" value="tab"/> | |
| 168 <param name="min_kmer_threshold" value="2"/> | |
| 169 </section> | |
| 170 <output name="output_path_tab" | |
| 171 value="SRR1203042_1-head4000-refseq_masher-matches-m2.tab" | |
| 172 ftype="tabular" | |
| 173 lines_diff="0"> | |
| 174 </output> | |
| 175 </test> | |
| 176 </tests> | |
| 177 <help> | |
| 178 <![CDATA[ | |
| 179 RefSeq Masher - Genomic Distance | |
| 180 ================================ | |
| 181 | |
| 182 Find what NCBI RefSeq genomes most closely match your sequence data using Mash_ with a Mash sketch database of 54,925 NCBI RefSeq Genomes. | |
| 183 | |
| 184 | |
| 185 Source code available on GitHub at https://github.com/phac-nml/refseq_masher | |
| 186 | |
| 187 | |
| 188 `matches` - find the closest matching NCBI RefSeq Genomes to your input sequences | |
| 189 --------------------------------------------------------------------------------- | |
| 190 | |
| 191 Command-line usage information:: | |
| 192 | |
| 193 Usage: refseq_masher matches [OPTIONS] INPUT... | |
| 194 | |
| 195 Find NCBI RefSeq genome matches for an input genome fasta file | |
| 196 | |
| 197 Input is expected to be one or more FASTA/FASTQ files or one or more | |
| 198 directories containing FASTA/FASTQ files. Files can be Gzipped. | |
| 199 | |
| 200 Options: | |
| 201 --mash-bin TEXT Mash binary path (default="mash") | |
| 202 -o, --output PATH Output file path (default="-"/stdout) | |
| 203 --output-type [tab|csv] Output file type (tab|csv) | |
| 204 -n, --top-n-results INTEGER Output top N results sorted by distance in | |
| 205 ascending order (default=5) | |
| 206 -m, --min-kmer-threshold INTEGER | |
| 207 Mash sketch of reads: "Minimum copies of | |
| 208 each k-mer required to pass noise filter for | |
| 209 reads" (default=8) | |
| 210 -h, --help Show this message and exit. | |
| 211 | |
| 212 | |
| 213 Example | |
| 214 ~~~~~~~ | |
| 215 | |
| 216 With the FNA.GZ_ file for *Salmonella enterica* subsp. enterica serovar Enteritidis str. CHS44_:: | |
| 217 | |
| 218 | |
| 219 # download sequence file | |
| 220 wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/GCF_000329025.1_ASM32902v1_genomic.fna.gz | |
| 221 | |
| 222 # find RefSeq matches | |
| 223 refseq_masher -vv matches GCF_000329025.1_ASM32902v1_genomic.fna.gz | |
| 224 | |
| 225 | |
| 226 **Log**:: | |
| 227 | |
| 228 | |
| 229 2018-01-29 11:02:13,786 INFO: Collected 1 FASTA inputs and 0 read sets [in ...refseq_masher/refseq_masher/utils.py:185] | |
| 230 2018-01-29 11:02:13,786 INFO: Creating Mash sketch file for ...refseq_masher/GCF_000329025.1_ASM32902v1_genomic.fna.gz [in ...refseq_masher/refseq_masher/mash/sketch.py:24] | |
| 231 2018-01-29 11:02:14,055 INFO: Created Mash sketch file at "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" [in ...refseq_masher/refseq_masher/mash/sketch.py:40] | |
| 232 2018-01-29 11:02:14,613 INFO: Ran Mash dist successfully (output length=11647035). Parsing Mash dist output [in ...refseq_masher/refseq_masher/mash/dist.py:64] | |
| 233 2018-01-29 11:02:15,320 INFO: Parsed Mash dist output into Pandas DataFrame with 54924 rows [in ...refseq_masher/refseq_masher/mash/dist.py:67] | |
| 234 2018-01-29 11:02:15,321 INFO: Deleting temporary sketch file "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" [in ...refseq_masher/refseq_masher/mash/dist.py:72] | |
| 235 2018-01-29 11:02:15,321 INFO: Sketch file "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" deleted! [in ...refseq_masher/refseq_masher/mash/dist.py:74] | |
| 236 2018-01-29 11:02:15,322 INFO: Ran Mash dist on all input. Merging NCBI taxonomic information into results output. [in ...refseq_masher/refseq_masher/cli.py:88] | |
| 237 2018-01-29 11:02:15,323 INFO: Fetching all taxonomy info for 5 unique NCBI Taxonomy UIDs [in ...refseq_masher/refseq_masher/taxonomy.py:35] | |
| 238 2018-01-29 11:02:15,325 INFO: Dropping columns with all NA values (ncol=32) [in ...refseq_masher/refseq_masher/taxonomy.py:38] | |
| 239 2018-01-29 11:02:15,327 INFO: Columns with all NA values dropped (ncol=11) [in ...refseq_masher/refseq_masher/taxonomy.py:40] | |
| 240 2018-01-29 11:02:15,327 INFO: Merging Mash results with relevant taxonomic information [in ...refseq_masher/refseq_masher/taxonomy.py:41] | |
| 241 2018-01-29 11:02:15,329 INFO: Merged Mash results with taxonomy info [in ...refseq_masher/refseq_masher/taxonomy.py:43] | |
| 242 2018-01-29 11:02:15,329 INFO: Merged taxonomic info into results output [in ...refseq_masher/refseq_masher/cli.py:90] | |
| 243 2018-01-29 11:02:15,329 INFO: Reordering output columns [in ...refseq_masher/refseq_masher/cli.py:91] | |
| 244 2018-01-29 11:02:15,331 INFO: Writing output to stdout [in ...refseq_masher/refseq_masher/writers.py:16] | |
| 245 | |
| 246 | |
| 247 **Output** | |
| 248 | |
| 249 +---------------------------------------+--------------------------------------------------------------------+----------+--------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------+---------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+-------------+---------+-------------+--------------+--------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------+ | |
| 250 | sample | top_taxonomy_name | distance | pvalue | matching | full_taxonomy | taxonomic_subspecies | taxonomic_species | taxonomic_genus | taxonomic_family | taxonomic_order | taxonomic_class | taxonomic_phylum | taxonomic_superkingdom | subspecies | serovar | plasmid | bioproject | biosample | taxid | assembly_accession | match_id | | |
| 251 +=======================================+====================================================================+==========+========+==========+=============================================================================================================================================================+=====================================+=====================+==================+====================+==================+=====================+===================+=========================+============+=============+=========+=============+==============+========+=====================+==========================================================================================================================================+ | |
| 252 | GCF_000329025.1_ASM32902v1_genomic | Salmonella enterica subsp. enterica serovar Enteritidis str. CHS44 | 0.0 | 0.0 | 400/400 | Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae; Salmonella; enterica; subsp. enterica; serovar Enteritidis; str. CHS44 | Salmonella enterica subsp. enterica | Salmonella enterica | Salmonella | Enterobacteriaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria | Bacteria | enterica | Enteritidis | | PRJNA185053 | SAMN01041154 | 702979 | NZ_ALFF | ./rcn/refseq-NZ-702979-PRJNA185053-SAMN01041154-NZ_ALFF-.-Salmonella_enterica_subsp._enterica_serovar_Enteritidis_str._CHS44.fna | | |
| 253 +---------------------------------------+--------------------------------------------------------------------+----------+--------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------+---------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+-------------+---------+-------------+--------------+--------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------+ | |
| 254 | |
| 255 | |
| 256 The top match is *Salmonella enterica* subsp. enterica serovar Enteritidis str. CHS44_ with a distance of 0.0 and 400/400 sketches matching, which is what we expected. There's other taxonomic information available in the results table that may be useful. | |
| 257 | |
| 258 | |
| 259 | |
| 260 Legal | |
| 261 ----- | |
| 262 | |
| 263 Copyright Government of Canada 2017 | |
| 264 | |
| 265 Written by: National Microbiology Laboratory, Public Health Agency of Canada | |
| 266 | |
| 267 Licensed under the Apache License, Version 2.0 (the "License"); you may not use | |
| 268 this work except in compliance with the License. You may obtain a copy of the | |
| 269 License at: | |
| 270 | |
| 271 http://www.apache.org/licenses/LICENSE-2.0 | |
| 272 | |
| 273 Unless required by applicable law or agreed to in writing, software distributed | |
| 274 under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |
| 275 CONDITIONS OF ANY KIND, either express or implied. See the License for the | |
| 276 specific language governing permissions and limitations under the License. | |
| 277 | |
| 278 Contact | |
| 279 ------- | |
| 280 | |
| 281 **Gary van Domselaar**: gary.vandomselaar@phac-aspc.gc.ca | |
| 282 | |
| 283 .. _Mash: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x | |
| 284 .. _FNA.GZ: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/GCF_000329025.1_ASM32902v1_genomic.fna.gz | |
| 285 .. _CHS44: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/ | |
| 286 | |
| 287 | |
| 288 ]]> | |
| 289 </help> | |
| 290 <citations> | |
| 291 <!-- Citation for the Mash paper (Genome Biology); this DOI is the same article the help text links to under the "Mash" reference. --> | |
| 292 <citation type="doi">10.1186/s13059-016-0997-x</citation> | |
| 293 </citations> | |
| 294 </tool> |
