Mercurial > repos > mvdbeek > dedup_hash
comparison dedup_hash.xml @ 0:627dc826a68f draft default tip
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
| author | mvdbeek |
|---|---|
| date | Wed, 23 Nov 2016 07:46:20 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:627dc826a68f |
|---|---|
| 1 <tool id="dedup_hash" name="Deduplicate FASTQ files" version="0.1.1"> | |
| 2 <description>with fast and memory-efficient sequence hashes</description> | |
| 3 <requirements> <!-- Software packages Galaxy has to resolve before the command is run. --> | |
| 4 <requirement type="package" version="0.150.1">smhasher</requirement> <!-- NOTE(review): presumably the hashing library imported by dedup_hash.py; confirm against the script. --> | |
| 5 </requirements> | |
| 6 <!-- Cheetah template for the dedup_hash.py call: the read type selects which input/output options are emitted; the gzip switch is appended last. --> <command><![CDATA[ | |
| 7 python '${__tool_directory__}/dedup_hash/dedup_hash.py' | |
| 8 #if str($readtype.single_or_paired) == 'se': | |
| 9 --r1_in '${readtype.input_single}' | |
| 10 --r1_out '${output_single}' | |
| 11 #elif str($readtype.single_or_paired) == 'pe_sep': | |
| 12 --r1_in '${readtype.input_paired1}' | |
| 13 --r2_in '${readtype.input_paired2}' | |
| 14 --r1_out '${output_paired1}' | |
| 15 --r2_out '${output_paired2}' | |
| 16 #else | |
| 17 --r1_in '${readtype.input_paired.forward}' | |
| 18 --r2_in '${readtype.input_paired.reverse}' | |
| 19 --r1_out '${output_paired_coll.forward}' | |
| 20 --r2_out '${output_paired_coll.reverse}' | |
| 21 #end if | |
| 22 ${compress_fastq} | |
| 23 ]]></command> | |
| 24 <inputs> <!-- Read layout selector plus the gzip toggle; each help hint names the dedup_hash.py option the dataset is passed to. --> | |
| 25 <conditional name="readtype"> | |
| 26 <param name="single_or_paired" type="select" label="Single-end or paired-end reads?"> | |
| 27 <option value="se" selected="true">Single-end</option> | |
| 28 <option value="pe_sep">Paired-end (two separate input files)</option> | |
| 29 <option value="pe_collection">Paired-end (as collection)</option> | |
| 30 </param> | |
| 31 <when value="se"> | |
| 32 <param format="fastq,fastq.gz" name="input_single" type="data" label="Single-end FASTQ reads" help="(--r1_in)" /> | |
| 33 </when> | |
| 34 <when value="pe_sep"> | |
| 35 <param format="fastq,fastq.gz" name="input_paired1" type="data" label="Paired-end forward strand FASTQ reads" help="(--r1_in)" /> | |
| 36 <param format="fastq,fastq.gz" name="input_paired2" type="data" label="Paired-end reverse strand FASTQ reads" help="(--r2_in)" /> | |
| 37 </when> | |
| 38 <when value="pe_collection"> | |
| 39 <param name="input_paired" format="fastq,fastq.gz" type="data_collection" collection_type="paired" label="Paired-end FASTQ reads as paired collection" /> | |
| 40 </when> | |
| 41 </conditional> | |
| 42 <param name="compress_fastq" type="boolean" checked="true" truevalue="--write_gzip" falsevalue="" label="Produce compressed fastq?"/> | |
| 43 </inputs> | |
| 44 <outputs> <!-- Each output is kept only for its matching read type (see the filters); the format switches from fastq to fastq.gz when compress_fastq is on. --> | |
| 45 <data name="output_single" format="fastq" label="Single-end output of ${tool.name} on ${on_string}"> | |
| 46 <filter>readtype['single_or_paired'] == 'se'</filter> | |
| 47 <change_format> | |
| 48 <when input="compress_fastq" value="--write_gzip" format="fastq.gz" /> | |
| 49 </change_format> | |
| 50 </data> | |
| 51 <data name="output_paired1" format="fastq" label="Paired-end forward strand output of ${tool.name} on ${on_string}"> | |
| 52 <filter>readtype['single_or_paired'] == 'pe_sep'</filter> | |
| 53 <change_format> | |
| 54 <when input="compress_fastq" value="--write_gzip" format="fastq.gz" /> | |
| 55 </change_format> | |
| 56 </data> | |
| 57 <data name="output_paired2" format="fastq" label="Paired-end reverse strand output of ${tool.name} on ${on_string}"> | |
| 58 <filter>readtype['single_or_paired'] == 'pe_sep'</filter> | |
| 59 <change_format> | |
| 60 <when input="compress_fastq" value="--write_gzip" format="fastq.gz" /> | |
| 61 </change_format> | |
| 62 </data> | |
| 63 <collection name="output_paired_coll" type="paired" label="Paired-end output of ${tool.name} on ${on_string}"> <!-- Structure is fixed by type="paired" and the explicit forward/reverse elements below. --> | |
| 64 <filter>readtype['single_or_paired'] == 'pe_collection'</filter> | |
| 65 <data name="forward" format="fastq"> | |
| 66 <change_format> | |
| 67 <when input="compress_fastq" value="--write_gzip" format="fastq.gz" /> | |
| 68 </change_format> | |
| 69 </data> | |
| 70 <data name="reverse" format="fastq"> | |
| 71 <change_format> | |
| 72 <when input="compress_fastq" value="--write_gzip" format="fastq.gz" /> | |
| 73 </change_format> | |
| 74 </data> | |
| 75 </collection> | |
| 76 </outputs> | |
| 77 <tests> <!-- Both tests exercise the two-file paired-end mode: first with gzip-compressed data (size comparison), then with plain fastq (exact comparison). --> | |
| 78 <test> | |
| 79 <param name="single_or_paired" value="pe_sep"/> | |
| 80 <param name="input_paired1" value="r1.fastq.gz" ftype="fastq.gz"/> | |
| 81 <param name="input_paired2" value="r2.fastq.gz" ftype="fastq.gz"/> | |
| 82 <param name="compress_fastq" value="true"/> | |
| 83 <output name="output_paired1" file="r1_dedup.fastq.gz" ftype="fastq.gz" compare="sim_size"/> | |
| 84 <output name="output_paired2" file="r2_dedup.fastq.gz" ftype="fastq.gz" compare="sim_size"/> | |
| 85 </test> | |
| 86 <test> | |
| 87 <param name="single_or_paired" value="pe_sep"/> | |
| 88 <param name="input_paired1" value="r1.fastq" ftype="fastq"/> | |
| 89 <param name="input_paired2" value="r2.fastq" ftype="fastq"/> | |
| 90 <param name="compress_fastq" value="false"/> | |
| 91 <output name="output_paired1" file="r1_dedup.fastq" ftype="fastq"/> | |
| 92 <output name="output_paired2" file="r2_dedup.fastq" ftype="fastq"/> | |
| 93 </test> | |
| 94 </tests> | |
| 95 <help> <![CDATA[ | |
| 96 **Deduplicate FASTQ files** is a fast and memory-efficient tool for removing duplicates from single-end or paired-end short DNA sequence reads in FASTQ format. | |
| 97 It identifies duplicates by concatenating the sequences of a read pair and calculating a short hash that uniquely identifies the concatenated sequence. | |
| 98 Sequences that are not unique (i.e., a hash of the concatenated sequence has been seen previously) are discarded. | |
| 99 | |
| 100 Compared to FastUniq, this tool requires only a fraction of the memory, but it does not identify pairs that are identical | |
| 101 except for a swap of R1 and R2. Such reads may nevertheless align to different places based on the seed-searching of the aligner, | |
| 102 so this may or may not be a problem for your application. | |
| 103 | |
| 104 FastUniq consumed 76 GB of memory and took 4:01.52 on a typical dataset of 25 x 10^6 paired-end reads of 100 nt, | |
| 105 while this tool took 4.7 GB of memory and 3:23.27 for the same dataset. | |
| 106 | |
| 107 Both tools produced the exact same result, suggesting that, at least before quality and/or adapter trimming, | |
| 108 the previously mentioned limitation is of theoretical concern only. | |
| 109 | |
| 110 ]]> </help> | |
| 111 <citations> <!-- The DOI is given as the bare identifier, without a "doi:" prefix. --> | |
| 112 <citation type="doi">10.1371/journal.pone.0052249</citation> | |
| 113 </citations> | |
| 114 </tool> |
