# HG changeset patch # User slegras # Date 1438866928 14400 # Node ID baf52103977b258a512ac20414c7b37b30c51314 Uploaded diff -r 000000000000 -r baf52103977b ._README Binary file ._README has changed diff -r 000000000000 -r baf52103977b ._cutadapt.xml Binary file ._cutadapt.xml has changed diff -r 000000000000 -r baf52103977b ._cutadapt_adapters.txt.sample Binary file ._cutadapt_adapters.txt.sample has changed diff -r 000000000000 -r baf52103977b ._test-data Binary file ._test-data has changed diff -r 000000000000 -r baf52103977b ._tool_dependencies.xml Binary file ._tool_dependencies.xml has changed diff -r 000000000000 -r baf52103977b README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Thu Aug 06 09:15:28 2015 -0400 @@ -0,0 +1,55 @@ +Galaxy tool definition for cutadapt (http://code.google.com/p/cutadapt/) + + +Installation - Tool Shed +------------------------ + +The recommended way to install cutadapt as a tool in Galaxy is to use the +Galaxy Tool Shed (http://wiki.galaxyproject.org/Tool%20Shed). + +This will allow cutadapt to be installed automatically and keep track of older +versions of cutadapt and the tool wrapper. + + +Installation - Manual +--------------------- + +1. Install the cutadapt package and make sure it is in path for Galaxy + +2. Copy cutadapt.xml to $GALAXY_HOME/tools/cutadapt + +3. Add the tool to the $GALAXY_HOME/tool_conf.xml tool-registry file + + **Optional steps to setup and run Galaxy functional tests** + +4. Copy test-data/* to $GALAXY_HOME/test-data/ + +5. Set GALAXY_TEST_TOOL_CONF environment variable to a tool_conf.xml file that + contains the tools you want to test. (e.g. 'tool_conf.xml') + +6.
$GALAXY_HOME/run_functional_tests.sh -id cutadapt + See the Galaxy Wiki for more information: http://wiki.g2.bx.psu.edu/ + + +Configuration of Adapters +------------------------- + +A list of predefined adapters may be specified in the cutadapt_adapters.txt +file which resides in the tool-data directory underneath the Galaxy root. A sample +file is provided. + + +Limitations of the Galaxy wrapper +--------------------------------- + +Reading adapters from a fasta file is not supported +Colorspace data support is not implemented +Only one "Strip suffix" is supported + + +Galaxy Wrapper Development +-------------------------- + +Author: Lance Parsons + +Repository: [https://bitbucket.org/lance_parsons/cutadapt\_galaxy\_wrapper](https://bitbucket.org/lance_parsons/cutadapt_galaxy_wrapper) diff -r 000000000000 -r baf52103977b cutadapt.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cutadapt.xml Thu Aug 06 09:15:28 2015 -0400 @@ -0,0 +1,483 @@ + + Remove adapter sequences from Fastq/Fasta + + cutadapt + + + + + + + + + cutadapt --version + + cutadapt + #if $input.extension.startswith( "fastq"): + --format=fastq + #if $input.extension == "fastqillumina": + --quality-base=64 + #end if + #if $input.extension == "fastqsolexa": + --quality-base=64 + #end if + #else + --format=$input.extension + #end if + #for $a in $adapters + #if $a.adapter_source.adapter_source_list == 'prebuilt': + --adapter="${a.adapter_source.adapter.fields.name}"='${a.adapter_source.adapter}' + #else if str($a.adapter_source.adapter_name) != "": + --adapter='${a.adapter_source.adapter_name}'='${a.adapter_source.adapter}' + #else + --adapter='${a.adapter_source.adapter}' + #end if + #end for + #for $aa in $anywhere_adapters + #if $aa.anywhere_adapter_source.anywhere_adapter_source_list == 'prebuilt': + --anywhere="${aa.anywhere_adapter_source.anywhere_adapter.fields.name}"='${aa.anywhere_adapter_source.anywhere_adapter}' + #else if str($aa.anywhere_adapter_source.anywhere_adapter_name) != "": +
--anywhere='${aa.anywhere_adapter_source.anywhere_adapter_name}'='${aa.anywhere_adapter_source.anywhere_adapter}' + #else + --anywhere='${aa.anywhere_adapter_source.anywhere_adapter}' + #end if + #end for + #for $fa in $front_adapters + #if $fa.front_adapter_source.front_adapter_source_list == 'prebuilt': + --front="${fa.front_adapter_source.front_adapter.fields.name}"='${fa.front_adapter_source.front_adapter}' + #else if str($fa.front_adapter_source.front_adapter_name) != "": + --front='${fa.front_adapter_source.front_adapter_name}'='${fa.front_adapter_source.front_adapter}' + #else + --front='${fa.front_adapter_source.front_adapter}' + #end if + #end for + --error-rate=$error_rate + --times=$count + --overlap=$overlap + $no_indels + $match_read_wildcards + + #if str( $output_filtering_options.output_filtering) == "filter": + $output_filtering_options.discard + $output_filtering_options.discard_untrimmed + $output_filtering_options.no_trim + $output_filtering_options.mask_adapter + #if str($output_filtering_options.min) != '0': + --minimum-length=$output_filtering_options.min + #end if + #if str($output_filtering_options.max) != '0': + --maximum-length=$output_filtering_options.max + #end if + #end if + + --output='$output' + + #if $paired_end.paired_end_boolean: + --paired-output='$paired_output' + #end if + + #if str( $output_params.output_type ) == "additional": + #if $output_params.rest_file: + --rest-file=$rest_output + #end if + #if $output_params.wildcard_file: + --wildcard-file=$wild_output + #end if + #if $output_params.too_short_file: + --too-short-output=$too_short_output + #end if + #if $output_params.too_long_file: + --too-long-output=$too_long_output + #end if + #if $output_params.untrimmed_file: + --untrimmed-output=$untrimmed_output + #if $paired_end.paired_end_boolean: + --untrimmed-paired-output=$untrimmed_paired_output + #end if + #end if + #if $output_params.info_file: + --info-file=$info_file + #end if + + #end if + + #if str( 
$read_modification_params.read_modification) == "modify": + #if str($read_modification_params.quality_cutoff) != '0': + --quality-cutoff=$read_modification_params.quality_cutoff + #end if + #if str($read_modification_params.cut) != '0': + --cut=$read_modification_params.cut + #end if + #if $read_modification_params.prefix != '': + --prefix="$read_modification_params.prefix" + #end if + #if $read_modification_params.suffix != '': + --suffix="$read_modification_params.suffix" + #end if + #if $read_modification_params.length_tag != '': + --length-tag="$read_modification_params.length_tag" + #end if + $read_modification_params.zero_cap + #end if + + '$input' + + #if $paired_end.paired_end_boolean: + '$input2' + #end if + + > $report + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (paired_end['paired_end_boolean'] is True) + + + (output_params['output_type'] == "additional") + (output_params['rest_file'] is True) + + + (output_params['output_type'] == "additional") + (output_params['wildcard_file'] is True) + + + (output_params['output_type'] == "additional") + (output_params['too_short_file'] is True) + + + (output_params['output_type'] == "additional") + (output_params['too_long_file'] is True) + + + (output_params['output_type'] == "additional") + (output_params['untrimmed_file'] is True) + + + (paired_end['paired_end_boolean'] is True) + (output_params['output_type'] == "additional") + (output_params['untrimmed_file'] is True) + + + (output_params['output_type'] == "additional") + (output_params['info_file'] is True) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Summary +------- +This tool removes adapter sequences from DNA high-throughput +sequencing data. 
This is usually necessary when the read length of the +machine is longer than the molecule that is sequenced, such as in +microRNA data. + +The tool is based on the opensource `cutadapt +<http://code.google.com/p/cutadapt/>`_ tool. See the `complete cutadapt +documentation <https://cutadapt.readthedocs.org/en/latest/index.html>`_ for additional details. + +----- + +Algorithm +--------- + +cutadapt uses a simple semi-global alignment algorithm, without any special optimizations. +For speed, the algorithm is implemented as a Python extension module in ``calignmodule.c``. + + +Partial adapter matches +----------------------- + +Cutadapt correctly deals with partial adapter matches. As an example, suppose +your adapter sequence is ``ADAPTER`` (specified via 3' Adapters parameter). +If you have these input sequences:: + + MYSEQUENCEADAPTER + MYSEQUENCEADAP + MYSEQUENCEADAPTERSOMETHINGELSE + +All of them will be trimmed to ``MYSEQUENCE``. If the sequence starts with an +adapter, like this:: + + ADAPTERSOMETHING + +It will be empty after trimming. + +When the allowed error rate is sufficiently high, errors in +the adapter sequence are allowed. For example, ``ADABTER`` (1 mismatch), ``ADAPTR`` (1 deletion), +and ``ADAPPTER`` (1 insertion) will all be recognized if the error rate is set to 0.15. + + +Anchoring 5' adapters +--------------------- + +If you specify a 5' (Front) adapter, the adapter may overlap the beginning of the read or +occur anywhere within it. If it appears within the read, the sequence that precedes it +will also be trimmed in addition to the adapter. For example when the adapter sequence is +``ADAPTER``:: + + HELLOADAPTERTHERE + APTERTHERE + +will both be trimmed to ``THERE``. To avoid this, you can prefix the adapter with the character +``^``. This will restrict the search, forcing the adapter to be a prefix of the read.
With +the adapter sequence set to ``^ADAPTER``, only reads like this will be trimmed:: + + ADAPTERHELLO + + +Allowing adapters anywhere +-------------------------- + +Cutadapt assumes that any adapter specified via the 3' Adapter parameter +was ligated to the 3\' end of the sequence. This is the correct assumption for +at least the SOLiD and Illumina small RNA protocols and probably others. +The assumption is enforced by the alignment algorithm, which only finds the adapter +when its starting position is within the read. In other words, the 5' base of +the adapter must appear within the read. The adapter and all bases following +it are removed. + +If, on the other hand, your adapter can also be ligated to the 5' end (on +purpose or by accident), you should tell cutadapt so by using the Anywhere Adapter +parameter. It will then use a slightly different alignment algorithm +(so-called semiglobal alignment), which allows any type of overlap between the +adapter and the sequence. In particular, the adapter may appear only partially +in the beginning of the read, like this:: + + PTERMYSEQUENCE + +The decision which part of the read to remove is made as follows: If there is at +least one base before the found adapter, then the adapter is considered to be +a 3' adapter and the adapter itself and everything following it is removed. +Otherwise, the adapter is considered to be a 5' adapter and it is removed from +the read. + +Here are some examples, which may make this clearer (left: read, right: trimmed +read):: + + MYSEQUENCEADAPTER -> MYSEQUENCE (3' adapter) + MADAPTER -> M (3' adapter) + ADAPTERMYSEQUENCE -> MYSEQUENCE (5' adapter) + PTERMYSEQUENCE -> MYSEQUENCE (5' adapter) + +The regular algorithm (3' Adapter) would trim the first two examples in the same way, +but trim the third to an empty sequence and trim the fourth not at all. + + +Format of the info file +----------------------- +The info file contains information about the found adapters. 
The output is a tab-separated text file. Each line corresponds to one read of the input file. The fields are: + +1. Read name +2. Number of errors +3. 0-based start coordinate of the adapter match +4. 0-based end coordinate of the adapter match +5. Sequence of the read to the left of the adapter match (can be empty) +6. Sequence of the read that was matched to the adapter +7. Sequence of the read to the right of the adapter match (can be empty) +8. Name of the found adapter. + +The concatenation of the fields 5-7 yields the full read sequence. In column 8, adapters without a name are numbered starting from 1. + +If no adapter was found, the format is as follows: + +1. Read name +2. The value -1 +3. The read sequence + +When parsing that file, be aware that additional columns may be added in the future. Note also that some fields can be empty, resulting in consecutive tabs within a line. Also, in the current version, when the *Match times* option is set to a value other than 1 (the default value), multiple lines are written to the info file for each read. + +.. _cutadapt: http://code.google.com/p/cutadapt/ + + + + +@article{marcel_cutadapt_2011, + title = {Cutadapt removes adapter sequences from high-throughput sequencing reads}, + volume = {17}, + copyright = {Authors who publish with this journal agree to the following terms: Authors retain copyright and grant the journal right of first publication with the work simultaneously licensed under a Creative Commons Attribution License that allows others to share the work with an acknowledgement of the work's authorship and initial publication in this journal. Authors are able to enter into separate, additional contractual arrangements for the non-exclusive distribution of the journal's published version of the work (e.g., post it to an institutional repository or publish it in a book), with an acknowledgement of its initial publication in this journal. 
Authors are permitted and encouraged to post their work online (e.g., in institutional repositories or on their website) prior to and during the submission process, as it can lead to productive exchanges, as well as earlier and greater citation of published work (See The Effect of Open Access ).}, + url = {http://journal.embnet.org/index.php/embnetjournal/article/view/200}, + abstract = {When small RNA is sequenced on current sequencing machines, the resulting reads are usually longer than the RNA and therefore contain parts of the 3' adapter. That adapter must be found and removed error-tolerantly from each read before read mapping. Previous solutions are either hard to use or do not offer required features, in particular support for color space data. As an easy to use alternative, we developed the command-line tool cutadapt, which supports 454, Illumina and SOLiD (color space) data, offers two adapter trimming algorithms, and has other useful features. + +Cutadapt, including its MIT-licensed source code, is available for download at http://code.google.com/p/cutadapt/}, + number = {1}, + urldate = {2011-08-02}, + journal = {EMBnet.journal}, + author = {Marcel, Martin}, + year = {2011}, + note = {When small RNA is sequenced on current sequencing machines, the resulting reads are usually longer than the RNA and therefore contain parts of the 3' adapter. That adapter must be found and removed error-tolerantly from each read before read mapping. Previous solutions are either hard to use or do not offer required features, in particular support for color space data. As an easy to use alternative, we developed the command-line tool cutadapt, which supports 454, Illumina and SOLiD (color space) data, offers two adapter trimming algorithms, and has other useful features. 
Cutadapt, including its MIT-licensed source code, is available for download at http://code.google.com/p/cutadapt/}, + keywords = {Adapter removal;, fastq, MicroRNA, Sequencing, Small RNA, software}, + file = {Cutadapt removes adapter sequences from high-throughput sequencing reads | Martin | EMBnet.journal:/Users/lparsons/Library/Application Support/Firefox/Profiles/thd2t4je.default/zotero/storage/ZXZT4PSE/200.html:text/html} +} + + + + diff -r 000000000000 -r baf52103977b cutadapt_adapters.txt.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cutadapt_adapters.txt.sample Thu Aug 06 09:15:28 2015 -0400 @@ -0,0 +1,14 @@ +# +# Adapter/Linker sequences for FASTX-Clipper tool. +# Also used by cutadapt tool +# +# Format: +# Adapter Sequence Descriptive name +# +# Example: +# AAATTTGATAAGATA Our-Adapter +# +# Some adapters can be found here: +# http://seqanswers.com/forums/showthread.php?t=198 + +TGTAGGCC Dummy-Adapter (do not use me) diff -r 000000000000 -r baf52103977b test-data/._cutadapt_discard.out Binary file test-data/._cutadapt_discard.out has changed diff -r 000000000000 -r baf52103977b test-data/._cutadapt_rest.fa Binary file test-data/._cutadapt_rest.fa has changed diff -r 000000000000 -r baf52103977b test-data/._cutadapt_rest.out Binary file test-data/._cutadapt_rest.out has changed diff -r 000000000000 -r baf52103977b test-data/._cutadapt_rest2.out Binary file test-data/._cutadapt_rest2.out has changed diff -r 000000000000 -r baf52103977b test-data/._cutadapt_small.fastq Binary file test-data/._cutadapt_small.fastq has changed diff -r 000000000000 -r baf52103977b test-data/._cutadapt_small.out Binary file test-data/._cutadapt_small.out has changed diff -r 000000000000 -r baf52103977b test-data/cutadapt_discard.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cutadapt_discard.out Thu Aug 06 09:15:28 2015 -0400 @@ -0,0 +1,4 @@ +@prefix:1_13_1440/1 +CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC ++ +<=A:A=57!7<';<6?5;;6:+:=)71>70<,=: diff -r 
000000000000 -r baf52103977b test-data/cutadapt_rest.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cutadapt_rest.fa Thu Aug 06 09:15:28 2015 -0400 @@ -0,0 +1,10 @@ +>read1 +TESTINGADAPTERREST1 +>read2 +TESTINGADAPTERRESTING +>read3 +TESTINGADAPTER +>read4 +TESTINGADAPTERRESTLESS +>read5 +TESTINGADAPTERRESTORE diff -r 000000000000 -r baf52103977b test-data/cutadapt_rest.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cutadapt_rest.out Thu Aug 06 09:15:28 2015 -0400 @@ -0,0 +1,10 @@ +>read1 +TESTING +>read2 +TESTING +>read3 +TESTING +>read4 +TESTING +>read5 +TESTING diff -r 000000000000 -r baf52103977b test-data/cutadapt_rest2.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cutadapt_rest2.out Thu Aug 06 09:15:28 2015 -0400 @@ -0,0 +1,4 @@ +REST1 read1 +RESTING read2 +RESTLESS read4 +RESTORE read5 diff -r 000000000000 -r baf52103977b test-data/cutadapt_small.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cutadapt_small.fastq Thu Aug 06 09:15:28 2015 -0400 @@ -0,0 +1,12 @@ +@prefix:1_13_573/1 +CGTCCGAANTAGCTACCACCCTGATTAGACAAAT ++ +)3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 +@prefix:1_13_1259/1 +AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT ++ +;<:&:A;A!9<<<,7:<=3=;:<&70<,=: diff -r 000000000000 -r baf52103977b test-data/cutadapt_small.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cutadapt_small.out Thu Aug 06 09:15:28 2015 -0400 @@ -0,0 +1,12 @@ +@prefix:1_13_573/1 +CGTCCGAANTAGCTACCACCCTGA ++ +)3%)&&&&!.1&(6:<'67..*,: +@prefix:1_13_1259/1 +AGCCGCTANGACGGGTTGGCCC ++ +;<:&:A;A!9<<<,7:<=3=;: +@prefix:1_13_1440/1 +CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC ++ +<=A:A=57!7<';<6?5;;6:+:=)71>70<,=: diff -r 000000000000 -r baf52103977b tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu Aug 06 09:15:28 2015 -0400 @@ -0,0 +1,6 @@ + + + + + +