# HG changeset patch # User lparsons # Date 1412617676 14400 # Node ID 65c3f5d933a310e66cc059bf8737e54d1925464e Updated to version 1.6 diff -r 000000000000 -r 65c3f5d933a3 README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Mon Oct 06 13:47:56 2014 -0400 @@ -0,0 +1,55 @@ +Galaxy tool definition for cutadapt (http://code.google.com/p/cutadapt/) + + +Installation - Tool Shed +------------------------ + +The recommended way to install cutadapt as a tool in Galaxy is to use the +Galaxy Tool Shed (http://wiki.galaxyproject.org/Tool%20Shed). + +This will allow cutadapt to be installed automatically and keep track of older +versions of cutadapt and the tool wrapper. + + +Installation - Manual +--------------------- + +1. Install the cutadapt package and make sure it is in path for Galaxy + +2. Copy cutadapt.xml to $GALAXY_HOME/tools/cutadapt + +3. Add the tool to the $GALAXY_HOME/tool_conf.xml tool-registry file + + **Optional steps to setup and run Galaxy functional tests** + +4. Copy test-data/* to $GALAXY_HOME/test-data/ + +5. Set GALAXY_TEST_TOOL_CONF environment variable to a tool_conf.xml file that + contains the tools you want to test. (e.g. 'tool_conf.xml') + +6. $GALAXY_HOME/run_functional_tests.sh -id cutadapt + See the Galaxy Wiki for more information: http://wiki.g2.bx.psu.edu/ + + +Configuration of Adapters +------------------------- + +A list of predefined adapters may be specified in the cutadapt_adapters.txt +file which resides in the tool-data directory underneath the Galaxy root. A sample +file is provided. 
+ + +Limitations of the Galaxy wrapper +--------------------------------- + +Reading adapters from a fasta file is not supported +Colorspace data support is not implemented +Only one "Strip suffix" is supported + + +Galaxy Wrapper Development +-------------------------- + +Author: Lance Parsons + +Repository: [https://bitbucket.org/lance_parsons/cutadapt\_galaxy\_wrapper](https://bitbucket.org/lance_parsons/cutadapt_galaxy_wrapper) diff -r 000000000000 -r 65c3f5d933a3 cutadapt.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cutadapt.xml Mon Oct 06 13:47:56 2014 -0400 @@ -0,0 +1,472 @@ + + Remove adapter sequences from Fastq/Fasta + + cutadapt + + cutadapt --version + + cutadapt + #if $input.extension.startswith( "fastq"): + --format=fastq + #if $input.extension == "fastqillumina": + --quality-base=64 + #end if + #if $input.extension == "fastqsolexa": + --quality-base=64 + #end if + #else + --format=$input.extension + #end if + #for $a in $adapters + #if $a.adapter_source.adapter_source_list == 'prebuilt': + --adapter="${a.adapter_source.adapter.fields.name}"='${a.adapter_source.adapter}' + #else if str($a.adapter_source.adapter_name) != "": + --adapter='${a.adapter_source.adapter_name}'='${a.adapter_source.adapter}' + #else + --adapter='${a.adapter_source.adapter}' + #end if + #end for + #for $aa in $anywhere_adapters + #if $aa.anywhere_adapter_source.anywhere_adapter_source_list == 'prebuilt': + --anywhere="${aa.anywhere_adapter_source.anywhere_adapter.fields.name}"='${aa.anywhere_adapter_source.anywhere_adapter}' + #else if str($aa.anywhere_adapter_source.anywhere_adapter_name) != "": + --anywhere='${aa.anywhere_adapter_source.anywhere_adapter_name}'='${aa.anywhere_adapter_source.anywhere_adapter}' + #else + --anywhere='${aa.anywhere_adapter_source.anywhere_adapter}' + #end if + #end for + #for $fa in $front_adapters + #if $fa.front_adapter_source.front_adapter_source_list == 'prebuilt': + 
--front="${fa.front_adapter_source.front_adapter.fields.name}"='${fa.front_adapter_source.front_adapter}' + #else if str($fa.front_adapter_source.front_adapter_name) != "": + --front='${fa.front_adapter_source.front_adapter_name}'='${fa.front_adapter_source.front_adapter}' + #else + --front='${fa.front_adapter_source.front_adapter}' + #end if + #end for + --error-rate=$error_rate + --times=$count + --overlap=$overlap + $no_indels + $match_read_wildcards + + #if str( $output_filtering_options.output_filtering) == "filter": + $output_filtering_options.discard + $output_filtering_options.discard_untrimmed + $output_filtering_options.no_trim + $output_filtering_options.mask_adapter + #if str($output_filtering_options.min) != '0': + --minimum-length=$output_filtering_options.min + #end if + #if str($output_filtering_options.max) != '0': + --maximum-length=$output_filtering_options.max + #end if + #end if + + --output='$output' + + #if $paired_end.paired_end_boolean: + --paired-output='$paired_output' + #end if + + #if str( $output_params.output_type ) == "additional": + #if $output_params.rest_file: + --rest-file=$rest_output + #end if + #if $output_params.wildcard_file: + --wildcard-file=$wild_output + #end if + #if $output_params.too_short_file: + --too-short-output=$too_short_output + #end if + #if $output_params.too_long_file: + --too-long-output=$too_long_output + #end if + #if $output_params.untrimmed_file: + --untrimmed-output=$untrimmed_output + #if $paired_end.paired_end_boolean: + --untrimmed-paired-output=$untrimmed_paired_output + #end if + #end if + #if $output_params.info_file: + --info-file=$info_file + #end if + + #end if + + #if str( $read_modification_params.read_modification) == "modify": + #if str($read_modification_params.quality_cutoff) != '0': + --quality-cutoff=$read_modification_params.quality_cutoff + #end if + #if str($read_modification_params.cut) != '0': + --cut=$read_modification_params.cut + #end if + #if $read_modification_params.prefix 
!= '': + --prefix="$read_modification_params.prefix" + #end if + #if $read_modification_params.suffix != '': + --suffix="$read_modification_params.suffix" + #end if + #if $read_modification_params.length_tag != '': + --length-tag="$read_modification_params.length_tag" + #end if + $read_modification_params.zero_cap + #end if + + '$input' + + #if $paired_end.paired_end_boolean: + '$input2' + #end if + + > $report + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (paired_end['paired_end_boolean'] is True) + + + (output_params['output_type'] == "additional") + (output_params['rest_file'] is True) + + + (output_params['output_type'] == "additional") + (output_params['wildcard_file'] is True) + + + (output_params['output_type'] == "additional") + (output_params['too_short_file'] is True) + + + (output_params['output_type'] == "additional") + (output_params['too_long_file'] is True) + + + (output_params['output_type'] == "additional") + (output_params['untrimmed_file'] is True) + + + (paired_end['paired_end_boolean'] is True) + (output_params['output_type'] == "additional") + (output_params['untrimmed_file'] is True) + + + (output_params['output_type'] == "additional") + (output_params['info_file'] is True) + + + + + + + + + + + + + + + + + + + + + + + + + + + +Summary +------- +This tool removes adapter sequences from DNA high-throughput +sequencing data. This is usually necessary when the read length of the +machine is longer than the molecule that is sequenced, such as in +microRNA data. + +The tool is based on the opensource `cutadapt +<http://code.google.com/p/cutadapt/>`_ tool. See the `complete cutadapt +documentation <https://cutadapt.readthedocs.org/en/latest/index.html>`_ for additional details. 
+ +----- + +Algorithm +--------- + +cutadapt uses a simple semi-global alignment algorithm, without any special optimizations. +For speed, the algorithm is implemented as a Python extension module in ``calignmodule.c``. + + +Partial adapter matches +----------------------- + +Cutadapt correctly deals with partial adapter matches. As an example, suppose +your adapter sequence is ``ADAPTER`` (specified via 3' Adapters parameter). +If you have these input sequences:: + + MYSEQUENCEADAPTER + MYSEQUENCEADAP + MYSEQUENCEADAPTERSOMETHINGELSE + +All of them will be trimmed to ``MYSEQUENCE``. If the sequence starts with an +adapter, like this:: + + ADAPTERSOMETHING + +It will be empty after trimming. + +When the allowed error rate is sufficiently high, errors in +the adapter sequence are allowed. For example, ``ADABTER`` (1 mismatch), ``ADAPTR`` (1 deletion), +and ``ADAPPTER`` (1 insertion) will all be recognized if the error rate is set to 0.15. + + +Anchoring 5' adapters +--------------------- + +If you specify a 5' (Front) adapter, the adapter may overlap the beginning of the read or +occur anywhere within it. If it appears within the read, the sequence that precedes it +will also be trimmed in addition to the adapter. For example when the adapter sequence is +``ADAPTER``:: + + HELLOADAPTERTHERE + APTERTHERE + +will both be trimmed to ``THERE``. To avoid this, you can prefix the adapter with the character +``^``. This will restrict the search, forcing the adapter to be a prefix of the read. With +the adapter sequence set to ``^ADAPTER``, only reads like this will be trimmed:: + + ADAPTERHELLO + + +Allowing adapters anywhere +-------------------------- + +Cutadapt assumes that any adapter specified via the 3' Adapter parameter +was ligated to the 3\' end of the sequence. This is the correct assumption for +at least the SOLiD and Illumina small RNA protocols and probably others. 
+The assumption is enforced by the alignment algorithm, which only finds the adapter +when its starting position is within the read. In other words, the 5' base of +the adapter must appear within the read. The adapter and all bases following +it are removed. + +If, on the other hand, your adapter can also be ligated to the 5' end (on +purpose or by accident), you should tell cutadapt so by using the Anywhere Adapter +parameter. It will then use a slightly different alignment algorithm +(so-called semiglobal alignment), which allows any type of overlap between the +adapter and the sequence. In particular, the adapter may appear only partially +in the beginning of the read, like this:: + + PTERMYSEQUENCE + +The decision which part of the read to remove is made as follows: If there is at +least one base before the found adapter, then the adapter is considered to be +a 3' adapter and the adapter itself and everything following it is removed. +Otherwise, the adapter is considered to be a 5' adapter and it is removed from +the read. + +Here are some examples, which may make this clearer (left: read, right: trimmed +read):: + + MYSEQUENCEADAPTER -> MYSEQUENCE (3' adapter) + MADAPTER -> M (3' adapter) + ADAPTERMYSEQUENCE -> MYSEQUENCE (5' adapter) + PTERMYSEQUENCE -> MYSEQUENCE (5' adapter) + +The regular algorithm (3' Adapter) would trim the first two examples in the same way, +but trim the third to an empty sequence and trim the fourth not at all. + + +Format of the info file +----------------------- +The info file contains information about the found adapters. The output is a tab-separated text file. Each line corresponds to one read of the input file. The fields are: + +1. Read name +2. Number of errors +3. 0-based start coordinate of the adapter match +4. 0-based end coordinate of the adapter match +5. Sequence of the read to the left of the adapter match (can be empty) +6. Sequence of the read that was matched to the adapter +7. 
Sequence of the read to the right of the adapter match (can be empty) +8. Name of the found adapter. + +The concatenation of the fields 5-7 yields the full read sequence. In column 8, adapters without a name are numbered starting from 1. + +If no adapter was found, the format is as follows: + +1. Read name +2. The value -1 +3. The read sequence + +When parsing that file, be aware that additional columns may be added in the future. Note also that some fields can be empty, resulting in consecutive tabs within a line. Also, in the current version, when the *Match times* option is set to a value other than 1 (the default value), multiple lines are written to the info file for each read. + +.. _cutadapt: http://code.google.com/p/cutadapt/ + + + diff -r 000000000000 -r 65c3f5d933a3 cutadapt_adapters.txt.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cutadapt_adapters.txt.sample Mon Oct 06 13:47:56 2014 -0400 @@ -0,0 +1,14 @@ +# +# Adapter/Linker sequences for FASTX-Clipper tool. +# Also used by cutadapt tool +# +# Format: +# Adapter Sequence Descriptive name +# +# Example: +# AAATTTGATAAGATA Our-Adapter +# +# Some adapters can be found here: +# http://seqanswers.com/forums/showthread.php?t=198 + +TGTAGGCC Dummy-Adapter (do not use me) diff -r 000000000000 -r 65c3f5d933a3 test-data/cutadapt_discard.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cutadapt_discard.out Mon Oct 06 13:47:56 2014 -0400 @@ -0,0 +1,4 @@ +@prefix:1_13_1440/1 +CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC ++ +<=A:A=57!7<';<6?5;;6:+:=)71>70<,=: diff -r 000000000000 -r 65c3f5d933a3 test-data/cutadapt_rest.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cutadapt_rest.fa Mon Oct 06 13:47:56 2014 -0400 @@ -0,0 +1,10 @@ +>read1 +TESTINGADAPTERREST1 +>read2 +TESTINGADAPTERRESTING +>read3 +TESTINGADAPTER +>read4 +TESTINGADAPTERRESTLESS +>read5 +TESTINGADAPTERRESTORE diff -r 000000000000 -r 65c3f5d933a3 test-data/cutadapt_rest.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 
+++ b/test-data/cutadapt_rest.out Mon Oct 06 13:47:56 2014 -0400 @@ -0,0 +1,10 @@ +>read1 +TESTING +>read2 +TESTING +>read3 +TESTING +>read4 +TESTING +>read5 +TESTING diff -r 000000000000 -r 65c3f5d933a3 test-data/cutadapt_rest2.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cutadapt_rest2.out Mon Oct 06 13:47:56 2014 -0400 @@ -0,0 +1,4 @@ +REST1 +RESTING +RESTLESS +RESTORE diff -r 000000000000 -r 65c3f5d933a3 test-data/cutadapt_small.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cutadapt_small.fastq Mon Oct 06 13:47:56 2014 -0400 @@ -0,0 +1,12 @@ +@prefix:1_13_573/1 +CGTCCGAANTAGCTACCACCCTGATTAGACAAAT ++ +)3%)&&&&!.1&(6:<'67..*,:75)'77&&&5 +@prefix:1_13_1259/1 +AGCCGCTANGACGGGTTGGCCCTTAGACGTATCT ++ +;<:&:A;A!9<<<,7:<=3=;:<&70<,=: diff -r 000000000000 -r 65c3f5d933a3 test-data/cutadapt_small.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cutadapt_small.out Mon Oct 06 13:47:56 2014 -0400 @@ -0,0 +1,12 @@ +@prefix:1_13_573/1 +CGTCCGAANTAGCTACCACCCTGA ++ +)3%)&&&&!.1&(6:<'67..*,: +@prefix:1_13_1259/1 +AGCCGCTANGACGGGTTGGCCC ++ +;<:&:A;A!9<<<,7:<=3=;: +@prefix:1_13_1440/1 +CAAGATCTNCCCTGCCACATTGCCCTAGTTAAAC ++ +<=A:A=57!7<';<6?5;;6:+:=)71>70<,=: diff -r 000000000000 -r 65c3f5d933a3 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Oct 06 13:47:56 2014 -0400 @@ -0,0 +1,6 @@ + + + + + +