Mercurial > repos > artbio > yac_clipper

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.txt	Wed Mar 17 22:08:17 2021 +0000
@@ -0,0 +1,11 @@
+This tool clips adapter sequences from a fastq or fasta file and outputs either
+a fasta or fastq file of clipped reads with renumbered fasta/fastq headers.
+
+Clipped sequences with Ns can be discarded.
+
+Min size and max size filter clipped reads on their size.
+
+Note that unclipped reads that satisfy the min and max size conditions are kept.
+
+Homepage: drosophile.org
+Repository development: https://bitbucket.org/drosofff/gedtools/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out.fasta	Wed Mar 17 22:08:17 2021 +0000
@@ -0,0 +1,12 @@
+>1
+TGTAAACATCCCCGACTGGCAGC
+>2
+AAAGTGCTACTACTTTTGAGTCT
+>3
+ACTGGACTTGGAGTCCGAAGGC
+>4
+AAGTGCCGCCAGGTTTTGAGTGG
+>5
+TATTGCACTTGTCCCGGCCTGAATCNCGT
+>6
+TAGCTTATCAGACTGATGTTGAC
Binary file test-data/out.fasta.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out.fastqsanger	Wed Mar 17 22:08:17 2021 +0000
@@ -0,0 +1,24 @@
+@HWI-1
+TGTAAACATCCCCGACTGGCAGC
++
+B@BBCBCCBCBCCC8A<@#####
+@HWI-2
+AAAGTGCTACTACTTTTGAGTCT
++
+BAA@7?A@@A@@B<'25?6>59:
+@HWI-3
+ACTGGACTTGGAGTCCGAAGGC
++
+BBB@@ABAAB?9B42&9;####
+@HWI-4
+AAGTGCCGCCAGGTTTTGAGTGG
++
+AB?5;3>/=?>=;416481####
+@HWI-5
+TATTGCACTTGTCCCGGCCTGAATCNCGT
++
+BCB=:ACCBB=>BB8<-############
+@HWI-6
+TAGCTTATCAGACTGATGTTGAC
++
+BBBBBCBBCB;>AA',9=18?1:
Binary file test-data/out.fastqsanger.gz has changed
Binary file test-data/out_with_empty_reads.fasta.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/yac.fasta	Wed Mar 17 22:08:17 2021 +0000
@@ -0,0 +1,20 @@
+>1
+TGTAAACATCCCCGACTGGCAGCATNTCGTATGCCG
+>2
+AAAGTGCTACTACTTTTGAGTCTATNTCGTACGCCG
+>3
+TAGCTTATCAGACTGATGTTGACACNTCGTATGCCG
+>4
+ACTGGACTTGGAGTCCGAAGGCATCNCGTATTCCGT
+>5
+AAGTGCCGCCAGGTTTTGAGTGGATNTCGTATGGCG
+>6
+TATTGCACTTGTCCCGGCCTGAATCNCGTATCCCGT
+>7
+TGGTAGACTATGGAACGTAGGATCTNGCATGCCGCC
+>8
+AGTGGTAGAGCATTTGAATCTCGTANGCCGTCTTCT
+>9
+TAGCTTATCAGACTGATGTTGACATNTCGTACGCCG
+>10
+TTTGGCAATGGTAGAACTCCCACACNTCGTAGGCCG
Binary file test-data/yac.fasta.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/yac.fastq	Wed Mar 17 22:08:17 2021 +0000
@@ -0,0 +1,40 @@
+@SRR290479.1 HWI-EAS285:2:1:66:28/1
+TGTAAACATCCCCGACTGGCAGCATNTCGTATGCCG
++
+B@BBCBCCBCBCCC8A<@##################
+@SRR290479.2 HWI-EAS285:2:1:67:348/1
+AAAGTGCTACTACTTTTGAGTCTATNTCGTACGCCG
++
+BAA@7?A@@A@@B<'25?6>59:;7#<?########
+@SRR290479.3 HWI-EAS285:2:1:68:826/1
+TAGCTTATCAGACTGATGTTGACACNTCGTATGCCG
++
+BB@BBCCBCCBBB:%%83/>B7@44#;;324'117?
+@SRR290479.4 HWI-EAS285:2:1:68:65/1
+ACTGGACTTGGAGTCCGAAGGCATCNCGTATTCCGT
++
+BBB@@ABAAB?9B42&9;##################
+@SRR290479.5 HWI-EAS285:2:1:69:594/1
+AAGTGCCGCCAGGTTTTGAGTGGATNTCGTATGGCG
++
+AB?5;3>/=?>=;416481#################
+@SRR290479.6 HWI-EAS285:2:1:70:700/1
+TATTGCACTTGTCCCGGCCTGAATCNCGTATCCCGT
++
+BCB=:ACCBB=>BB8<-###################
+@SRR290479.7 HWI-EAS285:2:1:70:1679/1
+TGGTAGACTATGGAACGTAGGATCTNGCATGCCGCC
++
+BCBBCCBCCCBCCA?AB>:B@><>############
+@SRR290479.8 HWI-EAS285:2:1:71:1400/1
+AGTGGTAGAGCATTTGAATCTCGTANGCCGTCTTCT
++
+7@BC>>@55CCBCA3CBA14B.A16#*;9359B###
+@SRR290479.9 HWI-EAS285:2:1:71:795/1
+TAGCTTATCAGACTGATGTTGACATNTCGTACGCCG
++
+BBBBBCBBCB;>AA',9=18?1:7:#<;57######
+@SRR290479.10 HWI-EAS285:2:1:71:596/1
+TTTGGCAATGGTAGAACTCCCACACNTCGTAGGCCG
++
+B@B>7>9A@<46B@79972#################
Binary file test-data/yac.fastqsanger.gz has changed
Binary file test-data/yac_with_empty_reads.fastqsanger.gz has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/yac.py	Wed Mar 17 22:08:17 2021 +0000
@@ -0,0 +1,156 @@
+#!/usr/bin/python
+# yac = yet another clipper
+# v 1.2.1 - 23-08-2014 - Support FastQ output
+# v 1.1.0 - 23-08-2014 - argparse implementation
+# Christophe Antoniewski <drosofff@gmail.com>
+
+import argparse
+from itertools import islice
+
+
+def Parser():
+    the_parser = argparse.ArgumentParser()
+    the_parser.add_argument(
+        '--input', action="store", nargs='+', help="input fastq files")
+    the_parser.add_argument(
+        '--output', action="store", type=str,
+        help="output, clipped fasta file")
+    the_parser.add_argument(
+        '--output_format', action="store", type=str,
+        help="output format, fasta or fastq")
+    the_parser.add_argument(
+        '--adapter_to_clip', action="store", type=str,
+        help="adapter sequence to clip")
+    the_parser.add_argument(
+        '--min', action="store", type=int,
+        help="minimal size of clipped sequence to keep")
+    the_parser.add_argument(
+        '--max', action="store", type=int,
+        help="maximal size of clipped sequence to keep")
+    the_parser.add_argument('--Nmode', action="store", type=str, choices=[
+                            "accept", "reject"],
+                            help="accept or reject Ns in clipped sequences")
+    args = the_parser.parse_args()
+    args.adapter_to_clip = args.adapter_to_clip.upper()
+    return args
+
+
+class Clip:
+
+    def __init__(self, inputfile, outputfile, output_format,
+                 adapter, minsize, maxsize, Nmode):
+        self.inputfile = inputfile
+        self.outputfile = outputfile
+        self.output_format = output_format
+        self.adapter = adapter
+        self.minsize = int(minsize)
+        self.maxsize = int(maxsize)
+        self.Nmode = Nmode
+        for line in open(inputfile):
+            if line[0] == "@":
+                self.inputformat = "fastq"
+                break
+            elif line[0] == ">":
+                self.inputformat = "fasta"
+
+        def motives(sequence):
+            '''
+            return a list of motives for perfect (6nt) or
+            imperfect (7nt with one mismatch) search on import string module
+            '''
+            sequencevariants = [
+                sequence[0:6]]  # initializes list with 6mer perfect match
+            dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"}
+            for pos in enumerate(sequence[:6]):
+                for subst in dicsubst[pos[1]]:
+                    sequencevariants.append(
+                        sequence[:pos[0]] + subst + sequence[pos[0] + 1:7])
+            return sequencevariants
+        self.adaptmotifs = motives(self.adapter)
+
+    def scanadapt(self, adaptmotives=[], sequence="", qscore=""):
+        '''scans sequence for adapter motives'''
+        match_position = sequence.rfind(adaptmotives[0])
+        if qscore:
+            if match_position != -1:
+                return sequence[:match_position], qscore[:match_position]
+            for motif in adaptmotives[1:]:
+                match_position = sequence.rfind(motif)
+                if match_position != -1:
+                    return sequence[:match_position], qscore[:match_position]
+            return sequence, qscore
+        else:
+            if match_position != -1:
+                return sequence[:match_position]
+            for motif in adaptmotives[1:]:
+                match_position = sequence.rfind(motif)
+                if match_position != -1:
+                    return sequence[:match_position]
+            return sequence
+
+    def write_output(self, id, read, qscore, output):
+        if self.output_format == "fasta":
+            block = ">{0}\n{1}\n".format(id, read)
+        else:
+            block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore)
+        output.write(block)
+
+    def fasta_in_write_output(self, id, read, output):
+        output.write(">{0}\n{1}\n".format(id, read))
+
+    def handle_io_fastq(self):
+        '''Open input fastq file, pass read sequence and read qscore to
+        scanadapt function. Pass clipped read and qscore to output function.'''
+        id = 0
+        output = open(self.outputfile, "a")
+        with open(self.inputfile, "r") as input:
+            block_gen = islice(input, 1, None, 2)
+            for i, line in enumerate(block_gen):
+                if i % 2:
+                    qscore = line.rstrip()
+                else:
+                    read = line.rstrip()
+                    continue
+                try:
+                    trimmed_read, trimmed_qscore = self.scanadapt(
+                        self.adaptmotifs, read, qscore)
+                except ValueError:
+                    continue
+                if self.minsize <= len(trimmed_read) <= self.maxsize:
+                    if (self.Nmode == "reject") and ("N" in trimmed_read):
+                        continue
+                    id += 1
+                    self.write_output(id, trimmed_read, trimmed_qscore, output)
+        output.close()
+
+    def handle_io_fasta(self):
+        '''Open input fasta file, pass header and read sequence to scanadapt
+        function. Pass clipped read and qscore to output function.'''
+        id = 0
+        output = open(self.outputfile, "a")
+        with open(self.inputfile, "r") as input:
+            block_gen = islice(input, 1, None, 2)
+            for i, line in enumerate(block_gen):
+                read = line.rstrip()
+                trimmed_read = self.scanadapt(self.adaptmotifs, read)
+                if self.minsize <= len(trimmed_read) <= self.maxsize:
+                    if (self.Nmode == "reject") and ("N" in trimmed_read):
+                        continue
+                    id += 1
+                    self.fasta_in_write_output(id, trimmed_read, output)
+        output.close()
+
+
+def main(*argv):
+    instanceClip = Clip(*argv)
+    if instanceClip.inputformat == "fasta":
+        instanceClip.handle_io_fasta()
+    else:
+        instanceClip.handle_io_fastq()
+
+
+if __name__ == "__main__":
+    args = Parser()
+    for inputfile in args.input:
+        main(inputfile, args.output, args.output_format,
+             args.adapter_to_clip, args.min, args.max, args.Nmode)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/yac.xml	Wed Mar 17 22:08:17 2021 +0000
@@ -0,0 +1,169 @@
+<tool id="yac" name="Clip adapter" version="2.4.0">
+    <description />
+    <requirements>
+        <requirement type="package" version="3.7.6">python</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        python $__tool_directory__/yac.py
+            --input $input
+            --output 'clip.tmp'
+            --output_format
+            #if $out_format == 'fasta' or $out_format == 'fastagz':
+                'fasta'
+            #else
+                'fastq'
+            #end if
+            --adapter_to_clip $clip_source.clip_sequence
+            --min $min
+            --max $max
+            --Nmode $Nmode &&
+            #if ($out_format == 'fastagz') or ($out_format == 'fastqgz'):
+                gzip -c 'clip.tmp' >  $output
+            #else
+                mv clip.tmp $output
+            #end if
+    ]]></command>
+    <inputs>
+        <param format="fasta,fastq" label="Source file" name="input" type="data" />
+        <param label="min size" name="min" size="4" type="integer" value="15" />
+        <param label="max size" name="max" size="4" type="integer" value="36" />
+        <param label="Select output format" name="out_format" type="select"
+               help="be careful not to select a fastq format for your output if your input has a fasta format">
+            <option value="fasta">Fasta</option>
+            <option value="fastq" selected="true" >Fastq (Sanger)</option>
+            <option value="fastagz">gzipped Fasta</option>
+            <option value="fastqgz">gzipped Fastq (Sanger)</option>
+        </param>
+        <param label="Accept reads containing N?" name="Nmode" type="select">
+            <option selected="True" value="accept">accept</option>
+            <option value="reject">reject</option>
+        </param>
+        <conditional name="clip_source">
+            <param help="Built-in adapters or User-provided" label="Source" name="clip_source_list" type="select">
+                <option selected="True" value="prebuilt">Use a built-in adapter (select from the list below)</option>
+                <option value="user">Use custom sequence</option>
+            </param>
+            <when value="prebuilt">
+                <param help="if your adapter is not listed, input your own sequence" label="Select Adapter to clip" name="clip_sequence" type="select">
+                    <option value="TCGTATGCCGTCTTCTGCTTG">Solexa TCGTATGCCGTCTTCTGCTTG</option>
+                    <option value="ATCTCGTATGCCGTCTTCTGCTT">Illumina ATCTCGTATGCCGTCTTCTGCTT</option>
+                    <option selected="True" value="TGGAATTCTCGGGTGCCAAG">Illumina TruSeq  TGGAATTCTCGGGTGCCAAG</option>
+                    <option value="CTGTAGGCACCATCAATCGT">IdT CTGTAGGCACCATCAATCGT</option>
+                </param>
+            </when>
+            <when value="user">
+                <param label="Enter your Sequence" name="clip_sequence" size="35" type="text" value="GAATCC" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data format_source="input" metadata_source="input" name="output" label="Clipped ${input.name}-then-${out_format}">
+          <change_format>
+              <when input="out_format" value="fasta" format="fasta" />
+              <when input="out_format" value="fastq" format="fastqsanger" />
+              <when input="out_format" value="fastagz" format="fasta.gz" />
+              <when input="out_format" value="fastqgz" format="fastqsanger.gz" />
+          </change_format>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param ftype="fastqsanger" name="input" value="yac.fastq" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="Nmode" value="accept" />
+            <param name="out_format" value="fastq" />
+            <output file="out.fastqsanger" name="output" />
+        </test>
+        <test>
+            <param ftype="fastqsanger" name="input" value="yac.fastq" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="Nmode" value="accept" />
+            <param name="out_format" value="fasta" />
+            <output file="out.fasta" name="output" />
+        </test>
+        <test>
+            <param ftype="fastqsanger.gz" name="input" value="yac.fastqsanger.gz" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="Nmode" value="accept" />
+            <param name="out_format" value="fastqgz" />
+            <output file="out.fastqsanger.gz" name="output" decompress="True" />
+        </test>
+        <test>
+            <param ftype="fastqsanger.gz" name="input" value="yac.fastqsanger.gz" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="Nmode" value="accept" />
+            <param name="out_format" value="fastagz" />
+            <output file="out.fasta.gz" name="output" decompress="True" />
+        </test>
+        <test>
+            <param ftype="fasta.gz" name="input" value="yac.fasta.gz" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="out_format" value="fasta" />
+            <param name="Nmode" value="accept" />
+            <output file="out.fasta" name="output" />
+        </test>
+        <test>
+            <param ftype="fasta.gz" name="input" value="yac.fasta.gz" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="Nmode" value="accept" />
+            <param name="out_format" value="fastagz" />
+            <output file="out.fasta.gz" name="output" decompress="True" />
+        </test>
+        <test>
+            <param ftype="fastqsanger.gz" name="input" value="yac_with_empty_reads.fastqsanger.gz" />
+            <param name="min" value="18" />
+            <param name="max" value="30" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="TGGAATTCTCGGGTGCCAAG" />
+            <param name="Nmode" value="accept" />
+            <param name="out_format" value="fastagz" />
+            <output file="out_with_empty_reads.fasta.gz" name="output" decompress="True" />
+        </test>
+    </tests>
+    <help>
+
+**What it does**
+
++ Clips adapter sequences
++ Renumbers sequence headers
++ Filters sequences on their size
++ Filters sequences containing unknown nucleotides (optional)
+
+-------
+
+**Inputs**
+
+1. A fastq or fasta file of reads to be clipped
+2. Select the size of the reads to be kept
+3. Select an output format. When the input is a fastq file, this may be fastq or fasta, whereas
+when the input is a fasta file, the output can only be fasta.
+4. Select whether you wish or do not wish to keep clipped sequences with unknown nucleotides (N)
+5. Select a pre-built adapter sequence or enter your own sequence (at least 7 nucleotides long)
+
+-------
+
+**Output**
+
+A fastq or fasta file containing clipped sequences satisfying the selected criteria.
+
+    </help>
+    <citations />
+</tool>