changeset 1:e5ef40107f54 draft

Uploaded
author mvdbeek
date Mon, 30 Mar 2015 11:40:52 -0400
parents 2445856981a1
children a70911a01f36
files README.txt test-data/yac_fastq.out yac.py yac.xml
diffstat 4 files changed, 195 insertions(+), 128 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.txt	Mon Mar 30 11:40:52 2015 -0400
@@ -0,0 +1,11 @@
+This tool clips adapter sequences from a fastq file and outputs either a    
+fasta or fastq file of clipped reads with renumbered fasta/fastq headers.
+
+Clipped sequences with Ns can be discarded.
+
+Min size and max size filter clipped reads on their size.
+
+Note that unclipped reads that satisfy the min and max size conditions are kept.
+
+Homepage: drosophile.org
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/yac_fastq.out	Mon Mar 30 11:40:52 2015 -0400
@@ -0,0 +1,24 @@
+@HWI-1
+TGTAAACATCCCCGACTGGCAGC
++
+B@BBCBCCBCBCCC8A<@#####
+@HWI-2
+AAAGTGCTACTACTTTTGAGTCT
++
+BAA@7?A@@A@@B<'25?6>59:
+@HWI-3
+ACTGGACTTGGAGTCCGAAGGC
++
+BBB@@ABAAB?9B42&9;####
+@HWI-4
+AAGTGCCGCCAGGTTTTGAGTGG
++
+AB?5;3>/=?>=;416481####
+@HWI-5
+TATTGCACTTGTCCCGGCCTGAATCNCGT
++
+BCB=:ACCBB=>BB8<-############
+@HWI-6
+TAGCTTATCAGACTGATGTTGAC
++
+BBBBBCBBCB;>AA',9=18?1:
--- a/yac.py	Mon Nov 03 09:34:45 2014 -0500
+++ b/yac.py	Mon Mar 30 11:40:52 2015 -0400
@@ -1,91 +1,108 @@
 #!/usr/bin/python
 # yac = yet another clipper
+# v 1.2.1 - 23-08-2014 - Support FastQ output
 # v 1.1.0 - 23-08-2014 - argparse implementation
 # Usage yac.py  $input $output $adapter_to_clip $min $max $Nmode
 # Christophe Antoniewski <drosofff@gmail.com>
 
-import sys, string, argparse
+import sys
+import string
+import argparse
+from itertools import islice
+
 
 def Parser():
-  the_parser = argparse.ArgumentParser()
-  the_parser.add_argument('--input', action="store", type=str, help="input fastq file")
-  the_parser.add_argument('--output', action="store", type=str, help="output, clipped fasta file")
-  the_parser.add_argument('--adapter_to_clip', action="store", type=str, help="adapter sequence to clip")
-  the_parser.add_argument('--min', action="store", type=int, help="minimal size of clipped sequence to keep")
-  the_parser.add_argument('--max', action="store", type=int, help="maximal size of clipped sequence to keep")
-  the_parser.add_argument('--Nmode', action="store", type=str, choices=["accept", "reject"], help="accept or reject sequences with N for clipping")
-  args = the_parser.parse_args()
-  args.adapter_to_clip = args.adapter_to_clip.upper()
-  return args
+    the_parser = argparse.ArgumentParser()
+    the_parser.add_argument(
+        '--input', action="store", nargs='+', help="input fastq files")
+    the_parser.add_argument(
+        '--output', action="store", type=str, help="output, clipped fasta file")
+    the_parser.add_argument(
+        '--output_format', action="store", type=str, help="output format, fasta or fastq")
+    the_parser.add_argument(
+        '--adapter_to_clip', action="store", type=str, help="adapter sequence to clip")
+    the_parser.add_argument(
+        '--min', action="store", type=int, help="minimal size of clipped sequence to keep")
+    the_parser.add_argument(
+        '--max', action="store", type=int, help="maximal size of clipped sequence to keep")
+    the_parser.add_argument('--Nmode', action="store", type=str, choices=[
+                            "accept", "reject"], help="accept or reject sequences with N for clipping")
+    args = the_parser.parse_args()
+    args.adapter_to_clip = args.adapter_to_clip.upper()
+    return args
 
 
 class Clip:
-  def __init__(self, inputfile, outputfile, adapter, minsize, maxsize):
-    self.inputfile = inputfile
-    self.outputfile = outputfile
-    self.adapter = adapter
-    self.minsize = int(minsize)
-    self.maxsize = int(maxsize)
-    def motives (sequence):
-      '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module'''
-      sequencevariants = [sequence[0:6]] # initializes the list with the 6mer perfect match
-      dicsubst= {"A":"TGCN", "T":"AGCN", "G":"TACN", "C":"GATN"}
-      for pos in enumerate(sequence[:6]):
-        for subst in dicsubst[pos[1]]:
-          sequencevariants.append(sequence[:pos[0]]+ subst + sequence[pos[0]+1:7])
-      return sequencevariants
-    self.adaptmotifs= motives(self.adapter)
+
+    def __init__(self, inputfile, outputfile, output_format, adapter, minsize, maxsize, Nmode):
+        self.inputfile = inputfile
+        self.outputfile = outputfile
+        self.output_format = output_format
+        self.adapter = adapter
+        self.minsize = int(minsize)
+        self.maxsize = int(maxsize)
+        self.Nmode = Nmode
+
+        def motives(sequence):
+            '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module'''
+            sequencevariants = [
+                sequence[0:6]]  # initializes the list with the 6mer perfect match
+            dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"}
+            for pos in enumerate(sequence[:6]):
+                for subst in dicsubst[pos[1]]:
+                    sequencevariants.append(
+                        sequence[:pos[0]] + subst + sequence[pos[0] + 1:7])
+            return sequencevariants
+        self.adaptmotifs = motives(self.adapter)
+
+    def scanadapt(self, adaptmotives=[], sequence="", qscore=""):
+        '''scans sequence for adapter motives'''
+        match_position = sequence.rfind(adaptmotives[0])
+        if match_position != -1:
+            return sequence[:match_position], qscore[:match_position]
+        for motif in adaptmotives[1:]:
+            match_position = sequence.rfind(motif)
+            if match_position != -1:
+                return sequence[:match_position], qscore[:match_position]
+        return sequence, qscore
 
-  def scanadapt(self, adaptmotives=[], sequence=""):
-    '''scans sequence for adapter motives'''
-    if sequence.rfind(adaptmotives[0]) != -1:
-      return sequence[:sequence.rfind(adaptmotives[0])]
-    for motif in adaptmotives[1:]:
-      if sequence.rfind(motif) != -1:
-        return sequence[:sequence.rfind(motif)]
-    return sequence
+    def write_output(self, id, read, qscore, output):
+        if self.output_format == "fasta":
+            block = ">{0}\n{1}\n".format(id, read)
+        else:
+            block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore)
+        output.write(block)
 
-  def clip_with_N (self):
-    '''clips adapter sequences from inputfile. 
-    Reads containing N are retained.'''
-    iterator = 0
+    def handle_io(self):
+        '''Open input file, pass read sequence and read qscore to clipping function.
+        Pass clipped read and qscore to output function.'''
+        id = 0
+        output = open(self.outputfile, "a")
+        with open(self.inputfile, "r") as input:
+            block_gen = islice(input, 1, None, 2)
+            for i, line in enumerate(block_gen):
+                if i % 2:
+                    qscore = line.rstrip()
+                else:
+                    read = line.rstrip()
+                    continue
+                trimmed_read, trimmed_qscore = self.scanadapt(
+                    self.adaptmotifs, read, qscore)
+                if self.minsize <= len(trimmed_read) <= self.maxsize:
+                    if (self.Nmode == "reject") and ("N" in trimmed_read):
+                        continue
+                    id += 1
+                    self.write_output(id, trimmed_read, trimmed_qscore, output)
+            output.close()
+
+
+def main(*argv):
+    instanceClip = Clip(*argv)
+    instanceClip.handle_io()
+
+if __name__ == "__main__":
+    args = Parser()
     id = 0
-    F = open (self.inputfile, "r")
-    O = open (self.outputfile, "w")
-    for line in F:
-      iterator += 1
-      if iterator % 4 == 2:
-        trim = self.scanadapt (self.adaptmotifs, line.rstrip() )
-        if self.minsize <= len(trim) <= self.maxsize:
-          id += 1
-          print >> O, ">%i\n%s" % (id, trim)
-    F.close()
-    O.close()
-  def clip_without_N (self):
-    '''clips adapter sequences from inputfile. 
-    Reads containing N are rejected.'''
-    iterator = 0
-    id = 0
-    F = open (self.inputfile, "r")
-    O = open (self.outputfile, "w")
-    for line in F:
-      iterator += 1
-      if iterator % 4 == 2:
-        trim = self.scanadapt (self.adaptmotifs, line.rstrip() )
-        if "N" in trim: continue
-        if self.minsize <= len(trim) <= self.maxsize:
-          id += 1
-          print >> O, ">%i\n%s" % (id, trim)
-    F.close()
-    O.close()
-
-def __main__ (inputfile, outputfile, adapter, minsize, maxsize, Nmode):
-  instanceClip = Clip (inputfile, outputfile, adapter, minsize, maxsize)
-  if Nmode == "accept":
-    instanceClip.clip_with_N()
-  else:
-    instanceClip.clip_without_N()
-
-if __name__ == "__main__" :
-  args = Parser()
-  __main__(args.input, args.output, args.adapter_to_clip, args.min, args.max, args.Nmode)
+    for inputfile in args.input:
+        main(inputfile, args.output, args.output_format,
+             args.adapter_to_clip, args.min, args.max, args.Nmode)
--- a/yac.xml	Mon Nov 03 09:34:45 2014 -0500
+++ b/yac.xml	Mon Mar 30 11:40:52 2015 -0400
@@ -1,64 +1,79 @@
- <tool id="yac" name="Clip adapter" version="1.1.0">
-  <description></description>
-  <command interpreter="python">yac.py --input $input
+<tool id="yac" name="Clip adapter" version="1.3.3">
+    <description />
+    <command interpreter="python">yac.py --input $input
                                        --output $output
+                                       --output_format "$out_format"
                                        --adapter_to_clip $clip_source.clip_sequence
                                        --min $min
                                        --max $max
                                        --Nmode $Nmode
   </command>
-  <inputs>
-    <param format="fastq" name="input" type="data" label="Source file"/>
-    <param name="min" type="integer" size="4" value="15" label="min size"/>
-    <param name="max" type="integer" size="4" value="36" label="max size"/>
-    <param name="Nmode" type="select" label="Accept reads containing N?">
-        <option value="accept" selected="True">accept</option>
-        <option value="reject">reject</option>
-    </param>
-    <conditional name="clip_source">
-   	 <param name="clip_source_list" type="select" label="Source" help="Built-in adapters or User-provided">
-         	<option value="prebuilt" selected="True">Use a built-in adapter (select from the list below)</option>
-         	<option value="user">Use custom sequence</option>
-         </param>
-         <when value="prebuilt">
-         	<param name="clip_sequence" type="select" label="Select Adapter to clip" help="if your adapter is not listed, input your own sequence">
-                	<option value="TCGTATGCCGTCTTCTGCTTG">Solexa TCGTATGCCGTCTTCTGCTTG</option>
-                        <option value="ATCTCGTATGCCGTCTTCTGCTT">Illumina ATCTCGTATGCCGTCTTCTGCTT</option>
-                        <option value="TGGAATTCTCGGGTGCCAAG" selected="True">Illumina TruSeq  TGGAATTCTCGGGTGCCAAG</option>
-                        <option value="CTGTAGGCACCATCAATCGT">IdT CTGTAGGCACCATCAATCGT</option>
-		</param>
-	</when>
-        <when value="user">
-		 <param name="clip_sequence" type="text" size="35"  label="Enter your Sequence" value="GAATCC"/>
-	</when>
-    </conditional>
-  </inputs>
-  <outputs>
-    <data format="fasta" name="output"  metadata="input" />
-  </outputs>
+    <inputs>
+        <param format="fastq" label="Source file" name="input" type="data" />
+        <param label="min size" name="min" size="4" type="integer" value="15" />
+        <param label="max size" name="max" size="4" type="integer" value="36" />
+        <param label="Select output format" name="out_format" type="select">
+            <option selected="true" value="fasta">Fasta format</option>
+            <option value="fastq">Fastq format</option>
+        </param>
+        <param label="Accept reads containing N?" name="Nmode" type="select">
+            <option selected="True" value="accept">accept</option>
+            <option value="reject">reject</option>
+        </param>
+        <conditional name="clip_source">
+            <param help="Built-in adapters or User-provided" label="Source" name="clip_source_list" type="select">
+                <option selected="True" value="prebuilt">Use a built-in adapter (select from the list below)</option>
+                <option value="user">Use custom sequence</option>
+            </param>
+            <when value="prebuilt">
+                <param help="if your adapter is not listed, input your own sequence" label="Select Adapter to clip" name="clip_sequence" type="select">
+                    <option value="TCGTATGCCGTCTTCTGCTTG">Solexa TCGTATGCCGTCTTCTGCTTG</option>
+                    <option value="ATCTCGTATGCCGTCTTCTGCTT">Illumina ATCTCGTATGCCGTCTTCTGCTT</option>
+                    <option selected="True" value="TGGAATTCTCGGGTGCCAAG">Illumina TruSeq  TGGAATTCTCGGGTGCCAAG</option>
+                    <option value="CTGTAGGCACCATCAATCGT">IdT CTGTAGGCACCATCAATCGT</option>
+                </param>
+            </when>
+            <when value="user">
+                <param label="Enter your Sequence" name="clip_sequence" size="35" type="text" value="GAATCC" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data format="fasta" metadata="input" name="output" />
+        <change_format>
+            <when format="fastq" input="out_format" value="fastq" />
+        </change_format>
+    </outputs>
+    <tests>
+        <test>
+            <param ftype="fastqsanger" name="input" value="yac.fastq" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="Nmode" value="accept" />
+            <output file="yac.out" name="output" />
+        </test>
+        <test>
+            <param ftype="fastqsanger" name="input" value="yac.fastq" />
+            <param name="min" value="18" />
+            <param name="max" value="29" />
+            <param name="clip_source_list" value="prebuilt" />
+            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
+            <param name="Nmode" value="accept" />
+            <param name="out_format" value="fastq" />
+            <output file="yac_fastq.out" name="output" />
+        </test>
+    </tests>
+    <help>
+This tool clips adapter sequences from a fastq file and outputs either a
+fasta or fastq file of clipped reads with renumbered fasta/fastq headers.
 
-  <help>
-<!-- write a decent doc ! -->
-This tool clips adapter sequences from a fastq file and fasta file of clipped reads with renumbered fasta headers.
-
-Clipped sequences with Ns can be discarded.
+By defualt clipped sequences with unknown nucleotides are kept, but 
+can be discarded by setting "Accept reads containing N?" to reject.
 
 Min size and max size filter clipped reads on their size.
 
 Note that unclipped reads that satisfy the min and max size conditions are kept.
   </help>
-
-<!-- write a <test> section -->
-	<tests>
-                <test>
-                        <param name="input" value="yac.fastq" ftype="fastqsanger"/>
-                        <param name="min" value="18" />
-                        <param name="max" value="29" />
-                        <param name="clip_source_list" value="prebuilt" />
-                        <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
-                        <param name="Nmode" value="accept" />
-                        <output name="output" file="yac.out" />
-                </test>
-        </tests>
-
 </tool>