Mercurial > repos > drosofff > yac_clipper
changeset 1:e5ef40107f54 draft
Uploaded
author | mvdbeek |
---|---|
date | Mon, 30 Mar 2015 11:40:52 -0400 |
parents | 2445856981a1 |
children | a70911a01f36 |
files | README.txt test-data/yac_fastq.out yac.py yac.xml |
diffstat | 4 files changed, 195 insertions(+), 128 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Mon Mar 30 11:40:52 2015 -0400 @@ -0,0 +1,11 @@ +This tool clips adapter sequences from a fastq file and outputs either a +fasta or fastq file of clipped reads with renumbered fasta/fastq headers. + +Clipped sequences with Ns can be discarded. + +Min size and max size filter clipped reads on their size. + +Note that unclipped reads that satisfy the min and max size conditions are kept. + +Homepage: drosophile.org +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/yac_fastq.out Mon Mar 30 11:40:52 2015 -0400 @@ -0,0 +1,24 @@ +@HWI-1 +TGTAAACATCCCCGACTGGCAGC ++ +B@BBCBCCBCBCCC8A<@##### +@HWI-2 +AAAGTGCTACTACTTTTGAGTCT ++ +BAA@7?A@@A@@B<'25?6>59: +@HWI-3 +ACTGGACTTGGAGTCCGAAGGC ++ +BBB@@ABAAB?9B42&9;#### +@HWI-4 +AAGTGCCGCCAGGTTTTGAGTGG ++ +AB?5;3>/=?>=;416481#### +@HWI-5 +TATTGCACTTGTCCCGGCCTGAATCNCGT ++ +BCB=:ACCBB=>BB8<-############ +@HWI-6 +TAGCTTATCAGACTGATGTTGAC ++ +BBBBBCBBCB;>AA',9=18?1:
--- a/yac.py Mon Nov 03 09:34:45 2014 -0500 +++ b/yac.py Mon Mar 30 11:40:52 2015 -0400 @@ -1,91 +1,108 @@ #!/usr/bin/python # yac = yet another clipper +# v 1.2.1 - 23-08-2014 - Support FastQ output # v 1.1.0 - 23-08-2014 - argparse implementation # Usage yac.py $input $output $adapter_to_clip $min $max $Nmode # Christophe Antoniewski <drosofff@gmail.com> -import sys, string, argparse +import sys +import string +import argparse +from itertools import islice + def Parser(): - the_parser = argparse.ArgumentParser() - the_parser.add_argument('--input', action="store", type=str, help="input fastq file") - the_parser.add_argument('--output', action="store", type=str, help="output, clipped fasta file") - the_parser.add_argument('--adapter_to_clip', action="store", type=str, help="adapter sequence to clip") - the_parser.add_argument('--min', action="store", type=int, help="minimal size of clipped sequence to keep") - the_parser.add_argument('--max', action="store", type=int, help="maximal size of clipped sequence to keep") - the_parser.add_argument('--Nmode', action="store", type=str, choices=["accept", "reject"], help="accept or reject sequences with N for clipping") - args = the_parser.parse_args() - args.adapter_to_clip = args.adapter_to_clip.upper() - return args + the_parser = argparse.ArgumentParser() + the_parser.add_argument( + '--input', action="store", nargs='+', help="input fastq files") + the_parser.add_argument( + '--output', action="store", type=str, help="output, clipped fasta file") + the_parser.add_argument( + '--output_format', action="store", type=str, help="output format, fasta or fastq") + the_parser.add_argument( + '--adapter_to_clip', action="store", type=str, help="adapter sequence to clip") + the_parser.add_argument( + '--min', action="store", type=int, help="minimal size of clipped sequence to keep") + the_parser.add_argument( + '--max', action="store", type=int, help="maximal size of clipped sequence to keep") + the_parser.add_argument('--Nmode', action="store", type=str, choices=[ + "accept", "reject"], help="accept or reject sequences with N for clipping") + args = the_parser.parse_args() + args.adapter_to_clip = args.adapter_to_clip.upper() + return args class Clip: - def __init__(self, inputfile, outputfile, adapter, minsize, maxsize): - self.inputfile = inputfile - self.outputfile = outputfile - self.adapter = adapter - self.minsize = int(minsize) - self.maxsize = int(maxsize) - def motives (sequence): - '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module''' - sequencevariants = [sequence[0:6]] # initializes the list with the 6mer perfect match - dicsubst= {"A":"TGCN", "T":"AGCN", "G":"TACN", "C":"GATN"} - for pos in enumerate(sequence[:6]): - for subst in dicsubst[pos[1]]: - sequencevariants.append(sequence[:pos[0]]+ subst + sequence[pos[0]+1:7]) - return sequencevariants - self.adaptmotifs= motives(self.adapter) + + def __init__(self, inputfile, outputfile, output_format, adapter, minsize, maxsize, Nmode): + self.inputfile = inputfile + self.outputfile = outputfile + self.output_format = output_format + self.adapter = adapter + self.minsize = int(minsize) + self.maxsize = int(maxsize) + self.Nmode = Nmode + + def motives(sequence): + '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module''' + sequencevariants = [ + sequence[0:6]] # initializes the list with the 6mer perfect match + dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"} + for pos in enumerate(sequence[:6]): + for subst in dicsubst[pos[1]]: + sequencevariants.append( + sequence[:pos[0]] + subst + sequence[pos[0] + 1:7]) + return sequencevariants + self.adaptmotifs = motives(self.adapter) + + def scanadapt(self, adaptmotives=[], sequence="", qscore=""): + '''scans sequence for adapter motives''' + match_position = sequence.rfind(adaptmotives[0]) + if match_position != -1: + return sequence[:match_position], qscore[:match_position] + for motif in adaptmotives[1:]: + match_position = sequence.rfind(motif) + if match_position != -1: + return sequence[:match_position], qscore[:match_position] + return sequence, qscore - def scanadapt(self, adaptmotives=[], sequence=""): - '''scans sequence for adapter motives''' - if sequence.rfind(adaptmotives[0]) != -1: - return sequence[:sequence.rfind(adaptmotives[0])] - for motif in adaptmotives[1:]: - if sequence.rfind(motif) != -1: - return sequence[:sequence.rfind(motif)] - return sequence + def write_output(self, id, read, qscore, output): + if self.output_format == "fasta": + block = ">{0}\n{1}\n".format(id, read) + else: + block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore) + output.write(block) - def clip_with_N (self): - '''clips adapter sequences from inputfile. - Reads containing N are retained.''' - iterator = 0 + def handle_io(self): + '''Open input file, pass read sequence and read qscore to clipping function. + Pass clipped read and qscore to output function.''' + id = 0 + output = open(self.outputfile, "a") + with open(self.inputfile, "r") as input: + block_gen = islice(input, 1, None, 2) + for i, line in enumerate(block_gen): + if i % 2: + qscore = line.rstrip() + else: + read = line.rstrip() + continue + trimmed_read, trimmed_qscore = self.scanadapt( + self.adaptmotifs, read, qscore) + if self.minsize <= len(trimmed_read) <= self.maxsize: + if (self.Nmode == "reject") and ("N" in trimmed_read): + continue + id += 1 + self.write_output(id, trimmed_read, trimmed_qscore, output) + output.close() + + +def main(*argv): + instanceClip = Clip(*argv) + instanceClip.handle_io() + +if __name__ == "__main__": + args = Parser() id = 0 - F = open (self.inputfile, "r") - O = open (self.outputfile, "w") - for line in F: - iterator += 1 - if iterator % 4 == 2: - trim = self.scanadapt (self.adaptmotifs, line.rstrip() ) - if self.minsize <= len(trim) <= self.maxsize: - id += 1 - print >> O, ">%i\n%s" % (id, trim) - F.close() - O.close() - def clip_without_N (self): - '''clips adapter sequences from inputfile. - Reads containing N are rejected.''' - iterator = 0 - id = 0 - F = open (self.inputfile, "r") - O = open (self.outputfile, "w") - for line in F: - iterator += 1 - if iterator % 4 == 2: - trim = self.scanadapt (self.adaptmotifs, line.rstrip() ) - if "N" in trim: continue - if self.minsize <= len(trim) <= self.maxsize: - id += 1 - print >> O, ">%i\n%s" % (id, trim) - F.close() - O.close() - -def __main__ (inputfile, outputfile, adapter, minsize, maxsize, Nmode): - instanceClip = Clip (inputfile, outputfile, adapter, minsize, maxsize) - if Nmode == "accept": - instanceClip.clip_with_N() - else: - instanceClip.clip_without_N() - -if __name__ == "__main__" : - args = Parser() - __main__(args.input, args.output, args.adapter_to_clip, args.min, args.max, args.Nmode) + for inputfile in args.input: + main(inputfile, args.output, args.output_format, + args.adapter_to_clip, args.min, args.max, args.Nmode)
--- a/yac.xml Mon Nov 03 09:34:45 2014 -0500 +++ b/yac.xml Mon Mar 30 11:40:52 2015 -0400 @@ -1,64 +1,79 @@ - <tool id="yac" name="Clip adapter" version="1.1.0"> - <description></description> - <command interpreter="python">yac.py --input $input +<tool id="yac" name="Clip adapter" version="1.3.3"> + <description /> + <command interpreter="python">yac.py --input $input --output $output + --output_format "$out_format" --adapter_to_clip $clip_source.clip_sequence --min $min --max $max --Nmode $Nmode </command> - <inputs> - <param format="fastq" name="input" type="data" label="Source file"/> - <param name="min" type="integer" size="4" value="15" label="min size"/> - <param name="max" type="integer" size="4" value="36" label="max size"/> - <param name="Nmode" type="select" label="Accept reads containing N?"> - <option value="accept" selected="True">accept</option> - <option value="reject">reject</option> - </param> - <conditional name="clip_source"> - <param name="clip_source_list" type="select" label="Source" help="Built-in adapters or User-provided"> - <option value="prebuilt" selected="True">Use a built-in adapter (select from the list below)</option> - <option value="user">Use custom sequence</option> - </param> - <when value="prebuilt"> - <param name="clip_sequence" type="select" label="Select Adapter to clip" help="if your adapter is not listed, input your own sequence"> - <option value="TCGTATGCCGTCTTCTGCTTG">Solexa TCGTATGCCGTCTTCTGCTTG</option> - <option value="ATCTCGTATGCCGTCTTCTGCTT">Illumina ATCTCGTATGCCGTCTTCTGCTT</option> - <option value="TGGAATTCTCGGGTGCCAAG" selected="True">Illumina TruSeq TGGAATTCTCGGGTGCCAAG</option> - <option value="CTGTAGGCACCATCAATCGT">IdT CTGTAGGCACCATCAATCGT</option> - </param> - </when> - <when value="user"> - <param name="clip_sequence" type="text" size="35" label="Enter your Sequence" value="GAATCC"/> - </when> - </conditional> - </inputs> - <outputs> - <data format="fasta" name="output" metadata="input" /> - </outputs> + <inputs> + <param format="fastq" label="Source file" name="input" type="data" /> + <param label="min size" name="min" size="4" type="integer" value="15" /> + <param label="max size" name="max" size="4" type="integer" value="36" /> + <param label="Select output format" name="out_format" type="select"> + <option selected="true" value="fasta">Fasta format</option> + <option value="fastq">Fastq format</option> + </param> + <param label="Accept reads containing N?" name="Nmode" type="select"> + <option selected="True" value="accept">accept</option> + <option value="reject">reject</option> + </param> + <conditional name="clip_source"> + <param help="Built-in adapters or User-provided" label="Source" name="clip_source_list" type="select"> + <option selected="True" value="prebuilt">Use a built-in adapter (select from the list below)</option> + <option value="user">Use custom sequence</option> + </param> + <when value="prebuilt"> + <param help="if your adapter is not listed, input your own sequence" label="Select Adapter to clip" name="clip_sequence" type="select"> + <option value="TCGTATGCCGTCTTCTGCTTG">Solexa TCGTATGCCGTCTTCTGCTTG</option> + <option value="ATCTCGTATGCCGTCTTCTGCTT">Illumina ATCTCGTATGCCGTCTTCTGCTT</option> + <option selected="True" value="TGGAATTCTCGGGTGCCAAG">Illumina TruSeq TGGAATTCTCGGGTGCCAAG</option> + <option value="CTGTAGGCACCATCAATCGT">IdT CTGTAGGCACCATCAATCGT</option> + </param> + </when> + <when value="user"> + <param label="Enter your Sequence" name="clip_sequence" size="35" type="text" value="GAATCC" /> + </when> + </conditional> + </inputs> + <outputs> + <data format="fasta" metadata="input" name="output" /> + <change_format> + <when format="fastq" input="out_format" value="fastq" /> + </change_format> + </outputs> + <tests> + <test> + <param ftype="fastqsanger" name="input" value="yac.fastq" /> + <param name="min" value="18" /> + <param name="max" value="29" /> + <param name="clip_source_list" value="prebuilt" /> + <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" /> + <param name="Nmode" value="accept" /> + <output file="yac.out" name="output" /> + </test> + <test> + <param ftype="fastqsanger" name="input" value="yac.fastq" /> + <param name="min" value="18" /> + <param name="max" value="29" /> + <param name="clip_source_list" value="prebuilt" /> + <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" /> + <param name="Nmode" value="accept" /> + <param name="out_format" value="fastq" /> + <output file="yac_fastq.out" name="output" /> + </test> + </tests> + <help> +This tool clips adapter sequences from a fastq file and outputs either a +fasta or fastq file of clipped reads with renumbered fasta/fastq headers. - <help> -<!-- write a decent doc ! --> -This tool clips adapter sequences from a fastq file and fasta file of clipped reads with renumbered fasta headers. - -Clipped sequences with Ns can be discarded. +By defualt clipped sequences with unknown nucleotides are kept, but +can be discarded by setting "Accept reads containing N?" to reject. Min size and max size filter clipped reads on their size. Note that unclipped reads that satisfy the min and max size conditions are kept. </help> - -<!-- write a <test> section --> - <tests> - <test> - <param name="input" value="yac.fastq" ftype="fastqsanger"/> - <param name="min" value="18" /> - <param name="max" value="29" /> - <param name="clip_source_list" value="prebuilt" /> - <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" /> - <param name="Nmode" value="accept" /> - <output name="output" file="yac.out" /> - </test> - </tests> - </tool>