Mercurial > repos > artbio > yac_clipper
changeset 0:1dde0b3d5f6a draft default tip
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/yac_clipper commit 9c5f0b8e89dfe4347c610f42923f0acad2ecc81b"
| author | artbio | 
|---|---|
| date | Wed, 17 Mar 2021 22:08:17 +0000 | 
| parents | |
| children | |
| files | README.txt test-data/out.fasta test-data/out.fasta.gz test-data/out.fastqsanger test-data/out.fastqsanger.gz test-data/out_with_empty_reads.fasta.gz test-data/yac.fasta test-data/yac.fasta.gz test-data/yac.fastq test-data/yac.fastqsanger.gz test-data/yac_with_empty_reads.fastqsanger.gz yac.py yac.xml | 
| diffstat | 13 files changed, 432 insertions(+), 0 deletions(-) [+] | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Wed Mar 17 22:08:17 2021 +0000 @@ -0,0 +1,11 @@ +This tool clips adapter sequences from a fastq file and outputs either a +fasta or fastq file of clipped reads with renumbered fasta/fastq headers. + +Clipped sequences with Ns can be discarded. + +Min size and max size filter clipped reads on their size. + +Note that unclipped reads that satisfy the min and max size conditions are kept. + +Homepage: drosophile.org +Repository development: https://bitbucket.org/drosofff/gedtools/
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out.fasta Wed Mar 17 22:08:17 2021 +0000 @@ -0,0 +1,12 @@ +>1 +TGTAAACATCCCCGACTGGCAGC +>2 +AAAGTGCTACTACTTTTGAGTCT +>3 +ACTGGACTTGGAGTCCGAAGGC +>4 +AAGTGCCGCCAGGTTTTGAGTGG +>5 +TATTGCACTTGTCCCGGCCTGAATCNCGT +>6 +TAGCTTATCAGACTGATGTTGAC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out.fastqsanger Wed Mar 17 22:08:17 2021 +0000 @@ -0,0 +1,24 @@ +@HWI-1 +TGTAAACATCCCCGACTGGCAGC ++ +B@BBCBCCBCBCCC8A<@##### +@HWI-2 +AAAGTGCTACTACTTTTGAGTCT ++ +BAA@7?A@@A@@B<'25?6>59: +@HWI-3 +ACTGGACTTGGAGTCCGAAGGC ++ +BBB@@ABAAB?9B42&9;#### +@HWI-4 +AAGTGCCGCCAGGTTTTGAGTGG ++ +AB?5;3>/=?>=;416481#### +@HWI-5 +TATTGCACTTGTCCCGGCCTGAATCNCGT ++ +BCB=:ACCBB=>BB8<-############ +@HWI-6 +TAGCTTATCAGACTGATGTTGAC ++ +BBBBBCBBCB;>AA',9=18?1:
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/yac.fasta Wed Mar 17 22:08:17 2021 +0000 @@ -0,0 +1,20 @@ +>1 +TGTAAACATCCCCGACTGGCAGCATNTCGTATGCCG +>2 +AAAGTGCTACTACTTTTGAGTCTATNTCGTACGCCG +>3 +TAGCTTATCAGACTGATGTTGACACNTCGTATGCCG +>4 +ACTGGACTTGGAGTCCGAAGGCATCNCGTATTCCGT +>5 +AAGTGCCGCCAGGTTTTGAGTGGATNTCGTATGGCG +>6 +TATTGCACTTGTCCCGGCCTGAATCNCGTATCCCGT +>7 +TGGTAGACTATGGAACGTAGGATCTNGCATGCCGCC +>8 +AGTGGTAGAGCATTTGAATCTCGTANGCCGTCTTCT +>9 +TAGCTTATCAGACTGATGTTGACATNTCGTACGCCG +>10 +TTTGGCAATGGTAGAACTCCCACACNTCGTAGGCCG
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/yac.fastq Wed Mar 17 22:08:17 2021 +0000 @@ -0,0 +1,40 @@ +@SRR290479.1 HWI-EAS285:2:1:66:28/1 +TGTAAACATCCCCGACTGGCAGCATNTCGTATGCCG ++ +B@BBCBCCBCBCCC8A<@################## +@SRR290479.2 HWI-EAS285:2:1:67:348/1 +AAAGTGCTACTACTTTTGAGTCTATNTCGTACGCCG ++ +BAA@7?A@@A@@B<'25?6>59:;7#<?######## +@SRR290479.3 HWI-EAS285:2:1:68:826/1 +TAGCTTATCAGACTGATGTTGACACNTCGTATGCCG ++ +BB@BBCCBCCBBB:%%83/>B7@44#;;324'117? +@SRR290479.4 HWI-EAS285:2:1:68:65/1 +ACTGGACTTGGAGTCCGAAGGCATCNCGTATTCCGT ++ +BBB@@ABAAB?9B42&9;################## +@SRR290479.5 HWI-EAS285:2:1:69:594/1 +AAGTGCCGCCAGGTTTTGAGTGGATNTCGTATGGCG ++ +AB?5;3>/=?>=;416481################# +@SRR290479.6 HWI-EAS285:2:1:70:700/1 +TATTGCACTTGTCCCGGCCTGAATCNCGTATCCCGT ++ +BCB=:ACCBB=>BB8<-################### +@SRR290479.7 HWI-EAS285:2:1:70:1679/1 +TGGTAGACTATGGAACGTAGGATCTNGCATGCCGCC ++ +BCBBCCBCCCBCCA?AB>:B@><>############ +@SRR290479.8 HWI-EAS285:2:1:71:1400/1 +AGTGGTAGAGCATTTGAATCTCGTANGCCGTCTTCT ++ +7@BC>>@55CCBCA3CBA14B.A16#*;9359B### +@SRR290479.9 HWI-EAS285:2:1:71:795/1 +TAGCTTATCAGACTGATGTTGACATNTCGTACGCCG ++ +BBBBBCBBCB;>AA',9=18?1:7:#<;57###### +@SRR290479.10 HWI-EAS285:2:1:71:596/1 +TTTGGCAATGGTAGAACTCCCACACNTCGTAGGCCG ++ +B@B>7>9A@<46B@79972#################
#!/usr/bin/python
# yac = yet another clipper
# v 1.2.1 - 23-08-2014 - Support FastQ output
# v 1.1.0 - 23-08-2014 - argparse implementation
# Christophe Antoniewski <drosofff@gmail.com>
"""yac.py - clip a 3' adapter from FASTA/FASTQ reads.

Reads are scanned for the start of the adapter (a perfect 6-mer, or a 7-mer
carrying one substitution in its first six bases).  Everything from the match
onwards is removed, the remaining read is filtered on size and, optionally,
on the presence of N, and the kept reads are written with renumbered headers.
"""

import argparse
import gzip
from itertools import islice

# First two bytes of every gzip stream.
GZIP_MAGIC = b"\x1f\x8b"


def _open_text(path):
    """Open *path* for text reading, transparently decompressing gzip."""
    with open(path, "rb") as probe:
        is_gzip = probe.read(2) == GZIP_MAGIC
    if is_gzip:
        return gzip.open(path, "rt")
    return open(path, "r")


def Parser():
    """Parse the command line and return the argparse namespace.

    The adapter sequence is upper-cased before being returned.  Options
    without which the script cannot run are declared as required so that a
    missing option produces a usage message rather than a traceback.
    """
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument(
        '--input', action="store", nargs='+', required=True,
        help="input fastq files")
    the_parser.add_argument(
        '--output', action="store", type=str, required=True,
        help="output, clipped fasta file")
    the_parser.add_argument(
        '--output_format', action="store", type=str,
        help="output format, fasta or fastq")
    the_parser.add_argument(
        '--adapter_to_clip', action="store", type=str, required=True,
        help="adapter sequence to clip")
    the_parser.add_argument(
        '--min', action="store", type=int, required=True,
        help="minimal size of clipped sequence to keep")
    the_parser.add_argument(
        '--max', action="store", type=int, required=True,
        help="maximal size of clipped sequence to keep")
    the_parser.add_argument('--Nmode', action="store", type=str, choices=[
                            "accept", "reject"],
                            help="accept or reject Ns in clipped sequences")
    args = the_parser.parse_args()
    args.adapter_to_clip = args.adapter_to_clip.upper()
    return args


class Clip:
    """Clip an adapter from the reads of one FASTA or FASTQ file.

    Attributes set by the constructor:
        inputfile, outputfile, output_format, adapter, Nmode: as passed in.
        minsize, maxsize: inclusive size bounds, converted to int.
        inputformat: "fastq" or "fasta", detected from the first header line.
        adaptmotifs: list of adapter motifs searched for, perfect 6-mer first.
    """

    # Replacement bases tried at each of the first six adapter positions.
    # The order is significant: motifs are searched in list order and the
    # first one found decides where the read is cut.
    _SUBSTITUTIONS = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"}

    def __init__(self, inputfile, outputfile, output_format,
                 adapter, minsize, maxsize, Nmode):
        """Store the settings, detect the input format, build the motifs.

        Raises:
            ValueError: if no line of the input starts with "@" or ">", or
                if one of the first six adapter bases is not A, T, G or C.
        """
        self.inputfile = inputfile
        self.outputfile = outputfile
        self.output_format = output_format
        self.adapter = adapter
        self.minsize = int(minsize)
        self.maxsize = int(maxsize)
        self.Nmode = Nmode
        self.inputformat = self._detect_format(inputfile)
        self.adaptmotifs = self._motives(self.adapter)

    @staticmethod
    def _detect_format(path):
        """Return "fastq" or "fasta" from the first header line of *path*."""
        with _open_text(path) as handle:
            for line in handle:
                if line.startswith("@"):
                    return "fastq"
                if line.startswith(">"):
                    return "fasta"
        raise ValueError(
            "cannot determine the format of {0!r}: no line starts with "
            "'@' (fastq) or '>' (fasta)".format(path))

    @classmethod
    def _motives(cls, sequence):
        """Return the motifs used to locate the adapter in a read.

        The first motif is the perfect 6-mer starting the adapter.  It is
        followed by every 7-mer obtained by substituting one of the first
        six adapter bases (imperfect match with one mismatch).
        """
        variants = [sequence[:6]]
        for pos, base in enumerate(sequence[:6]):
            try:
                substitutes = cls._SUBSTITUTIONS[base]
            except KeyError:
                raise ValueError(
                    "adapter base {0!r} at position {1} is not one of "
                    "A, T, G, C (uppercase)".format(base, pos + 1)) from None
            for subst in substitutes:
                variants.append(
                    sequence[:pos] + subst + sequence[pos + 1:7])
        return variants

    def scanadapt(self, adaptmotives=None, sequence="", qscore=""):
        """Cut *sequence* at the rightmost occurrence of the first motif found.

        Motifs are tried in list order; the first one present in the read
        wins and the read is truncated at its rightmost occurrence.

        Returns:
            ``(sequence, qscore)`` when a non-empty *qscore* was given (both
            truncated at the same position), otherwise the sequence alone.
            The read is returned unchanged when no motif matches.
        """
        # Decide the return shape before truncating: a match at position 0
        # empties the quality string but must still yield a tuple.
        with_quality = bool(qscore)
        for motif in adaptmotives or ():
            match_position = sequence.rfind(motif)
            if match_position != -1:
                sequence = sequence[:match_position]
                qscore = qscore[:match_position]
                break
        if with_quality:
            return sequence, qscore
        return sequence

    def write_output(self, id, read, qscore, output):
        """Write one record to *output* as fasta or fastq.

        Any output_format other than "fasta" produces a fastq record.
        """
        if self.output_format == "fasta":
            block = ">{0}\n{1}\n".format(id, read)
        else:
            block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore)
        output.write(block)

    def fasta_in_write_output(self, id, read, output):
        """Write one fasta record; used when the input has no qualities."""
        output.write(">{0}\n{1}\n".format(id, read))

    def _passes_filters(self, trimmed_read):
        """Tell whether a clipped read satisfies the size and N filters."""
        if not self.minsize <= len(trimmed_read) <= self.maxsize:
            return False
        return not (self.Nmode == "reject" and "N" in trimmed_read)

    def handle_io_fastq(self):
        """Clip every read of the fastq input and append the kept ones.

        The input is expected to hold four lines per record.  Records whose
        quality line is empty are skipped.
        """
        read_count = 0
        # Append mode: the script writes several inputs to the same output.
        with _open_text(self.inputfile) as source, \
                open(self.outputfile, "a") as output:
            # Lines 1, 3, 5, ... alternate between sequence and quality.
            payload = islice(source, 1, None, 2)
            for read_line, qscore_line in zip(payload, payload):
                read = read_line.rstrip()
                qscore = qscore_line.rstrip()
                if not qscore:
                    continue
                trimmed_read, trimmed_qscore = self.scanadapt(
                    self.adaptmotifs, read, qscore)
                if not self._passes_filters(trimmed_read):
                    continue
                read_count += 1
                self.write_output(
                    read_count, trimmed_read, trimmed_qscore, output)

    def handle_io_fasta(self):
        """Clip every read of the fasta input and append the kept ones.

        The input is expected to hold two lines per record (header, then the
        whole sequence on one line).  Output is always fasta.
        """
        read_count = 0
        # Append mode: the script writes several inputs to the same output.
        with _open_text(self.inputfile) as source, \
                open(self.outputfile, "a") as output:
            for line in islice(source, 1, None, 2):
                trimmed_read = self.scanadapt(self.adaptmotifs, line.rstrip())
                if not self._passes_filters(trimmed_read):
                    continue
                read_count += 1
                self.fasta_in_write_output(read_count, trimmed_read, output)


def main(*argv):
    """Build a Clip from *argv* and run the handler matching its input."""
    instanceClip = Clip(*argv)
    if instanceClip.inputformat == "fasta":
        instanceClip.handle_io_fasta()
    else:
        instanceClip.handle_io_fastq()


if __name__ == "__main__":
    args = Parser()
    for inputfile in args.input:
        main(inputfile, args.output, args.output_format,
             args.adapter_to_clip, args.min, args.max, args.Nmode)
<!-- yac.xml: Galaxy tool wrapper for yac.py (adapter clipper). -->
<tool id="yac" name="Clip adapter" version="2.4.0">
    <description />
    <requirements>
        <requirement type="package" version="3.7.6">python</requirement>
    </requirements>
    <!-- Every substituted path or free-text value is single-quoted so that
         spaces and shell metacharacters cannot split or alter the command.
         yac.py writes to a temporary file which is then gzipped or moved
         to the output dataset depending on the selected output format. -->
    <command detect_errors="exit_code"><![CDATA[
        python '$__tool_directory__/yac.py'
        --input '$input'
        --output 'clip.tmp'
        --output_format
        #if $out_format == 'fasta' or $out_format == 'fastagz':
            'fasta'
        #else
            'fastq'
        #end if
        --adapter_to_clip '$clip_source.clip_sequence'
        --min $min
        --max $max
        --Nmode $Nmode &&
        #if ($out_format == 'fastagz') or ($out_format == 'fastqgz'):
            gzip -c 'clip.tmp' > '$output'
        #else
            mv 'clip.tmp' '$output'
        #end if
    ]]></command>
    <inputs>
        <param format="fasta,fastq" label="Source file" name="input" type="data" />
        <param label="min size" name="min" size="4" type="integer" value="15" />
        <param label="max size" name="max" size="4" type="integer" value="36" />
        <param label="Select output format" name="out_format" type="select"
               help="be careful not to select a fastq format for your output if your input has a fasta format">
            <option value="fasta">Fasta</option>
            <option value="fastq" selected="true" >Fastq (Sanger)</option>
            <option value="fastagz">gzipped Fasta</option>
            <option value="fastqgz">gzipped Fastq (Sanger)</option>
        </param>
        <param label="Accept reads containing N?" name="Nmode" type="select">
            <option selected="True" value="accept">accept</option>
            <option value="reject">reject</option>
        </param>
        <conditional name="clip_source">
            <param help="Built-in adapters or User-provided" label="Source" name="clip_source_list" type="select">
                <option selected="True" value="prebuilt">Use a built-in adapter (select from the list below)</option>
                <option value="user">Use custom sequence</option>
            </param>
            <when value="prebuilt">
                <param help="if your adapter is not listed, input your own sequence" label="Select Adapter to clip" name="clip_sequence" type="select">
                    <option value="TCGTATGCCGTCTTCTGCTTG">Solexa TCGTATGCCGTCTTCTGCTTG</option>
                    <option value="ATCTCGTATGCCGTCTTCTGCTT">Illumina ATCTCGTATGCCGTCTTCTGCTT</option>
                    <option selected="True" value="TGGAATTCTCGGGTGCCAAG">Illumina TruSeq TGGAATTCTCGGGTGCCAAG</option>
                    <option value="CTGTAGGCACCATCAATCGT">IdT CTGTAGGCACCATCAATCGT</option>
                </param>
            </when>
            <when value="user">
                <!-- NOTE(review): the default below is 6 nt long while the help
                     section asks for at least 7 nucleotides; confirm which
                     one is intended. -->
                <param label="Enter your Sequence" name="clip_sequence" size="35" type="text" value="GAATCC" />
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data format_source="input" metadata_source="input" name="output" label="Clipped ${input.name}-then-${out_format}">
            <change_format>
                <when input="out_format" value="fasta" format="fasta" />
                <when input="out_format" value="fastq" format="fastqsanger" />
                <when input="out_format" value="fastagz" format="fasta.gz" />
                <when input="out_format" value="fastqgz" format="fastqsanger.gz" />
            </change_format>
        </data>
    </outputs>
    <tests>
        <test>
            <param ftype="fastqsanger" name="input" value="yac.fastq" />
            <param name="min" value="18" />
            <param name="max" value="29" />
            <param name="clip_source_list" value="prebuilt" />
            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
            <param name="Nmode" value="accept" />
            <param name="out_format" value="fastq" />
            <output file="out.fastqsanger" name="output" />
        </test>
        <test>
            <param ftype="fastqsanger" name="input" value="yac.fastq" />
            <param name="min" value="18" />
            <param name="max" value="29" />
            <param name="clip_source_list" value="prebuilt" />
            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
            <param name="Nmode" value="accept" />
            <param name="out_format" value="fasta" />
            <output file="out.fasta" name="output" />
        </test>
        <test>
            <param ftype="fastqsanger.gz" name="input" value="yac.fastqsanger.gz" />
            <param name="min" value="18" />
            <param name="max" value="29" />
            <param name="clip_source_list" value="prebuilt" />
            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
            <param name="Nmode" value="accept" />
            <param name="out_format" value="fastqgz" />
            <output file="out.fastqsanger.gz" name="output" decompress="True" />
        </test>
        <test>
            <param ftype="fastqsanger.gz" name="input" value="yac.fastqsanger.gz" />
            <param name="min" value="18" />
            <param name="max" value="29" />
            <param name="clip_source_list" value="prebuilt" />
            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
            <param name="Nmode" value="accept" />
            <param name="out_format" value="fastagz" />
            <output file="out.fasta.gz" name="output" decompress="True" />
        </test>
        <test>
            <param ftype="fasta.gz" name="input" value="yac.fasta.gz" />
            <param name="min" value="18" />
            <param name="max" value="29" />
            <param name="clip_source_list" value="prebuilt" />
            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
            <param name="out_format" value="fasta" />
            <param name="Nmode" value="accept" />
            <output file="out.fasta" name="output" />
        </test>
        <test>
            <param ftype="fasta.gz" name="input" value="yac.fasta.gz" />
            <param name="min" value="18" />
            <param name="max" value="29" />
            <param name="clip_source_list" value="prebuilt" />
            <param name="clip_sequence" value="ATCTCGTATGCCGTCTTCTGCTT" />
            <param name="Nmode" value="accept" />
            <param name="out_format" value="fastagz" />
            <output file="out.fasta.gz" name="output" decompress="True" />
        </test>
        <test>
            <param ftype="fastqsanger.gz" name="input" value="yac_with_empty_reads.fastqsanger.gz" />
            <param name="min" value="18" />
            <param name="max" value="30" />
            <param name="clip_source_list" value="prebuilt" />
            <param name="clip_sequence" value="TGGAATTCTCGGGTGCCAAG" />
            <param name="Nmode" value="accept" />
            <param name="out_format" value="fastagz" />
            <output file="out_with_empty_reads.fasta.gz" name="output" decompress="True" />
        </test>
    </tests>
    <help>

**What it does**

+ Clips adapter sequences
+ Renumbers sequence headers
+ Filters sequences on their size
+ Filters sequences containing unknown nucleotides (optional)

-------

**Inputs**

1. A fastq or fasta file of reads to be clipped
2. Select the size of the reads to be kept
3. Select an output format. When the input is a fastq file, this may be fastq or fasta,
   whereas when the input is a fasta file, this may only be fasta.
4. Select whether or not you wish to keep clipped sequences with unknown nucleotides (N)
5. Select a pre-built adapter sequence or enter your own sequence (at least 7 nucleotides long)

-------

**Output**

A fastq or fasta file containing clipped sequences satisfying the selected criteria.

    </help>
    <citations />
</tool>
