Mercurial > repos > artbio > yac_clipper
comparison yac.py @ 0:1dde0b3d5f6a draft default tip
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/yac_clipper commit 9c5f0b8e89dfe4347c610f42923f0acad2ecc81b"
| author | artbio |
|---|---|
| date | Wed, 17 Mar 2021 22:08:17 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1dde0b3d5f6a |
|---|---|
| 1 #!/usr/bin/python | |
| 2 # yac = yet another clipper | |
| 3 # v 1.2.1 - 23-08-2014 - Support FastQ output | |
| 4 # v 1.1.0 - 23-08-2014 - argparse implementation | |
| 5 # Christophe Antoniewski <drosofff@gmail.com> | |
| 6 | |
| 7 import argparse | |
| 8 from itertools import islice | |
| 9 | |
| 10 | |
| 11 def Parser(): | |
| 12 the_parser = argparse.ArgumentParser() | |
| 13 the_parser.add_argument( | |
| 14 '--input', action="store", nargs='+', help="input fastq files") | |
| 15 the_parser.add_argument( | |
| 16 '--output', action="store", type=str, | |
| 17 help="output, clipped fasta file") | |
| 18 the_parser.add_argument( | |
| 19 '--output_format', action="store", type=str, | |
| 20 help="output format, fasta or fastq") | |
| 21 the_parser.add_argument( | |
| 22 '--adapter_to_clip', action="store", type=str, | |
| 23 help="adapter sequence to clip") | |
| 24 the_parser.add_argument( | |
| 25 '--min', action="store", type=int, | |
| 26 help="minimal size of clipped sequence to keep") | |
| 27 the_parser.add_argument( | |
| 28 '--max', action="store", type=int, | |
| 29 help="maximal size of clipped sequence to keep") | |
| 30 the_parser.add_argument('--Nmode', action="store", type=str, choices=[ | |
| 31 "accept", "reject"], | |
| 32 help="accept or reject Ns in clipped sequences") | |
| 33 args = the_parser.parse_args() | |
| 34 args.adapter_to_clip = args.adapter_to_clip.upper() | |
| 35 return args | |
| 36 | |
| 37 | |
| 38 class Clip: | |
| 39 | |
| 40 def __init__(self, inputfile, outputfile, output_format, | |
| 41 adapter, minsize, maxsize, Nmode): | |
| 42 self.inputfile = inputfile | |
| 43 self.outputfile = outputfile | |
| 44 self.output_format = output_format | |
| 45 self.adapter = adapter | |
| 46 self.minsize = int(minsize) | |
| 47 self.maxsize = int(maxsize) | |
| 48 self.Nmode = Nmode | |
| 49 for line in open(inputfile): | |
| 50 if line[0] == "@": | |
| 51 self.inputformat = "fastq" | |
| 52 break | |
| 53 elif line[0] == ">": | |
| 54 self.inputformat = "fasta" | |
| 55 | |
| 56 def motives(sequence): | |
| 57 ''' | |
| 58 return a list of motives for perfect (6nt) or | |
| 59 imperfect (7nt with one mismatch) search on import string module | |
| 60 ''' | |
| 61 sequencevariants = [ | |
| 62 sequence[0:6]] # initializes list with 6mer perfect match | |
| 63 dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"} | |
| 64 for pos in enumerate(sequence[:6]): | |
| 65 for subst in dicsubst[pos[1]]: | |
| 66 sequencevariants.append( | |
| 67 sequence[:pos[0]] + subst + sequence[pos[0] + 1:7]) | |
| 68 return sequencevariants | |
| 69 self.adaptmotifs = motives(self.adapter) | |
| 70 | |
| 71 def scanadapt(self, adaptmotives=[], sequence="", qscore=""): | |
| 72 '''scans sequence for adapter motives''' | |
| 73 match_position = sequence.rfind(adaptmotives[0]) | |
| 74 if qscore: | |
| 75 if match_position != -1: | |
| 76 return sequence[:match_position], qscore[:match_position] | |
| 77 for motif in adaptmotives[1:]: | |
| 78 match_position = sequence.rfind(motif) | |
| 79 if match_position != -1: | |
| 80 return sequence[:match_position], qscore[:match_position] | |
| 81 return sequence, qscore | |
| 82 else: | |
| 83 if match_position != -1: | |
| 84 return sequence[:match_position] | |
| 85 for motif in adaptmotives[1:]: | |
| 86 match_position = sequence.rfind(motif) | |
| 87 if match_position != -1: | |
| 88 return sequence[:match_position] | |
| 89 return sequence | |
| 90 | |
| 91 def write_output(self, id, read, qscore, output): | |
| 92 if self.output_format == "fasta": | |
| 93 block = ">{0}\n{1}\n".format(id, read) | |
| 94 else: | |
| 95 block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore) | |
| 96 output.write(block) | |
| 97 | |
| 98 def fasta_in_write_output(self, id, read, output): | |
| 99 output.write(">{0}\n{1}\n".format(id, read)) | |
| 100 | |
| 101 def handle_io_fastq(self): | |
| 102 '''Open input fastq file, pass read sequence and read qscore to | |
| 103 scanadapt function. Pass clipped read and qscore to output function.''' | |
| 104 id = 0 | |
| 105 output = open(self.outputfile, "a") | |
| 106 with open(self.inputfile, "r") as input: | |
| 107 block_gen = islice(input, 1, None, 2) | |
| 108 for i, line in enumerate(block_gen): | |
| 109 if i % 2: | |
| 110 qscore = line.rstrip() | |
| 111 else: | |
| 112 read = line.rstrip() | |
| 113 continue | |
| 114 try: | |
| 115 trimmed_read, trimmed_qscore = self.scanadapt( | |
| 116 self.adaptmotifs, read, qscore) | |
| 117 except ValueError: | |
| 118 continue | |
| 119 if self.minsize <= len(trimmed_read) <= self.maxsize: | |
| 120 if (self.Nmode == "reject") and ("N" in trimmed_read): | |
| 121 continue | |
| 122 id += 1 | |
| 123 self.write_output(id, trimmed_read, trimmed_qscore, output) | |
| 124 output.close() | |
| 125 | |
| 126 def handle_io_fasta(self): | |
| 127 '''Open input fasta file, pass header and read sequence to scanadapt | |
| 128 function. Pass clipped read and qscore to output function.''' | |
| 129 id = 0 | |
| 130 output = open(self.outputfile, "a") | |
| 131 with open(self.inputfile, "r") as input: | |
| 132 block_gen = islice(input, 1, None, 2) | |
| 133 for i, line in enumerate(block_gen): | |
| 134 read = line.rstrip() | |
| 135 trimmed_read = self.scanadapt(self.adaptmotifs, read) | |
| 136 if self.minsize <= len(trimmed_read) <= self.maxsize: | |
| 137 if (self.Nmode == "reject") and ("N" in trimmed_read): | |
| 138 continue | |
| 139 id += 1 | |
| 140 self.fasta_in_write_output(id, trimmed_read, output) | |
| 141 output.close() | |
| 142 | |
| 143 | |
| 144 def main(*argv): | |
| 145 instanceClip = Clip(*argv) | |
| 146 if instanceClip.inputformat == "fasta": | |
| 147 instanceClip.handle_io_fasta() | |
| 148 else: | |
| 149 instanceClip.handle_io_fastq() | |
| 150 | |
| 151 | |
| 152 if __name__ == "__main__": | |
| 153 args = Parser() | |
| 154 for inputfile in args.input: | |
| 155 main(inputfile, args.output, args.output_format, | |
| 156 args.adapter_to_clip, args.min, args.max, args.Nmode) |
