comparison yac.py @ 0:1dde0b3d5f6a draft default tip

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/yac_clipper commit 9c5f0b8e89dfe4347c610f42923f0acad2ecc81b"
author artbio
date Wed, 17 Mar 2021 22:08:17 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1dde0b3d5f6a
1 #!/usr/bin/python
2 # yac = yet another clipper
3 # v 1.2.1 - 23-08-2014 - Support FastQ output
4 # v 1.1.0 - 23-08-2014 - argparse implementation
5 # Christophe Antoniewski <drosofff@gmail.com>
6
7 import argparse
8 from itertools import islice
9
10
11 def Parser():
12 the_parser = argparse.ArgumentParser()
13 the_parser.add_argument(
14 '--input', action="store", nargs='+', help="input fastq files")
15 the_parser.add_argument(
16 '--output', action="store", type=str,
17 help="output, clipped fasta file")
18 the_parser.add_argument(
19 '--output_format', action="store", type=str,
20 help="output format, fasta or fastq")
21 the_parser.add_argument(
22 '--adapter_to_clip', action="store", type=str,
23 help="adapter sequence to clip")
24 the_parser.add_argument(
25 '--min', action="store", type=int,
26 help="minimal size of clipped sequence to keep")
27 the_parser.add_argument(
28 '--max', action="store", type=int,
29 help="maximal size of clipped sequence to keep")
30 the_parser.add_argument('--Nmode', action="store", type=str, choices=[
31 "accept", "reject"],
32 help="accept or reject Ns in clipped sequences")
33 args = the_parser.parse_args()
34 args.adapter_to_clip = args.adapter_to_clip.upper()
35 return args
36
37
38 class Clip:
39
40 def __init__(self, inputfile, outputfile, output_format,
41 adapter, minsize, maxsize, Nmode):
42 self.inputfile = inputfile
43 self.outputfile = outputfile
44 self.output_format = output_format
45 self.adapter = adapter
46 self.minsize = int(minsize)
47 self.maxsize = int(maxsize)
48 self.Nmode = Nmode
49 for line in open(inputfile):
50 if line[0] == "@":
51 self.inputformat = "fastq"
52 break
53 elif line[0] == ">":
54 self.inputformat = "fasta"
55
56 def motives(sequence):
57 '''
58 return a list of motives for perfect (6nt) or
59 imperfect (7nt with one mismatch) search on import string module
60 '''
61 sequencevariants = [
62 sequence[0:6]] # initializes list with 6mer perfect match
63 dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"}
64 for pos in enumerate(sequence[:6]):
65 for subst in dicsubst[pos[1]]:
66 sequencevariants.append(
67 sequence[:pos[0]] + subst + sequence[pos[0] + 1:7])
68 return sequencevariants
69 self.adaptmotifs = motives(self.adapter)
70
71 def scanadapt(self, adaptmotives=[], sequence="", qscore=""):
72 '''scans sequence for adapter motives'''
73 match_position = sequence.rfind(adaptmotives[0])
74 if qscore:
75 if match_position != -1:
76 return sequence[:match_position], qscore[:match_position]
77 for motif in adaptmotives[1:]:
78 match_position = sequence.rfind(motif)
79 if match_position != -1:
80 return sequence[:match_position], qscore[:match_position]
81 return sequence, qscore
82 else:
83 if match_position != -1:
84 return sequence[:match_position]
85 for motif in adaptmotives[1:]:
86 match_position = sequence.rfind(motif)
87 if match_position != -1:
88 return sequence[:match_position]
89 return sequence
90
91 def write_output(self, id, read, qscore, output):
92 if self.output_format == "fasta":
93 block = ">{0}\n{1}\n".format(id, read)
94 else:
95 block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore)
96 output.write(block)
97
98 def fasta_in_write_output(self, id, read, output):
99 output.write(">{0}\n{1}\n".format(id, read))
100
101 def handle_io_fastq(self):
102 '''Open input fastq file, pass read sequence and read qscore to
103 scanadapt function. Pass clipped read and qscore to output function.'''
104 id = 0
105 output = open(self.outputfile, "a")
106 with open(self.inputfile, "r") as input:
107 block_gen = islice(input, 1, None, 2)
108 for i, line in enumerate(block_gen):
109 if i % 2:
110 qscore = line.rstrip()
111 else:
112 read = line.rstrip()
113 continue
114 try:
115 trimmed_read, trimmed_qscore = self.scanadapt(
116 self.adaptmotifs, read, qscore)
117 except ValueError:
118 continue
119 if self.minsize <= len(trimmed_read) <= self.maxsize:
120 if (self.Nmode == "reject") and ("N" in trimmed_read):
121 continue
122 id += 1
123 self.write_output(id, trimmed_read, trimmed_qscore, output)
124 output.close()
125
126 def handle_io_fasta(self):
127 '''Open input fasta file, pass header and read sequence to scanadapt
128 function. Pass clipped read and qscore to output function.'''
129 id = 0
130 output = open(self.outputfile, "a")
131 with open(self.inputfile, "r") as input:
132 block_gen = islice(input, 1, None, 2)
133 for i, line in enumerate(block_gen):
134 read = line.rstrip()
135 trimmed_read = self.scanadapt(self.adaptmotifs, read)
136 if self.minsize <= len(trimmed_read) <= self.maxsize:
137 if (self.Nmode == "reject") and ("N" in trimmed_read):
138 continue
139 id += 1
140 self.fasta_in_write_output(id, trimmed_read, output)
141 output.close()
142
143
144 def main(*argv):
145 instanceClip = Clip(*argv)
146 if instanceClip.inputformat == "fasta":
147 instanceClip.handle_io_fasta()
148 else:
149 instanceClip.handle_io_fastq()
150
151
152 if __name__ == "__main__":
153 args = Parser()
154 for inputfile in args.input:
155 main(inputfile, args.output, args.output_format,
156 args.adapter_to_clip, args.min, args.max, args.Nmode)