comparison utils/fastqreader.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
comparison
equal deleted inserted replaced
17:836fa4fe9494 18:e4d75f9efb90
1 #!/usr/bin/env python
2 import os
3 __version__ = '0.5'
4
5
class FastqReadGenerator(object):
    """A simple FASTQ parser that yields reads one at a time.

    Sequence and quality values may be wrapped over several lines. Blank
    lines are skipped, and every line is stripped of surrounding whitespace
    before it is interpreted.

    Usage:
        fastq = FastqReadGenerator('/home/user/sequence.fq')
        for read in fastq:
            print("There is a read with this identifier: "+read.id)
            print("(Its full name is "+read.name+".)")
            print("Its sequence is: "+read.seq)
            print("Its quality is: "+read.qual)
    """

    def __init__(self, filepath):
        """Remember which file to parse.

        Raises:
            IOError: if `filepath` is not an existing regular file.
        """
        if not os.path.isfile(filepath):
            raise IOError('File not found: "'+filepath+'"')
        self.filepath = filepath
        self.name = None
        self.id = None

    def __iter__(self):
        """Iterate over the reads in the file (same as calling reads())."""
        return self.reads()

    def reads(self):
        """Generate one Read object per record in the file.

        Each Read has `name` (the header line without its leading "@"), `id`
        (the first whitespace-delimited token of `name`, or '' if the name is
        empty), `seq` and `qual` (the concatenation of their stripped lines).

        Raises:
            FormatError: if a line appears where the FASTQ layout does not
                allow it.
        """
        import sys
        # The 'U' mode flag was removed in Python 3.11; Python 3 text mode
        # already uses universal newlines. Python 2 still needs the flag.
        mode = 'rU' if sys.version_info[0] < 3 else 'r'
        read = None
        with open(self.filepath, mode) as filehandle:
            line_type = 'first'
            for line_raw in filehandle:
                line = line_raw.strip()
                if not line:
                    continue  # allow empty lines
                line_type = self._next_line_type(line_type, line, line_raw, read)
                if line_type == 'name':
                    # A new header means the previous read is complete.
                    if read is not None:
                        yield read
                    read = Read()
                    read.name = line[1:]  # remove the leading "@"
                    read.id = read.name.split()[0] if read.name else ''
                elif line_type == 'seq':
                    read.seq += line
                elif line_type == 'qual':
                    read.qual += line
        # Return the last read.
        if read is not None:
            yield read

    @staticmethod
    def _next_line_type(line_type, line, line_raw, read):
        """Classify `line` given the type of the previous non-blank line.

        Returns one of 'name', 'seq', 'plus' or 'qual'. `read` is the record
        currently being built (None before the first header); its seq/qual
        lengths decide whether more quality lines are still expected.
        """
        if line.startswith('@'):
            if line_type == 'first':
                return 'name'
            if line_type == 'plus':
                return 'qual'
            if line_type == 'qual':
                # "@" is also a valid quality character. If the quality
                # scores seen so far already cover the whole sequence, this
                # must be the next header; otherwise it is more quality data.
                if len(read.qual) >= len(read.seq):
                    return 'name'
                return 'qual'
            raise FormatError('"@" starts line in wrong context:\n'+line_raw)
        if line.startswith('+'):
            if line_type == 'seq':
                return 'plus'
            # "+" is a valid quality character too, so right after the
            # separator line or inside the quality block it is quality data.
            if line_type in ('plus', 'qual'):
                return 'qual'
            raise FormatError('"+" starts line in wrong context:\n'+line_raw)
        if line_type in ('name', 'seq'):
            # First sequence line, or a continuation of a wrapped sequence.
            return 'seq'
        if line_type == 'plus':
            return 'qual'
        if line_type == 'qual' and len(read.qual) < len(read.seq):
            # Continuation of a wrapped quality string.
            return 'qual'
        if line_type == 'first':
            raise FormatError('First line must start with a "@":\n'+line_raw)
        raise FormatError('Invalid parser state: line_type "{}", first char "{}":\n{}'
                          .format(line_type, line[0], line_raw))
86
87
class Read(object):
    """Container for one FASTQ record; every field starts as an empty string."""

    def __init__(self):
        # seq:  sequence text
        # qual: quality text
        # id:   identifier
        # name: full name line
        # Chained assignment binds the targets left to right.
        self.seq = self.qual = self.id = self.name = ''
94
95
class FormatError(Exception):
    """Exception raised when input lines do not follow the expected format."""

    def __init__(self, message=None):
        # A missing or empty message is not forwarded to the base initializer.
        if not message:
            return
        super(FormatError, self).__init__(message)