annotate utils/fastqreader.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
1 #!/usr/bin/env python
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
2 import os
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
3 __version__ = '0.5'
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
4
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
5
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
6 class FastqReadGenerator(object):
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
7 """A simple FASTQ parser that returns reads one at a time.
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
8 Handles multi-line read/quality values.
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
9 Usage:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
10 fastq = FastqReadGenerator('/home/user/sequence.fq')
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
11 for read in fastq:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
12 print "There is a read with this identifier: "+read.id
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
13 print "(Its full name is "+read.name+".)"
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
14 print "Its sequence is: "+read.seq
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
15 print "Its quality is: "+read.qual
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
16 All values (id, name, seq, qual) are whitespace-stripped.
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
17 """
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
18
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
19 def __init__(self, filepath):
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
20 if not os.path.isfile(filepath):
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
21 raise IOError('File not found: "'+filepath+'"')
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
22 self.filepath = filepath
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
23 self.name = None
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
24 self.id = None
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
25
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
26 def __iter__(self):
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
27 return self.reads()
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
28
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
29 def reads(self):
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
30 with open(self.filepath, 'rU') as filehandle:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
31 read = None
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
32 line_type = 'first'
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
33 for line_raw in filehandle:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
34 line = line_raw.strip()
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
35 if not line:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
36 continue # allow empty lines
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
37 # Determine what kind of line we're in
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
38 if line.startswith('@'):
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
39 if line_type == 'first':
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
40 line_type = 'name'
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
41 elif line_type == 'plus':
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
42 line_type = 'qual'
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
43 elif line_type == 'qual':
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
44 # Determine if it's another qual line or a name line.
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
45 # If the quality scores observed so far already cover the whole read, we've seen all
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
46 # the quality information already. It should be a name line.
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
47 if len(read.qual) >= len(read.seq):
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
48 line_type = 'name'
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
49 else:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
50 line_type = 'qual'
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
51 else:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
52 raise FormatError('"@" starts line in wrong context:\n'+line_raw)
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
53 elif line.startswith('+'):
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
54 if line_type == 'seq':
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
55 line_type = 'plus'
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
56 elif line_type == 'qual':
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
57 pass
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
58 else:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
59 raise FormatError('"+" starts line in wrong context:\n'+line_raw)
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
60 elif line_type == 'name':
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
61 line_type = 'seq'
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
62 elif line_type == 'plus':
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
63 line_type = 'qual'
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
64 elif line_type == 'first':
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
65 raise FormatError('First line must start with a "@":\n'+line_raw)
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
66 else:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
67 raise FormatError('Invalid parser state: line_type "{}", first char "{}":\n{}'
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
68 .format(line_type, line[0], line_raw))
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
69 if line_type == 'name':
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
70 # Return the previous read.
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
71 if read is not None:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
72 yield read
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
73 read = Read()
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
74 read.name = line[1:] # remove ">"
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
75 if read.name:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
76 read.id = read.name.split()[0]
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
77 else:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
78 read.id = ''
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
79 elif line_type == 'seq':
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
80 read.seq += line
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
81 elif line_type == 'qual':
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
82 read.qual += line
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
83 # Return the last read.
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
84 if read is not None:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
85 yield read
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
86
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
87
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
88 class Read(object):
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
89 def __init__(self):
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
90 self.seq = ''
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
91 self.qual = ''
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
92 self.id = ''
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
93 self.name = ''
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
94
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
95
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
96 class FormatError(Exception):
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
97 def __init__(self, message=None):
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
98 if message:
e4d75f9efb90 planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff changeset
99 Exception.__init__(self, message)