Mercurial > repos > nick > duplex
comparison utils/fastqreader.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:836fa4fe9494 | 18:e4d75f9efb90 |
---|---|
1 #!/usr/bin/env python | |
2 import os | |
3 __version__ = '0.5' | |
4 | |
5 | |
class FastqReadGenerator(object):
    """A simple FASTQ parser that returns reads one at a time.

    Handles multi-line (wrapped) sequence and quality values.

    Usage:
        fastq = FastqReadGenerator('/home/user/sequence.fq')
        for read in fastq:
            print("There is a read with this identifier: "+read.id)
            print("(Its full name is "+read.name+".)")
            print("Its sequence is: "+read.seq)
            print("Its quality is: "+read.qual)

    All values (id, name, seq, qual) are whitespace-stripped.
    """

    def __init__(self, filepath):
        """Store the path of the FASTQ file to parse.

        Args:
            filepath: Path to an existing FASTQ file.

        Raises:
            IOError: If `filepath` does not point to an existing regular file.
        """
        if not os.path.isfile(filepath):
            raise IOError('File not found: "'+filepath+'"')
        self.filepath = filepath
        # Not read by the parser itself; kept so existing attribute access keeps working.
        self.name = None
        self.id = None

    def __iter__(self):
        """Iterate over the reads in the file (same as calling `reads()`)."""
        return self.reads()

    def reads(self):
        """Parse the file and yield one `Read` per FASTQ record.

        Empty (whitespace-only) lines are skipped. A read is yielded once the
        name line of the following record is seen, or at end of file.

        Yields:
            Read: with `name`, `id`, `seq` and `qual` filled in.

        Raises:
            FormatError: If a line appears in a context where it cannot be valid.
        """
        import sys
        # The "U" flag only exists on Python 2 (it was removed in Python 3.11);
        # Python 3 text mode already translates all newline styles by default.
        mode = 'rU' if sys.version_info[0] < 3 else 'r'
        with open(self.filepath, mode) as filehandle:
            read = None
            line_type = 'first'
            for line_raw in filehandle:
                line = line_raw.strip()
                if not line:
                    continue  # allow empty lines
                line_type = self._next_line_type(line_type, line, line_raw, read)
                if line_type == 'name':
                    # A new record starts: return the previous read.
                    if read is not None:
                        yield read
                    read = Read()
                    read.name = line[1:]  # remove "@"
                    if read.name:
                        read.id = read.name.split()[0]
                    else:
                        read.id = ''
                elif line_type == 'seq':
                    read.seq += line
                elif line_type == 'qual':
                    read.qual += line
            # Return the last read.
            if read is not None:
                yield read

    @staticmethod
    def _next_line_type(line_type, line, line_raw, read):
        """Return the type of `line`, given the type of the previous line.

        Line types are 'first' (nothing parsed yet), 'name', 'seq', 'plus'
        and 'qual'. `read` is the record being built; it is only consulted
        while in the 'qual' state, to compare quality and sequence lengths.
        """
        if line.startswith('@'):
            if line_type == 'first':
                return 'name'
            if line_type == 'plus':
                # "@" is a valid quality character.
                return 'qual'
            if line_type == 'qual':
                # If the quality scores observed so far already cover the whole
                # read, this must be the name line of the next record.
                if len(read.qual) >= len(read.seq):
                    return 'name'
                return 'qual'
            raise FormatError('"@" starts line in wrong context:\n'+line_raw)
        if line.startswith('+'):
            if line_type == 'seq':
                return 'plus'
            if line_type in ('plus', 'qual'):
                # "+" is a valid quality character, so right after the separator
                # line (or within the quality block) this is quality data.
                return 'qual'
            raise FormatError('"+" starts line in wrong context:\n'+line_raw)
        if line_type in ('name', 'seq'):
            # First sequence line, or a continuation of a wrapped sequence.
            return 'seq'
        if line_type == 'plus':
            return 'qual'
        if line_type == 'qual' and len(read.qual) < len(read.seq):
            # Continuation of a wrapped quality string.
            return 'qual'
        if line_type == 'first':
            raise FormatError('First line must start with a "@":\n'+line_raw)
        raise FormatError('Invalid parser state: line_type "{}", first char "{}":\n{}'
                          .format(line_type, line[0], line_raw))
86 | |
87 | |
class Read(object):
    """One FASTQ record; every field starts out as an empty string."""

    def __init__(self):
        # name: header line without its first character; id: first
        # whitespace-delimited token of the name; seq/qual: the sequence and
        # quality strings, which the parser builds up by concatenation.
        self.seq = self.qual = self.id = self.name = ''
94 | |
95 | |
class FormatError(Exception):
    """Raised when the input does not follow the expected FASTQ layout."""

    def __init__(self, message=None):
        # With no (or an empty) message there is nothing to pass on to the
        # base class, so its initializer is only invoked for a real message.
        if not message:
            return
        super(FormatError, self).__init__(message)