Mercurial > repos > nick > duplex
annotate utils/fastqreader.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
rev | line source |
---|---|
18
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
1 #!/usr/bin/env python |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
2 import os |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
3 __version__ = '0.5' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
4 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
5 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
6 class FastqReadGenerator(object): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
7 """A simple FASTQ parser that returns reads one at a time. |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
8 Handles multi-line read/quality values. |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
9 Usage: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
10 fastq = FastqReadGenerator('/home/user/sequence.fq') |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
11 for read in fastq: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
12 print "There is a read with this identifier: "+read.id |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
13 print "(Its full name is "+read.name+".)" |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
14 print "Its sequence is: "+read.seq |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
15 print "Its quality is: "+read.qual |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
16 All values (id, name, seq, qual) are whitespace-stripped. |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
17 """ |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
18 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
19 def __init__(self, filepath): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
20 if not os.path.isfile(filepath): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
21 raise IOError('File not found: "'+filepath+'"') |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
22 self.filepath = filepath |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
23 self.name = None |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
24 self.id = None |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
25 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
26 def __iter__(self): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
27 return self.reads() |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
28 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
29 def reads(self): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
30 with open(self.filepath, 'rU') as filehandle: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
31 read = None |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
32 line_type = 'first' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
33 for line_raw in filehandle: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
34 line = line_raw.strip() |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
35 if not line: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
36 continue # allow empty lines |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
37 # Determine what kind of line we're in |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
38 if line.startswith('@'): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
39 if line_type == 'first': |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
40 line_type = 'name' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
41 elif line_type == 'plus': |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
42 line_type = 'qual' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
43 elif line_type == 'qual': |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
44 # Determine if it's another qual line or a name line. |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
45 # If the quality scores observed so far already cover the whole read, we've seen all |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
46 # the quality information already. It should be a name line. |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
47 if len(read.qual) >= len(read.seq): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
48 line_type = 'name' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
49 else: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
50 line_type = 'qual' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
51 else: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
52 raise FormatError('"@" starts line in wrong context:\n'+line_raw) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
53 elif line.startswith('+'): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
54 if line_type == 'seq': |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
55 line_type = 'plus' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
56 elif line_type == 'qual': |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
57 pass |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
58 else: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
59 raise FormatError('"+" starts line in wrong context:\n'+line_raw) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
60 elif line_type == 'name': |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
61 line_type = 'seq' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
62 elif line_type == 'plus': |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
63 line_type = 'qual' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
64 elif line_type == 'first': |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
65 raise FormatError('First line must start with a "@":\n'+line_raw) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
66 else: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
67 raise FormatError('Invalid parser state: line_type "{}", first char "{}":\n{}' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
68 .format(line_type, line[0], line_raw)) |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
69 if line_type == 'name': |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
70 # Return the previous read. |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
71 if read is not None: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
72 yield read |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
73 read = Read() |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
74 read.name = line[1:] # remove ">" |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
75 if read.name: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
76 read.id = read.name.split()[0] |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
77 else: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
78 read.id = '' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
79 elif line_type == 'seq': |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
80 read.seq += line |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
81 elif line_type == 'qual': |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
82 read.qual += line |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
83 # Return the last read. |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
84 if read is not None: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
85 yield read |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
86 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
87 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
88 class Read(object): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
89 def __init__(self): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
90 self.seq = '' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
91 self.qual = '' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
92 self.id = '' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
93 self.name = '' |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
94 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
95 |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
96 class FormatError(Exception): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
97 def __init__(self, message=None): |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
98 if message: |
e4d75f9efb90
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
nick
parents:
diff
changeset
|
99 Exception.__init__(self, message) |