Mercurial > repos > nick > duplex
comparison utils/fastqreader.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
| author | nick |
|---|---|
| date | Thu, 02 Feb 2017 18:44:31 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:836fa4fe9494 | 18:e4d75f9efb90 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import os | |
| 3 __version__ = '0.5' | |
| 4 | |
| 5 | |
class FastqReadGenerator(object):
    """A simple FASTQ parser that yields reads one at a time.

    Handles multi-line sequence and quality values and skips blank lines.

    Usage:
        fastq = FastqReadGenerator('/home/user/sequence.fq')
        for read in fastq:
            print("There is a read with this identifier: "+read.id)
            print("(Its full name is "+read.name+".)")
            print("Its sequence is: "+read.seq)
            print("Its quality is: "+read.qual)

    All values (id, name, seq, qual) are whitespace-stripped.
    """

    def __init__(self, filepath):
        """Store the path of the FASTQ file to parse.

        Raises:
            IOError: if `filepath` is not an existing regular file.
        """
        if not os.path.isfile(filepath):
            raise IOError('File not found: "'+filepath+'"')
        self.filepath = filepath
        # Not read by the parser itself; kept so that existing code accessing
        # these attributes keeps working.
        self.name = None
        self.id = None

    def __iter__(self):
        """Return a fresh generator over the reads in the file."""
        return self.reads()

    def reads(self):
        """Parse the file and yield one Read object per FASTQ record.

        The parser is a small state machine. `line_type` holds the kind of the
        line being processed: 'first' (nothing seen yet), 'name', 'seq',
        'plus' or 'qual'. A read is yielded once the name line of the next
        read (or the end of the file) is reached.

        Raises:
            FormatError: if a line appears where the format does not allow it.
        """
        import sys
        # The 'U' flag was removed in Python 3.11 (text mode already translates
        # newlines on Python 3); Python 2 still needs it for universal newlines.
        mode = 'r' if sys.version_info[0] >= 3 else 'rU'
        with open(self.filepath, mode) as filehandle:
            read = None
            line_type = 'first'
            for line_raw in filehandle:
                line = line_raw.strip()
                if not line:
                    continue  # allow empty lines
                # Determine what kind of line we're in.
                if line.startswith('@'):
                    if line_type == 'first':
                        line_type = 'name'
                    elif line_type == 'plus':
                        # "@" is a valid quality character, so the first
                        # quality line may start with it.
                        line_type = 'qual'
                    elif line_type == 'qual':
                        # Either another quality line or the next name line.
                        # If the quality scores seen so far already cover the
                        # whole sequence, it must be a name line.
                        if len(read.qual) >= len(read.seq):
                            line_type = 'name'
                        else:
                            line_type = 'qual'
                    else:
                        raise FormatError('"@" starts line in wrong context:\n'+line_raw)
                elif line.startswith('+'):
                    if line_type == 'seq':
                        line_type = 'plus'
                    elif line_type == 'plus':
                        # "+" is a valid quality character too, so the quality
                        # line directly after the separator may start with it.
                        line_type = 'qual'
                    elif line_type == 'qual':
                        pass  # continuation of a multi-line quality string
                    else:
                        raise FormatError('"+" starts line in wrong context:\n'+line_raw)
                elif line_type == 'name':
                    line_type = 'seq'
                elif line_type == 'seq':
                    pass  # continuation of a multi-line sequence
                elif line_type == 'plus':
                    line_type = 'qual'
                elif line_type == 'qual' and len(read.qual) < len(read.seq):
                    pass  # continuation of a multi-line quality string
                elif line_type == 'first':
                    raise FormatError('First line must start with a "@":\n'+line_raw)
                else:
                    raise FormatError('Invalid parser state: line_type "{}", first char "{}":\n{}'
                                      .format(line_type, line[0], line_raw))
                # Act on the line now that its type is known.
                if line_type == 'name':
                    # Return the previous read.
                    if read is not None:
                        yield read
                    read = Read()
                    read.name = line[1:]  # remove the leading "@"
                    if read.name:
                        read.id = read.name.split()[0]
                    else:
                        read.id = ''
                elif line_type == 'seq':
                    read.seq += line
                elif line_type == 'qual':
                    read.qual += line
            # Return the last read.
            if read is not None:
                yield read
| 86 | |
| 87 | |
class Read(object):
    """A single FASTQ record.

    Every attribute starts as an empty string and is meant to be filled in by
    whoever creates the object:
        seq:  the sequence
        qual: the quality string
        id:   the read identifier
        name: the full read name
    """

    def __init__(self):
        self.seq = ''
        self.qual = ''
        self.id = ''
        self.name = ''

    def __repr__(self):
        """Return a debugging representation showing all four fields."""
        return '{}(id={!r}, name={!r}, seq={!r}, qual={!r})'.format(
            type(self).__name__, self.id, self.name, self.seq, self.qual
        )
| 94 | |
| 95 | |
class FormatError(Exception):
    """Raised when input does not follow the expected file format."""

    def __init__(self, message=None):
        """Create the error with an optional human-readable message.

        The base class is always initialised, and the message (or None) is
        also stored on the instance as `message`.
        """
        if message is None:
            Exception.__init__(self)
        else:
            Exception.__init__(self, message)
        self.message = message
