Mercurial > repos > nick > duplex
view utils/fastqreader.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python
"""A small, dependency-free FASTQ parser that yields one read at a time."""
import os
import sys

__version__ = '0.5'

# Python 2 needs the 'U' flag to get universal-newline translation. Python 3 text mode does
# that translation by default, and the 'U' flag was removed in Python 3.11 (ValueError).
_READ_MODE = 'rU' if sys.version_info[0] < 3 else 'r'


class FastqReadGenerator(object):
    """A simple FASTQ parser that returns reads one at a time.

    Handles multi-line read/quality values.

    Usage:
        fastq = FastqReadGenerator('/home/user/sequence.fq')
        for read in fastq:
            print("There is a read with this identifier: "+read.id)
            print("(Its full name is "+read.name+".)")
            print("Its sequence is: "+read.seq)
            print("Its quality is: "+read.qual)

    Every input line is whitespace-stripped before it is parsed, and empty lines are skipped.
    """

    def __init__(self, filepath):
        """Remember which file to parse.

        Raises:
            IOError: if `filepath` does not refer to an existing regular file.
        """
        if not os.path.isfile(filepath):
            raise IOError('File not found: "'+filepath+'"')
        self.filepath = filepath
        # Kept for backward compatibility; the parser itself never sets these.
        self.name = None
        self.id = None

    def __iter__(self):
        """Start a fresh pass over the file; each call re-opens it from the beginning."""
        return self.reads()

    def reads(self):
        """Generate a `Read` for every record in the file, in file order.

        Raises:
            FormatError: when a line appears in a context where it cannot be interpreted
                (e.g. the file doesn't start with "@", or there is more quality data than
                sequence).
        """
        with open(self.filepath, _READ_MODE) as filehandle:
            read = None
            line_type = 'first'
            for line_raw in filehandle:
                line = line_raw.strip()
                if not line:
                    continue  # allow empty lines
                line_type = _next_line_type(line_type, line, line_raw, read)
                if line_type == 'name':
                    # A new header means the previous read is complete: return it.
                    if read is not None:
                        yield read
                    read = Read()
                    read.name = line[1:]  # remove "@"
                    if read.name:
                        read.id = read.name.split()[0]
                    else:
                        read.id = ''
                elif line_type == 'seq':
                    read.seq += line
                elif line_type == 'qual':
                    read.qual += line
                # 'plus' lines carry no data that we keep.
            # Return the last read.
            if read is not None:
                yield read


def _next_line_type(line_type, line, line_raw, read):
    """Classify `line`, given the type of the previous non-empty line.

    Args:
        line_type: type of the previous line: 'first' (nothing seen yet), 'name', 'seq',
            'plus', or 'qual'.
        line: the current line, whitespace-stripped and non-empty.
        line_raw: the current line as read from the file (only used in error messages).
        read: the `Read` being built, or None before the first header.

    Returns:
        The type of the current line: 'name', 'seq', 'plus', or 'qual'.

    Raises:
        FormatError: if the line cannot legally follow a line of type `line_type`.
    """
    if line.startswith('@'):
        if line_type == 'first':
            return 'name'
        if line_type == 'plus':
            # "@" is a valid quality character, so the first quality line may start with it.
            return 'qual'
        if line_type == 'qual':
            # Determine if it's another qual line or a name line.
            # If the quality scores observed so far already cover the whole read, we've seen
            # all the quality information already. It should be a name line.
            if len(read.qual) >= len(read.seq):
                return 'name'
            return 'qual'
        raise FormatError('"@" starts line in wrong context:\n'+line_raw)
    if line.startswith('+'):
        if line_type == 'seq':
            return 'plus'
        if line_type == 'plus' or line_type == 'qual':
            # "+" is a valid quality character too, so a quality line (including the very
            # first one after the separator) may start with it.
            return 'qual'
        raise FormatError('"+" starts line in wrong context:\n'+line_raw)
    # Ordinary line: neither "@" nor "+" at the start.
    if line_type == 'name' or line_type == 'seq':
        # First sequence line, or a continuation of a multi-line sequence.
        return 'seq'
    if line_type == 'plus':
        return 'qual'
    if line_type == 'qual' and len(read.qual) < len(read.seq):
        # Continuation of a multi-line quality string that doesn't cover the read yet.
        return 'qual'
    if line_type == 'first':
        raise FormatError('First line must start with a "@":\n'+line_raw)
    # Reached when a quality line arrives after the scores already cover the whole read.
    raise FormatError('Invalid parser state: line_type "{}", first char "{}":\n{}'
                      .format(line_type, line[0], line_raw))


class Read(object):
    """One FASTQ record; every field starts as an empty string."""

    def __init__(self):
        self.seq = ''   # bases, with multi-line sequences concatenated
        self.qual = ''  # quality characters, with multi-line values concatenated
        self.id = ''    # first whitespace-delimited word of `name`
        self.name = ''  # header line without the leading "@"


class FormatError(Exception):
    """Raised when the input does not follow the FASTQ line structure."""

    def __init__(self, message=None):
        if message:
            Exception.__init__(self, message)