Mercurial > repos > nick > duplex
diff utils/fastqreader.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/fastqreader.py Thu Feb 02 18:44:31 2017 -0500 @@ -0,0 +1,99 @@ +#!/usr/bin/env python +import os +__version__ = '0.5' + + +class FastqReadGenerator(object): + """A simple FASTQ parser that returns reads one at a time. + Handles multi-line read/quality values. + Usage: + fastq = FastqReadGenerator('/home/user/sequence.fq') + for read in fastq: + print "There is a read with this identifier: "+read.id + print "(Its full name is "+read.name+".)" + print "Its sequence is: "+read.seq + print "Its quality is: "+read.qual + All values (id, name, seq, qual) are whitespace-stripped. + """ + + def __init__(self, filepath): + if not os.path.isfile(filepath): + raise IOError('File not found: "'+filepath+'"') + self.filepath = filepath + self.name = None + self.id = None + + def __iter__(self): + return self.reads() + + def reads(self): + with open(self.filepath, 'rU') as filehandle: + read = None + line_type = 'first' + for line_raw in filehandle: + line = line_raw.strip() + if not line: + continue # allow empty lines + # Determine what kind of line we're in + if line.startswith('@'): + if line_type == 'first': + line_type = 'name' + elif line_type == 'plus': + line_type = 'qual' + elif line_type == 'qual': + # Determine if it's another qual line or a name line. + # If the quality scores observed so far already cover the whole read, we've seen all + # the quality information already. It should be a name line. + if len(read.qual) >= len(read.seq): + line_type = 'name' + else: + line_type = 'qual' + else: + raise FormatError('"@" starts line in wrong context:\n'+line_raw) + elif line.startswith('+'): + if line_type == 'seq': + line_type = 'plus' + elif line_type == 'qual': + pass + else: + raise FormatError('"+" starts line in wrong context:\n'+line_raw) + elif line_type == 'name': + line_type = 'seq' + elif line_type == 'plus': + line_type = 'qual' + elif line_type == 'first': + raise FormatError('First line must start with a "@":\n'+line_raw) + else: + raise FormatError('Invalid parser state: line_type "{}", first char "{}":\n{}' + .format(line_type, line[0], line_raw)) + if line_type == 'name': + # Return the previous read. + if read is not None: + yield read + read = Read() + read.name = line[1:] # remove ">" + if read.name: + read.id = read.name.split()[0] + else: + read.id = '' + elif line_type == 'seq': + read.seq += line + elif line_type == 'qual': + read.qual += line + # Return the last read. + if read is not None: + yield read + + +class Read(object): + def __init__(self): + self.seq = '' + self.qual = '' + self.id = '' + self.name = '' + + +class FormatError(Exception): + def __init__(self, message=None): + if message: + Exception.__init__(self, message)