Mercurial > repos > nick > sequence_content_trimmer
comparison getreads.py @ 0:c824894e0827 draft
planemo upload commit bb6c51fa3c8501e779f3b266d17f4900d98fc431-dirty
| author | nick |
|---|---|
| date | Tue, 01 Dec 2015 21:11:51 -0500 |
| parents | |
| children | 7ef568cbf13b |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c824894e0827 |
|---|---|
| 1 """A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and | |
| 2 sequence. | |
| 3 All format parsers follow this API: | |
| 4 with open('sequence.fasta') as fasta: | |
| 5 for read in getreads.getparser(fasta, filetype='fasta'): | |
| 6 print "There is a sequence with this FASTA identifier: "+read.id | |
| 7 print "Its sequence is "+read.seq | |
| 8 The properties of Read are: | |
| 9 name: The entire FASTA header line, SAM column 1, etc. | |
| 10 id: The first whitespace-delimited part of the name. | |
| 11 seq: The sequence. | |
| 12 qual: The quality scores (unless the format is FASTA). | |
| 13 """ | |
| 14 | |
| 15 | |
| 16 def getparser(filehandle, filetype='fasta'): | |
| 17 if filetype == 'fasta': | |
| 18 return FastaReader(filehandle) | |
| 19 elif filetype == 'fastq': | |
| 20 return FastqReader(filehandle) | |
| 21 elif filetype == 'sam': | |
| 22 return SamReader(filehandle) | |
| 23 elif filetype == 'tsv': | |
| 24 return TsvReader(filehandle) | |
| 25 else: | |
| 26 raise ValueError('Illegal argument: filetype=\''+filetype+'\'') | |
| 27 | |
| 28 | |
| 29 class FormatError(Exception): | |
| 30 def __init__(self, message=None): | |
| 31 if message: | |
| 32 Exception.__init__(self, message) | |
| 33 | |
| 34 | |
| 35 class Read(object): | |
| 36 def __init__(self, name='', seq='', id_='', qual=''): | |
| 37 self.name = name | |
| 38 self.seq = seq | |
| 39 self.id = id_ | |
| 40 self.qual = qual | |
| 41 | |
| 42 | |
| 43 class Reader(object): | |
| 44 """Base class for all other parsers.""" | |
| 45 def __init__(self, filehandle): | |
| 46 self.filehandle = filehandle | |
| 47 def __iter__(self): | |
| 48 return self.parser() | |
| 49 | |
| 50 | |
| 51 class TsvReader(Reader): | |
| 52 """A parser for a simple tab-delimited format. | |
| 53 Column 1: name | |
| 54 Column 2: sequence | |
| 55 Column 3: quality scores (optional)""" | |
| 56 def parser(self): | |
| 57 for line in self.filehandle: | |
| 58 fields = line.rstrip('\r\n').split('\t') | |
| 59 if len(fields) < 2: | |
| 60 continue | |
| 61 read = Read() | |
| 62 read.name = fields[0] | |
| 63 if read.name: | |
| 64 read.id = read.name.split()[0] | |
| 65 read.seq = fields[1] | |
| 66 if len(fields) >= 3: | |
| 67 read.qual = fields[2] | |
| 68 yield read | |
| 69 | |
| 70 | |
| 71 class SamReader(Reader): | |
| 72 """A simple SAM parser. | |
| 73 Assumptions: | |
| 74 Lines starting with "@" with 3 fields are headers. All others are alignments. | |
| 75 All alignment lines have 11 or more fields. Other lines will be skipped. | |
| 76 """ | |
| 77 def parser(self): | |
| 78 for line in self.filehandle: | |
| 79 fields = line.split('\t') | |
| 80 if len(fields) < 11: | |
| 81 continue | |
| 82 # Skip headers. | |
| 83 if fields[0].startswith('@') and len(fields[0]) == 3: | |
| 84 continue | |
| 85 read = Read() | |
| 86 read.name = fields[0] | |
| 87 if read.name: | |
| 88 read.id = read.name.split()[0] | |
| 89 read.seq = fields[9] | |
| 90 read.qual = fields[10].rstrip('\r\n') | |
| 91 yield read | |
| 92 | |
| 93 | |
| 94 class FastaReader(Reader): | |
| 95 """A simple FASTA parser that reads one sequence at a time into memory.""" | |
| 96 def parser(self): | |
| 97 read = Read() | |
| 98 while True: | |
| 99 line_raw = self.filehandle.readline() | |
| 100 if not line_raw: | |
| 101 if read.seq: | |
| 102 yield read | |
| 103 raise StopIteration | |
| 104 line = line_raw.strip() | |
| 105 # Allow empty lines. | |
| 106 if not line: | |
| 107 continue | |
| 108 if line.startswith('>'): | |
| 109 if read.seq: | |
| 110 yield read | |
| 111 read = Read() | |
| 112 read.name = line[1:] # remove ">" | |
| 113 if read.name: | |
| 114 read.id = read.name.split()[0] | |
| 115 continue | |
| 116 else: | |
| 117 read.seq += line | |
| 118 | |
| 119 | |
| 120 class FastqReader(Reader): | |
| 121 """A simple FASTQ parser. Can handle multi-line sequences, though.""" | |
| 122 def parser(self): | |
| 123 read = Read() | |
| 124 state = 'header' | |
| 125 while True: | |
| 126 line_raw = self.filehandle.readline() | |
| 127 if not line_raw: | |
| 128 if read.seq: | |
| 129 yield read | |
| 130 raise StopIteration | |
| 131 line = line_raw.strip() | |
| 132 # Allow empty lines. | |
| 133 if not line: | |
| 134 continue | |
| 135 if state == 'header': | |
| 136 if not line.startswith('@'): | |
| 137 raise FormatError('line state = "header" but line does not start with "@"') | |
| 138 if read.seq: | |
| 139 yield read | |
| 140 read = Read() | |
| 141 read.name = line[1:] # remove '@' | |
| 142 if read.name: | |
| 143 read.id = read.name.split()[0] | |
| 144 state = 'sequence' | |
| 145 elif state == 'sequence': | |
| 146 if line.startswith('+'): | |
| 147 state = 'plus' | |
| 148 else: | |
| 149 read.seq += line | |
| 150 elif state == 'plus' or state == 'quality': | |
| 151 state = 'quality' | |
| 152 togo = len(read.seq) - len(read.qual) | |
| 153 read.qual += line[:togo] | |
| 154 # The end of the quality lines is when we have a quality string as long as the sequence. | |
| 155 if len(read.qual) >= len(read.seq): | |
| 156 state = 'header' |
