Mercurial > repos > bcclaywell > argo_navis
comparison venv/lib/python2.7/codecs.py @ 0:d67268158946 draft
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author | bcclaywell |
---|---|
date | Mon, 12 Oct 2015 17:43:33 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d67268158946 |
---|---|
1 """ codecs -- Python Codec Registry, API and helpers. | |
2 | |
3 | |
4 Written by Marc-Andre Lemburg (mal@lemburg.com). | |
5 | |
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. | |
7 | |
8 """#" | |
9 | |
10 import __builtin__, sys | |
11 | |
12 ### Registry and builtin stateless codec functions | |
13 | |
# The C implementation provides the registry primitives (register, lookup,
# encode, decode, the error handlers, ...); without it this module is useless.
try:
    from _codecs import *
except ImportError as why:
    # Use the unambiguous 'as' binding form (available since Python 2.6);
    # the old "except ImportError, why" comma form is invalid in Python 3.
    raise SystemError('Failed to load the builtin codecs: %s' % why)
18 | |
# Public API of the codecs module.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#
# Note: these are Python 2 byte strings (str), not unicode objects.
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code); kept only for backward
# compatibility -- note they alias the UTF-16/UTF-32 BOMs despite the
# 32/64 in their names.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
77 | |
78 | |
79 ### Codec base classes (defining the API) | |
80 | |
class CodecInfo(tuple):

    """ Codec details as returned by the codec registry.

        Behaves as the legacy 4-tuple (encode, decode, streamreader,
        streamwriter) while additionally exposing the incremental
        codec factories and the codec name as attributes.
    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # The tuple layout is fixed for backward compatibility with
        # callers that unpack the result of codecs.lookup().
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
97 | |
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects how coding errors are handled. These string values
        are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - skip the offending character and continue
         'replace' - substitute a suitable replacement character;
                     the builtin Unicode codecs use the official
                     U+FFFD REPLACEMENT CHARACTER on decoding and
                     '?' on encoding.
         'xmlcharrefreplace' - substitute the appropriate XML
                               character reference (encoding only).
         'backslashreplace' - substitute backslashed escape
                              sequences (encoding only).

        Additional values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):
        """ Encode input and return a tuple (output object,
            length consumed).

            errors selects the error handling scheme; it defaults
            to 'strict'.

            Implementations must not keep state on the Codec
            instance; use StreamCodec for codecs that need state to
            encode/decode efficiently.

            Zero length input must be accepted and must produce an
            empty object of the output type.
        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):
        """ Decode input and return a tuple (output object,
            length consumed).

            input must provide the bf_getreadbuf buffer slot, as
            e.g. Python strings, buffer objects and memory mapped
            files do.

            errors selects the error handling scheme; it defaults
            to 'strict'.

            Implementations must not keep state on the Codec
            instance; use StreamCodec for codecs that need state to
            encode/decode efficiently.

            Zero length input must be accepted and must produce an
            empty object of the output type.
        """
        raise NotImplementedError
161 | |
class IncrementalEncoder(object):
    """
    Encodes an input in several steps: pieces are fed to encode() one
    after another and the instance keeps the state of the encoding
    process between calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder.

        errors selects the error handling scheme; see the module
        docstring for the predefined values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input, returning the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Restore the encoder to its initial state.
        """

    def getstate(self):
        """
        Return an opaque marker for the current encoder state.
        """
        return 0

    def setstate(self, state):
        """
        Restore an encoder state previously produced by getstate().
        """
201 | |
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that have to carry unencoded
    input over from one encode() call to the next.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input that could not be encoded yet, kept for the next call
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Subclasses implement the actual conversion here and return
        # an (output, length consumed) pair.
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend any leftover input, encode, keep the remainder.
        pending = self.buffer + input
        (output, used) = self._buffer_encode(pending, self.errors, final)
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer collapses to the plain initial-state marker 0
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""
234 | |
class IncrementalDecoder(object):
    """
    Decodes an input in several steps: pieces are fed to decode() one
    after another and the instance keeps the state of the decoding
    process between calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder.

        errors selects the error handling scheme; see the module
        docstring for the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input, returning the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Restore the decoder to its initial state.
        """

    def getstate(self):
        """
        Return the current decoder state as a (buffered_input,
        additional_state_info) tuple.

        buffered_input is a bytes object holding input already passed
        to decode() that has not been converted yet;
        additional_state_info is a non-negative integer describing the
        decoder state WITHOUT the contents of buffered_input having
        been processed. In the initial state and after reset() this is
        (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Restore a decoder state previously produced by getstate().
        Passing (b"", 0) is equivalent to calling reset().
        """
283 | |
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that may receive incomplete
    byte sequences and must carry the unconsumed tail over to the
    next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input carried over between decode() calls
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Subclasses implement the actual conversion here and return
        # an (output, length consumed) pair.
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend any leftover bytes, decode, keep the remainder.
        pending = self.buffer + input
        (output, used) = self._buffer_decode(pending, self.errors, final)
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # the additional state info is always 0 for buffered decoders
        return (self.buffer, 0)

    def setstate(self, state):
        # the additional state info from getstate() is ignored
        self.buffer = state[0]
318 | |
319 # | |
320 # The StreamWriter and StreamReader class provide generic working | |
321 # interfaces which can be used to implement new encoding submodules | |
322 # very easily. See encodings/utf_8.py for an example on how this is | |
323 # done. | |
324 # | |
325 | |
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):
        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The errors argument selects the error handling scheme;
            these values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - skip the offending character and continue
             'replace' - substitute a suitable replacement character
             'xmlcharrefreplace' - substitute the appropriate XML
                                   character reference.
             'backslashreplace' - substitute backslashed escape
                                  sequences (only for encoding).

            Additional values can be made available via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):
        """ Writes the object's contents encoded to self.stream. """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):
        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):
        """ Flushes and resets the codec buffers used for keeping state.

            Afterwards the data on the output is in a clean state and
            fresh data can be appended without rescanning the whole
            stream to recover state.
        """
        pass

    def seek(self, offset, whence=0):
        # A rewind to the very start of the stream discards any codec state.
        self.stream.seek(offset, whence)
        if offset == 0 and whence == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream. """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
396 | |
397 ### | |
398 | |
class StreamReader(Codec):

    """ Generic buffered reader that decodes data from a binary stream.

        Subclasses provide the actual .decode() implementation; this
        base class manages the byte buffer (undecoded input), the
        character buffer (decoded but unread output) and an optional
        line cache used by readline()/readlines().
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream but not yet decoded
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # list of decoded lines cached by readline(); None while unused
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Subclasses must implement the actual decoding step.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    # retry on the prefix before the error; if it holds a
                    # complete first line, return that and keep the rest
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # grow the read size geometrically (capped) to find the
            # line terminator in few iterations on long lines
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
638 | |
639 ### | |
640 | |
class StreamReaderWriter:

    """ Wraps a stream so it can be used in both read and write mode.

        Reading goes through a StreamReader and writing through a
        StreamWriter, both built from the factory functions handed to
        the constructor (typically the ones found via codecs.lookup()).
    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):
        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader resp. StreamWriter interface.

            Errors are handled the same way as defined for the
            stream readers and writers.
        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        return self.reader.read(size)

    def readline(self, size=None):
        return self.reader.readline(size)

    def readlines(self, sizehint=None):
        return self.reader.readlines(sizehint)

    def next(self):
        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):
        return self.writer.write(data)

    def writelines(self, list):
        return self.writer.writelines(list)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # Seeking invalidates the reader's buffers; the writer only
        # needs a reset when rewinding to the very start of the stream.
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream. """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
725 | |
726 ### | |
727 | |
class StreamRecoder:

    """ StreamRecoder instances translate data transparently between
        a frontend encoding and a backend encoding.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which depends on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and handed back to the caller encoded.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):
        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while Reader and
            Writer work on the backend (reading and writing to the
            stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface;
            Reader and Writer must be factory functions or classes
            providing the StreamReader resp. StreamWriter interface.

            encode and decode cover the frontend translation, Reader
            and Writer the backend translation. Unicode is used as
            the intermediate encoding.

            Errors are handled the same way as defined for the
            stream readers and writers.
        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        decoded = self.reader.read(size)
        encoded, _ = self.encode(decoded, self.errors)
        return encoded

    def readline(self, size=None):
        if size is None:
            decoded = self.reader.readline()
        else:
            decoded = self.reader.readline(size)
        encoded, _ = self.encode(decoded, self.errors)
        return encoded

    def readlines(self, sizehint=None):
        decoded = self.reader.read()
        encoded, _ = self.encode(decoded, self.errors)
        return encoded.splitlines(1)

    def next(self):
        """ Return the next decoded line from the input stream."""
        decoded = self.reader.next()
        encoded, _ = self.encode(decoded, self.errors)
        return encoded

    def __iter__(self):
        return self

    def write(self, data):
        decoded, _ = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):
        joined = ''.join(list)
        decoded, _ = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream. """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
841 | |
842 ### Shortcuts | |
843 | |
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binaryy mode is required to
            # keep 8-bit encodings intact, so append 'b' when missing
            mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return stream
    info = lookup(encoding)
    srw = StreamReaderWriter(stream, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
892 | |
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    front = lookup(data_encoding)
    back = lookup(file_encoding)
    recoder = StreamRecoder(file, front.encode, front.decode,
                            back.streamreader, back.streamwriter, errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
928 | |
929 ### Helpers for codec lookup | |
930 | |
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec.encode
940 | |
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec.decode
950 | |
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    # Codecs registered without incremental support leave this slot None.
    if factory is None:
        raise LookupError(encoding)
    return factory
964 | |
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    # Codecs registered without incremental support leave this slot None.
    if factory is None:
        raise LookupError(encoding)
    return factory
978 | |
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec.streamreader
988 | |
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec.streamwriter
998 | |
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Lazily encodes the strings produced by iterator with an
    IncrementalEncoder for the given encoding, yielding each
    non-empty encoded chunk as it becomes available.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Flush any state buffered by the encoder (final=True).
    tail = encoder.encode("", True)
    if tail:
        yield tail
1016 | |
1017 def iterdecode(iterator, encoding, errors='strict', **kwargs): | |
1018 """ | |
1019 Decoding iterator. | |
1020 | |
1021 Decodes the input strings from the iterator using a IncrementalDecoder. | |
1022 | |
1023 errors and kwargs are passed through to the IncrementalDecoder | |
1024 constructor. | |
1025 """ | |
1026 decoder = getincrementaldecoder(encoding)(errors, **kwargs) | |
1027 for input in iterator: | |
1028 output = decoder.decode(input) | |
1029 if output: | |
1030 yield output | |
1031 output = decoder.decode("", True) | |
1032 if output: | |
1033 yield output | |
1034 | |
1035 ### Helpers for charmap-based codecs | |
1036 | |
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    # Each element becomes both key and value.
    return dict((element, element) for element in rng)
1049 | |
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    encoding_map = {}
    for key, value in decoding_map.items():
        # A value seen twice is ambiguous on the encoding side -> None.
        encoding_map[value] = None if value in encoding_map else key
    return encoding_map
1070 | |
1071 ### error handlers | |
1072 | |
try:
    # Bind the standard error handlers registered by _codecs so they
    # can be referenced directly (e.g. codecs.strict_errors).
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds these handlers are not registered,
    # so fall back to None placeholders.
    strict_errors = ignore_errors = replace_errors = \
        xmlcharrefreplace_errors = backslashreplace_errors = None
1086 | |
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below never executes at runtime (_false is 0),
# but static analyzers such as modulefinder still see the statement
# and include the encodings package in frozen/bundled distributions.
_false = 0
if _false:
    import encodings
1092 | |
1093 ### Tests | |
1094 | |
if __name__ == '__main__':

    # Demo: recode Latin-1 input on stdin via a UTF-8 -> Latin-1 wrapper
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')

    # Demo: recode Latin-1 output on stdout via a Latin-1 -> UTF-8 wrapper
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')