comparison venv/lib/python2.7/codecs.py @ 0:d67268158946 draft

planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author bcclaywell
date Mon, 12 Oct 2015 17:43:33 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d67268158946
1 """ codecs -- Python Codec Registry, API and helpers.
2
3
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8 """#"
9
10 import __builtin__, sys
11
12 ### Registry and builtin stateless codec functions
13
14 try:
15 from _codecs import *
16 except ImportError, why:
17 raise SystemError('Failed to load the builtin codecs: %s' % why)
18
19 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
20 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
22 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
23 "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
24 "StreamReader", "StreamWriter",
25 "StreamReaderWriter", "StreamRecoder",
26 "getencoder", "getdecoder", "getincrementalencoder",
27 "getincrementaldecoder", "getreader", "getwriter",
28 "encode", "decode", "iterencode", "iterdecode",
29 "strict_errors", "ignore_errors", "replace_errors",
30 "xmlcharrefreplace_errors", "backslashreplace_errors",
31 "register_error", "lookup_error"]
32
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Select the native-endian spellings once, at import time, so that
# BOM / BOM_UTF16 / BOM_UTF32 match this machine's byte order.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code).  Despite the "32"/"64" in
# the names, BOM32_* alias the UTF-16 marks and BOM64_* the UTF-32 ones;
# they are kept only for backward compatibility.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE


### Codec base classes (defining the API)
80
class CodecInfo(tuple):

    """ Codec details as returned by the registry.

        Instances behave like the historical 4-tuple
        (encode, decode, streamreader, streamwriter) for backward
        compatibility, while additionally exposing the incremental
        codec factories and the codec name as attributes.
    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % (
            self.__class__.__module__, self.__class__.__name__,
            self.name, id(self))
97
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme.  These string values are
        predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Additional values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple (output
            object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations must not store state on the Codec
            instance; use StreamCodec for codecs which have to keep
            state in order to make encoding/decoding efficient.

            The encoder must accept zero length input and return an
            empty object of the output type in that situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot; Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations must not store state on the Codec
            instance; use StreamCodec for codecs which have to keep
            state in order to make encoding/decoding efficient.

            The decoder must accept zero length input and return an
            empty object of the output type in that situation.

        """
        raise NotImplementedError
161
class IncrementalEncoder(object):
    """
    Base class for encoders that process their input in multiple steps.

    Data is fed piecewise to the encode() method; the instance keeps
    whatever state the encoding process needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        The errors keyword selects the error handling scheme; see the
        module docstring for the predefined values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to its initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Restore the encoder state; state must have been returned by
        getstate().
        """
201
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that have to retain some of the
    not-yet-encoded input between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Subclasses override this: perform the actual conversion and
        # return an (output, length consumed) pair.
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend whatever input was left over from the previous call.
        pending = self.buffer + input
        (output, consumed) = self._buffer_encode(pending, self.errors, final)
        # Whatever was not consumed is carried over to the next call.
        self.buffer = pending[consumed:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""
234
class IncrementalDecoder(object):
    """
    Base class for decoders that process their input in multiple steps.

    Data is fed piecewise to the decode() method; the instance keeps
    whatever state the decoding process needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The errors keyword selects the error handling scheme; see the
        module docstring for the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to its initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder as a
        (buffered_input, additional_state_info) tuple.

        buffered_input must be a bytes object containing bytes that
        were passed to decode() but have not yet been converted.
        additional_state_info must be a non-negative integer that
        describes the decoder state WITHOUT the contents of
        buffered_input having been processed.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Restore the decoder state; state must have been returned by
        getstate(), and setstate((b"", 0)) must be equivalent to
        reset().
        """
283
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that have to deal with
    incomplete byte sequences spanning calls to decode().
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Subclasses override this: perform the actual conversion and
        # return an (output, length consumed) pair.
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend whatever input was left over from the previous call.
        pending = self.buffer + input
        (output, consumed) = self._buffer_decode(pending, self.errors, final)
        # Whatever was not consumed is carried over to the next call.
        self.buffer = pending[consumed:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]
318
319 #
320 # The StreamWriter and StreamReader class provide generic working
321 # interfaces which can be used to implement new encoding submodules
322 # very easily. See encodings/utf_8.py for an example on how this is
323 # done.
324 #
325
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The errors keyword selects the error handling scheme;
            these parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Additional parameter values can be made available via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object's contents and write the result to the
            stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Join the strings in list and write the result to the
            stream using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            Calling this method should leave the output data in a
            clean state, so that fresh data can be appended without
            having to rescan the whole stream to recover state.

        """
        pass

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # A rewind to the very start invalidates any pending codec state.
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every other attribute lookup to the underlying
            stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
396
397 ###
398
class StreamReader(Codec):

    """ Generic buffered reader that decodes data from a stream.

        Subclasses provide .decode(); this base class implements the
        read()/readline()/readlines() buffering logic on top of it.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Raw bytes read from the stream but not yet decoded.
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # List of already-split lines cached by readline(); when set,
        # charbuffer is None (the two buffers are mutually exclusive).
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Overridden by concrete codecs: must return a
        # (decoded object, number of bytes consumed) tuple.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    # Retry on the bytes before the error; if that
                    # yields at least one complete line we can hand it
                    # out and postpone the failing input.
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                        if not keepends:
                            line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # No complete line yet: grow the read size (capped) and retry.
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
638
639 ###
640
641 class StreamReaderWriter:
642
643 """ StreamReaderWriter instances allow wrapping streams which
644 work in both read and write modes.
645
646 The design is such that one can use the factory functions
647 returned by the codec.lookup() function to construct the
648 instance.
649
650 """
651 # Optional attributes set by the file wrappers below
652 encoding = 'unknown'
653
654 def __init__(self, stream, Reader, Writer, errors='strict'):
655
656 """ Creates a StreamReaderWriter instance.
657
658 stream must be a Stream-like object.
659
660 Reader, Writer must be factory functions or classes
661 providing the StreamReader, StreamWriter interface resp.
662
663 Error handling is done in the same way as defined for the
664 StreamWriter/Readers.
665
666 """
667 self.stream = stream
668 self.reader = Reader(stream, errors)
669 self.writer = Writer(stream, errors)
670 self.errors = errors
671
672 def read(self, size=-1):
673
674 return self.reader.read(size)
675
676 def readline(self, size=None):
677
678 return self.reader.readline(size)
679
680 def readlines(self, sizehint=None):
681
682 return self.reader.readlines(sizehint)
683
684 def next(self):
685
686 """ Return the next decoded line from the input stream."""
687 return self.reader.next()
688
689 def __iter__(self):
690 return self
691
692 def write(self, data):
693
694 return self.writer.write(data)
695
696 def writelines(self, list):
697
698 return self.writer.writelines(list)
699
700 def reset(self):
701
702 self.reader.reset()
703 self.writer.reset()
704
705 def seek(self, offset, whence=0):
706 self.stream.seek(offset, whence)
707 self.reader.reset()
708 if whence == 0 and offset == 0:
709 self.writer.reset()
710
711 def __getattr__(self, name,
712 getattr=getattr):
713
714 """ Inherit all other methods from the underlying stream.
715 """
716 return getattr(self.stream, name)
717
718 # these are needed to make "with codecs.open(...)" work properly
719
720 def __enter__(self):
721 return self
722
723 def __exit__(self, type, value, tb):
724 self.stream.close()
725
726 ###
727
728 class StreamRecoder:
729
730 """ StreamRecoder instances provide a frontend - backend
731 view of encoding data.
732
733 They use the complete set of APIs returned by the
734 codecs.lookup() function to implement their task.
735
736 Data written to the stream is first decoded into an
737 intermediate format (which is dependent on the given codec
738 combination) and then written to the stream using an instance
739 of the provided Writer class.
740
741 In the other direction, data is read from the stream using a
742 Reader instance and then return encoded data to the caller.
743
744 """
745 # Optional attributes set by the file wrappers below
746 data_encoding = 'unknown'
747 file_encoding = 'unknown'
748
749 def __init__(self, stream, encode, decode, Reader, Writer,
750 errors='strict'):
751
752 """ Creates a StreamRecoder instance which implements a two-way
753 conversion: encode and decode work on the frontend (the
754 input to .read() and output of .write()) while
755 Reader and Writer work on the backend (reading and
756 writing to the stream).
757
758 You can use these objects to do transparent direct
759 recodings from e.g. latin-1 to utf-8 and back.
760
761 stream must be a file-like object.
762
763 encode, decode must adhere to the Codec interface, Reader,
764 Writer must be factory functions or classes providing the
765 StreamReader, StreamWriter interface resp.
766
767 encode and decode are needed for the frontend translation,
768 Reader and Writer for the backend translation. Unicode is
769 used as intermediate encoding.
770
771 Error handling is done in the same way as defined for the
772 StreamWriter/Readers.
773
774 """
775 self.stream = stream
776 self.encode = encode
777 self.decode = decode
778 self.reader = Reader(stream, errors)
779 self.writer = Writer(stream, errors)
780 self.errors = errors
781
782 def read(self, size=-1):
783
784 data = self.reader.read(size)
785 data, bytesencoded = self.encode(data, self.errors)
786 return data
787
788 def readline(self, size=None):
789
790 if size is None:
791 data = self.reader.readline()
792 else:
793 data = self.reader.readline(size)
794 data, bytesencoded = self.encode(data, self.errors)
795 return data
796
797 def readlines(self, sizehint=None):
798
799 data = self.reader.read()
800 data, bytesencoded = self.encode(data, self.errors)
801 return data.splitlines(1)
802
803 def next(self):
804
805 """ Return the next decoded line from the input stream."""
806 data = self.reader.next()
807 data, bytesencoded = self.encode(data, self.errors)
808 return data
809
810 def __iter__(self):
811 return self
812
813 def write(self, data):
814
815 data, bytesdecoded = self.decode(data, self.errors)
816 return self.writer.write(data)
817
818 def writelines(self, list):
819
820 data = ''.join(list)
821 data, bytesdecoded = self.decode(data, self.errors)
822 return self.writer.write(data)
823
824 def reset(self):
825
826 self.reader.reset()
827 self.writer.reset()
828
829 def __getattr__(self, name,
830 getattr=getattr):
831
832 """ Inherit all other methods from the underlying stream.
833 """
834 return getattr(self.stream, name)
835
836 def __enter__(self):
837 return self
838
839 def __exit__(self, type, value, tb):
840 self.stream.close()
841
842 ### Shortcuts
843
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return a wrapped
        version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified, to avoid data loss due to encodings using 8-bit
        values.  The default mode 'rb' therefore opens the file for
        binary reading.

        encoding names the encoding to be used for the file; when it
        is None a plain builtin file object is returned.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is None:
        # No recoding requested: hand back an unwrapped file object
        # opened with the caller's mode untouched.
        return __builtin__.open(filename, mode, buffering)
    if 'U' in mode:
        # No automatic conversion of '\n' is done on reading and writing
        mode = mode.strip().replace('U', '')
        if mode[:1] not in set('rwa'):
            mode = 'r' + mode
    if 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    info = lookup(encoding)
    wrapped = StreamReaderWriter(stream, info.streamreader,
                                 info.streamwriter, errors)
    # Add an attribute to simplify introspection
    wrapped.encoding = encoding
    return wrapped
892
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file providing transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides the two extra
        attributes .data_encoding and .file_encoding reflecting the
        given parameters of the same name, useful for introspection by
        Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    recoder = StreamRecoder(file, data_info.encode, data_info.decode,
                            file_info.streamreader, file_info.streamwriter,
                            errors)
    # Attach the encodings to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
928
929 ### Helpers for codec lookup
930
def getencoder(encoding):

    """ Look up the codec for the given encoding and return its
        encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec.encode
940
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return its
        decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec.decode
950
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return its
        IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found or
        the codec doesn't provide an incremental encoder.

    """
    codec = lookup(encoding)
    if codec.incrementalencoder is None:
        raise LookupError(encoding)
    return codec.incrementalencoder
964
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return its
        IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found or
        the codec doesn't provide an incremental decoder.

    """
    codec = lookup(encoding)
    if codec.incrementaldecoder is None:
        raise LookupError(encoding)
    return codec.incrementaldecoder
978
def getreader(encoding):

    """ Look up the codec for the given encoding and return its
        StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec.streamreader
988
def getwriter(encoding):

    """ Look up the codec for the given encoding and return its
        StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec.streamwriter
998
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Lazily encode the strings drawn from iterator with an
    IncrementalEncoder for the given encoding.  errors and kwargs are
    passed through to the encoder's constructor.  Only non-empty
    outputs are yielded; a final flushing call is made once the input
    is exhausted.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        piece = encoder.encode(chunk)
        if piece:
            yield piece
    # Flush any state the encoder is still holding.
    tail = encoder.encode("", True)
    if tail:
        yield tail
1016
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Lazily decode the input strings drawn from iterator with an
    IncrementalDecoder for the given encoding.  errors and kwargs are
    passed through to the decoder's constructor.  Only non-empty
    outputs are yielded; a final flushing call is made once the input
    is exhausted.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        piece = decoder.decode(chunk)
        if piece:
            yield piece
    # Flush any state the decoder is still holding.
    tail = decoder.decode("", True)
    if tail:
        yield tail
1034
1035 ### Helpers for charmap-based codecs
1036
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    # Dict comprehension replaces the manual accumulation loop: same
    # semantics, idiomatic and available since Python 2.7.
    return {element: element for element in rng}
1049
def make_encoding_map(decoding_map):

    """ Create an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    encoding_map = {}
    for key, value in decoding_map.items():
        if value not in encoding_map:
            encoding_map[value] = key
        else:
            # Ambiguous target: poison the entry so encoding fails.
            encoding_map[value] = None
    return encoding_map
1070
### error handlers

# Resolve the standard error handler callbacks once at import time so
# they are available as plain function objects.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handler are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    # Never executed at runtime; the import only has to be visible to
    # static analysis tools such as modulefinder.
    import encodings
1092
### Tests

if __name__ == '__main__':

    # Smoke test: recode the standard streams in place.  Argument
    # order is EncodedFile(file, data_encoding, file_encoding).

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')