annotate venv/lib/python2.7/site-packages/requests/packages/chardet/chardistribution.py @ 0:d67268158946 draft

planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author bcclaywell
date Mon, 12 Oct 2015 17:43:33 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
1 ######################## BEGIN LICENSE BLOCK ########################
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
2 # The Original Code is Mozilla Communicator client code.
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
3 #
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
4 # The Initial Developer of the Original Code is
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
5 # Netscape Communications Corporation.
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
6 # Portions created by the Initial Developer are Copyright (C) 1998
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
7 # the Initial Developer. All Rights Reserved.
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
8 #
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
9 # Contributor(s):
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
10 # Mark Pilgrim - port to Python
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
11 #
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
12 # This library is free software; you can redistribute it and/or
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
13 # modify it under the terms of the GNU Lesser General Public
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
14 # License as published by the Free Software Foundation; either
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
15 # version 2.1 of the License, or (at your option) any later version.
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
16 #
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
17 # This library is distributed in the hope that it will be useful,
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
20 # Lesser General Public License for more details.
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
21 #
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
22 # You should have received a copy of the GNU Lesser General Public
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
23 # License along with this library; if not, write to the Free Software
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
24 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
25 # 02110-1301 USA
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
26 ######################### END LICENSE BLOCK #########################
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
27
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
28 from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE,
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
29 EUCTW_TYPICAL_DISTRIBUTION_RATIO)
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
30 from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE,
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
31 EUCKR_TYPICAL_DISTRIBUTION_RATIO)
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
32 from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE,
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
33 GB2312_TYPICAL_DISTRIBUTION_RATIO)
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
34 from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE,
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
35 BIG5_TYPICAL_DISTRIBUTION_RATIO)
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
36 from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE,
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
37 JIS_TYPICAL_DISTRIBUTION_RATIO)
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
38 from .compat import wrap_ord
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
39
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
# got_enough_data() reports True once more than this many characters have
# been counted.
ENOUGH_DATA_THRESHOLD = 1024
# Upper bound on the confidence returned by get_confidence(); the analysers
# never claim to be 100% sure.
SURE_YES = 0.99
# Confidence returned when there is too little usable data.
SURE_NO = 0.01
# get_confidence() answers SURE_NO unless more than this many frequent
# characters have been seen.
MINIMUM_DATA_THRESHOLD = 3
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
44
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
45
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
class CharDistributionAnalysis:
    """Base class counting how many fed two-byte characters are "frequent".

    Subclasses set the frequency table, its size and the typical
    distribution ratio, and override ``get_order`` to turn the raw bytes of
    a character into an index into that table.
    """

    def __init__(self):
        # Table translating a character order (as produced by get_order())
        # into a frequency order.
        self._mCharToFreqOrder = None
        # Number of entries in the table above.
        self._mTableSize = None
        # Constant that differs from language to language and is used when
        # computing the confidence.  See
        # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
        # for further detail.
        self._mTypicalDistributionRatio = None
        self.reset()

    def reset(self):
        """Clear every counter and flag accumulated so far."""
        # Set to True once detection is finished and a conclusion was made.
        self._mDone = False
        # Number of characters whose frequency order is below 512.
        self._mFreqChars = 0
        # Number of characters counted in total.
        self._mTotalChars = 0

    def feed(self, aBuf, aCharLen):
        """Account for one character whose byte length is already known.

        Only characters with ``aCharLen == 2`` are analysed; anything else,
        and any character for which ``get_order`` returns a negative value,
        is ignored.
        """
        if aCharLen != 2:
            return
        char_order = self.get_order(aBuf)
        if char_order < 0:
            return
        self._mTotalChars += 1
        # Orders beyond the table are counted in the total but can never be
        # classified as frequent.
        if (char_order < self._mTableSize
                and self._mCharToFreqOrder[char_order] < 512):
            self._mFreqChars += 1

    def get_confidence(self):
        """Return a confidence value derived from the counters so far.

        Returns ``SURE_NO`` when no character was counted or when the number
        of frequent characters does not exceed ``MINIMUM_DATA_THRESHOLD``;
        otherwise the frequent/non-frequent ratio scaled by the typical
        distribution ratio, capped at ``SURE_YES``.
        """
        total = self._mTotalChars
        frequent = self._mFreqChars
        if total <= 0 or frequent <= MINIMUM_DATA_THRESHOLD:
            return SURE_NO

        if total == frequent:
            # Every counted character was frequent; the ratio is undefined,
            # so answer with the capped maximum (never 100% sure).
            return SURE_YES

        ratio = frequent / ((total - frequent)
                            * self._mTypicalDistributionRatio)
        if ratio < SURE_YES:
            return ratio
        return SURE_YES

    def got_enough_data(self):
        """Tell whether enough characters were counted to draw a conclusion.

        Not all of the data has to be seen; for charset detection a certain
        amount is sufficient.
        """
        return self._mTotalChars > ENOUGH_DATA_THRESHOLD

    def get_order(self, aBuf):
        """Map the bytes of a character to its order; the base returns -1.

        Characters are not handled through their encoded byte string but
        through a number, called the order, derived from it.  That lets
        several encodings of one language share a single frequency table.
        """
        return -1
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
109
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
110
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser configured with the EUC-TW frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
        self._mTableSize = EUCTW_TABLE_SIZE
        self._mCharToFreqOrder = EUCTWCharToFreqOrder

    def get_order(self, aBuf):
        """Return the table index of a two-byte EUC-TW character, or -1.

        Bytes of interest:
          first byte range:  0xc4 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        The bytes are not validated here; the state machine has already
        done that.
        """
        lead = wrap_ord(aBuf[0])
        if lead < 0xC4:
            return -1
        # 94 second-byte slots per lead byte.
        trail = wrap_ord(aBuf[1])
        return 94 * (lead - 0xC4) + trail - 0xA1
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
128
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
129
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser configured with the EUC-KR frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
        self._mTableSize = EUCKR_TABLE_SIZE
        self._mCharToFreqOrder = EUCKRCharToFreqOrder

    def get_order(self, aBuf):
        """Return the table index of a two-byte EUC-KR character, or -1.

        Bytes of interest:
          first byte range:  0xb0 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        The bytes are not validated here; the state machine has already
        done that.
        """
        lead = wrap_ord(aBuf[0])
        if lead < 0xB0:
            return -1
        # 94 second-byte slots per lead byte.
        trail = wrap_ord(aBuf[1])
        return 94 * (lead - 0xB0) + trail - 0xA1
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
147
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
148
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
class GB2312DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser configured with the GB2312 frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
        self._mTableSize = GB2312_TABLE_SIZE
        self._mCharToFreqOrder = GB2312CharToFreqOrder

    def get_order(self, aBuf):
        """Return the table index of a two-byte GB2312 character, or -1.

        Bytes of interest:
          first byte range:  0xb0 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        The bytes are not validated here; the state machine has already
        done that.
        """
        lead = wrap_ord(aBuf[0])
        trail = wrap_ord(aBuf[1])
        # Either byte below its lower bound puts the character out of the
        # range of interest.
        if lead < 0xB0 or trail < 0xA1:
            return -1
        return 94 * (lead - 0xB0) + trail - 0xA1
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
166
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
167
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
class Big5DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser configured with the Big5 frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
        self._mTableSize = BIG5_TABLE_SIZE
        self._mCharToFreqOrder = Big5CharToFreqOrder

    def get_order(self, aBuf):
        """Return the table index of a two-byte Big5 character, or -1.

        Bytes of interest:
          first byte range:  0xa4 -- 0xfe
          second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
        The bytes are not validated here; the state machine has already
        done that.
        """
        lead = wrap_ord(aBuf[0])
        trail = wrap_ord(aBuf[1])
        if lead < 0xA4:
            return -1
        # 157 second-byte slots per lead byte.
        row_start = 157 * (lead - 0xA4)
        if trail >= 0xA1:
            # Upper second-byte range is placed 63 slots into the row.
            return row_start + trail - 0xA1 + 63
        return row_start + trail - 0x40
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
188
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
189
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
class SJISDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser for Shift_JIS, using the JIS frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = JISCharToFreqOrder
        self._mTableSize = JIS_TABLE_SIZE
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        """Return the table index of a two-byte Shift_JIS character, or -1.

        Bytes of interest:
          first byte range:  0x81 -- 0x9f , 0xe0 -- 0xfe
          second byte range: 0x40 -- 0x7e,  0x81 -- 0xfe
        The bytes are not validated here; the state machine has already
        done that.  Lead bytes outside 0x81-0x9f and 0xe0-0xef yield -1.
        """
        first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
        if 0x81 <= first_char <= 0x9F:
            order = 188 * (first_char - 0x81)
        elif 0xE0 <= first_char <= 0xEF:
            # The second lead-byte range continues after the 31 rows of the
            # first range.
            order = 188 * (first_char - 0xE0 + 31)
        else:
            return -1
        order = order + second_char - 0x40
        if second_char > 0x7F:
            # Second bytes above 0x7f sit one slot lower because of the gap
            # in the second-byte range.  The Mozilla reference
            # implementation decrements here (``order--``); the previous
            # ``order = -1`` discarded every such character.
            order -= 1
        return order
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
213
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
214
d67268158946 planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff changeset
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser for EUC-JP, using the JIS frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
        self._mTableSize = JIS_TABLE_SIZE
        self._mCharToFreqOrder = JISCharToFreqOrder

    def get_order(self, aBuf):
        """Return the table index of a two-byte EUC-JP character, or -1.

        Bytes of interest:
          first byte range:  0xa0 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        The bytes are not validated here; the state machine has already
        done that.
        """
        lead = wrap_ord(aBuf[0])
        if lead < 0xA0:
            return -1
        # NOTE(review): the guard admits lead byte 0xa0 while the row offset
        # is measured from 0xa1; kept exactly as in the original.
        trail = wrap_ord(aBuf[1])
        return 94 * (lead - 0xA1) + trail - 0xA1