Mercurial > repos > yating-l > jbrowsearchivecreator
comparison test/lib/python2.7/encodings/idna.py @ 3:7d1a9a91b989 draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit d583ac16a6c6942730ea536eb59cc37941816030-dirty
author | yating-l |
---|---|
date | Thu, 18 May 2017 18:37:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:3e2160197902 | 3:7d1a9a91b989 |
---|---|
1 # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) | |
2 | |
3 import stringprep, re, codecs | |
4 from unicodedata import ucd_3_2_0 as unicodedata | |
5 | |
# IDNA section 3.1: every character that must be recognized as a label
# separator (full stop, ideographic full stop, fullwidth full stop,
# halfwidth ideographic full stop).
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5: the ACE prefix, once as a byte string and once as a
# unicode string.  The two literals must always spell the same prefix.
ace_prefix = "xn--"
uace_prefix = u"xn--"
12 | |
13 # This assumes query strings, so AllowUnassigned is true | |
def nameprep(label):
    """Prepare one label according to Nameprep (RFC 3491).

    The label is mapped with stringprep tables B.1 and B.2, normalized
    with NFKC, checked against the prohibited-character tables and
    finally checked against the bidirectional-text requirements.
    Unassigned code points are not rejected.

    Args:
        label: the unicode label to prepare.

    Returns:
        The mapped and normalized label.

    Raises:
        UnicodeError: if the label contains a prohibited character or
            violates one of the bidi requirements.
    """
    # Map: characters in table B.1 are mapped to nothing, everything
    # else goes through the table B.2 mapping.
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(c) for c in label]
    # The requirements below concern the label as a whole, so they are
    # verified once if any RandALCat character is present.  Repeating
    # them for every such character made the check quadratic in the
    # label length without changing its outcome.
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        for c in label:
            if stringprep.in_table_d2(c):
                raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
61 | |
def ToASCII(label):
    """Convert a single label to its ASCII form (RFC 3490 ToASCII).

    UseSTD3ASCIIRules is treated as false.

    Args:
        label: the label to convert.

    Returns:
        The ASCII-encoded label; labels that are not ASCII even after
        nameprep are punycode-encoded and carry the ACE prefix.

    Raises:
        UnicodeError: if the resulting label is empty or longer than
            63 characters, if the prepared label already starts with
            the ACE prefix, or if nameprep rejects the label.
    """
    size_error = "label empty or too long"

    # Step 1: try ASCII
    try:
        plain = label.encode("ascii")
    except UnicodeError:
        plain = None
    if plain is not None:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if not 0 < len(plain) < 64:
            raise UnicodeError(size_error)
        return plain

    # Step 2: nameprep
    prepared = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        plain = prepared.encode("ascii")
    except UnicodeError:
        plain = None
    if plain is not None:
        # Skip to step 8.
        if not 0 < len(plain) < 64:
            raise UnicodeError(size_error)
        return plain

    # Step 5: Check ACE prefix
    if prepared.startswith(uace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    ace_label = ace_prefix + prepared.encode("punycode")

    # Step 8: Check size
    if not 0 < len(ace_label) < 64:
        raise UnicodeError(size_error)
    return ace_label
104 | |
def ToUnicode(label):
    """Convert a single label to unicode (RFC 3490 ToUnicode).

    Args:
        label: the label to convert, as a byte string or unicode string.

    Returns:
        The label as a unicode string; a label carrying the ACE prefix
        is punycode-decoded.

    Raises:
        UnicodeError: if the label is still not ASCII after nameprep, or
            if re-encoding the decoded label with ToASCII does not give
            back the lower-cased input.
    """
    # Step 1: Check for ASCII
    pure_ascii = isinstance(label, str)
    if not pure_ascii:
        try:
            label = label.encode("ascii")
        except UnicodeError:
            pass
        else:
            pure_ascii = True

    if not pure_ascii:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    # Step 5: Decode using PUNYCODE
    decoded = label[len(ace_prefix):].decode("punycode")

    # Step 6: Apply ToASCII
    reencoded = ToASCII(decoded)

    # Step 7: Compare the result of step 6 with the one of step 3
    # reencoded will already be in lower case.
    if label.lower() != reencoded:
        raise UnicodeError("IDNA does not round-trip", label, reencoded)

    # Step 8: return the result of step 5
    return decoded
143 | |
144 ### Codec APIs | |
145 | |
class Codec(codecs.Codec):
    """IDNA codec that converts complete domain names label by label."""

    def encode(self, input, errors='strict'):
        """Encode a domain name by applying ToASCII to each label.

        Returns a tuple of the encoded name and len(input).  Raises
        UnicodeError for any error handler other than 'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        trailing_dot = ''
        if labels and not labels[-1]:
            # The name ended with a separator: remember it and drop the
            # empty label that the split produced.
            trailing_dot = '.'
            labels.pop()

        encoded = [ToASCII(label) for label in labels]
        # Join with U+002E
        return ".".join(encoded)+trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode a domain name by applying ToUnicode to each label.

        Returns a tuple of the decoded name and len(input).  Raises
        UnicodeError for any error handler other than 'strict'.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Conversion is done only for validation; the result is unused.
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels and not labels[-1]:
            trailing_dot = u'.'
            labels.pop()

        decoded = [ToUnicode(label) for label in labels]
        return u".".join(decoded)+trailing_dot, len(input)
196 | |
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that emits only completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels found in input.

        Returns a tuple of the encoded text and the number of input
        characters consumed.  Unless final is true, the last label is
        not encoded because more of it may still arrive.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return ("", 0)

        labels = dots.split(input)
        trailing_dot = u''
        if labels and (not labels[-1] or not final):
            # Either the input ended with a separator (empty last label)
            # or the last label may be unfinished and is kept until the
            # next call.
            held_back = labels.pop()
            if not held_back or labels:
                trailing_dot = '.'

        encoded = []
        consumed = 0
        for label in labels:
            encoded.append(ToASCII(label))
            # Count the separator in front of every label once
            # something has already been consumed.
            consumed += len(label) + (1 if consumed else 0)

        # Join with U+002E
        encoded = ".".join(encoded) + trailing_dot
        consumed += len(trailing_dot)
        return (encoded, consumed)
230 | |
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that emits only completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels found in input.

        Returns a tuple of the decoded text and the number of input
        items consumed.  Unless final is true, the last label is not
        decoded because more of it may still arrive.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return (u"", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Conversion is done only for validation; the result is unused.
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels and (not labels[-1] or not final):
            # Either the input ended with a separator (empty last label)
            # or the last label may be unfinished and is kept until the
            # next call.
            held_back = labels.pop()
            if not held_back or labels:
                trailing_dot = u'.'

        decoded = []
        consumed = 0
        for label in labels:
            decoded.append(ToUnicode(label))
            # Count the separator in front of every label once
            # something has already been consumed.
            consumed += len(label) + (1 if consumed else 0)

        decoded = u".".join(decoded) + trailing_dot
        consumed += len(trailing_dot)
        return (decoded, consumed)
270 | |
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer that combines Codec with codecs.StreamWriter."""
273 | |
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader that combines Codec with codecs.StreamReader."""
276 | |
277 ### encodings module API | |
278 | |
def getregentry():
    """Return the codecs.CodecInfo describing the 'idna' codec."""
    entry = {
        'name': 'idna',
        'encode': Codec().encode,
        'decode': Codec().decode,
        'incrementalencoder': IncrementalEncoder,
        'incrementaldecoder': IncrementalDecoder,
        'streamwriter': StreamWriter,
        'streamreader': StreamReader,
    }
    return codecs.CodecInfo(**entry)