Mercurial > repos > yating-l > jbrowsearchivecreator
comparison test/lib/python2.7/encodings/idna.py @ 3:7d1a9a91b989 draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit d583ac16a6c6942730ea536eb59cc37941816030-dirty
| author | yating-l |
|---|---|
| date | Thu, 18 May 2017 18:37:28 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:3e2160197902 | 3:7d1a9a91b989 |
|---|---|
| 1 # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) | |
| 2 | |
| 3 import stringprep, re, codecs | |
| 4 from unicodedata import ucd_3_2_0 as unicodedata | |
| 5 | |
# IDNA section 3.1
# Pattern matching any one of the code points accepted as a label
# separator: U+002E, U+3002, U+FF0E and U+FF61.
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
# Prefix marking a Punycode-encoded (ACE) label, kept both as a byte
# string and as a unicode string so either label type can be tested
# with startswith().
ace_prefix = "xn--"
uace_prefix = unicode(ace_prefix, "ascii")
| 12 | |
| 13 # This assumes query strings, so AllowUnassigned is true | |
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) of stringprep to one label.

    Unassigned code points are not rejected: this assumes query strings,
    so AllowUnassigned is true.

    Returns the mapped, NFKC-normalized label.  Raises UnicodeError if the
    label contains a prohibited character or violates the bidirectional
    requirements.
    """
    # Map: drop characters that map to nothing (table B.1) and apply the
    # table B.2 mapping to everything that is kept.
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi.  RandAL is built as a real list because it is indexed
    # below.  The tests are run once for the whole label: repeating the
    # full-label scans for every RandALCat character made long
    # right-to-left labels take quadratic time.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
| 61 | |
def ToASCII(label):
    """Convert one label to its ASCII (ACE) form, following the IDNA steps.

    A label that is already ASCII is returned as an ASCII byte string.
    Otherwise it is nameprepped and, if still not ASCII, Punycode-encoded
    and given the ACE prefix.

    Raises UnicodeError if the prepared label already starts with the ACE
    prefix, or if the result is empty or 64 or more characters long.
    """
    # Step 1: try ASCII
    try:
        candidate = label.encode("ascii")
    except UnicodeError:
        candidate = None

    if candidate is None:
        # Step 2: nameprep
        prepared = nameprep(label)

        # Step 3: UseSTD3ASCIIRules is false
        # Step 4: try ASCII again on the prepared label
        try:
            candidate = prepared.encode("ascii")
        except UnicodeError:
            candidate = None

        if candidate is None:
            # Step 5: Check ACE prefix
            if prepared.startswith(uace_prefix):
                raise UnicodeError("Label starts with ACE prefix")

            # Steps 6 and 7: Encode with PUNYCODE, prepend ACE prefix
            candidate = ace_prefix + prepared.encode("punycode")

    # Step 8: Check size (every path above ends here)
    if not 0 < len(candidate) < 64:
        raise UnicodeError("label empty or too long")
    return candidate
| 104 | |
def ToUnicode(label):
    """Convert one label to unicode, decoding it if it carries the ACE prefix.

    Labels without the ACE prefix are returned as unicode unchanged apart
    from nameprep.  Labels with the prefix are Punycode-decoded and then
    checked by re-encoding with ToASCII.

    Raises UnicodeError if the label is not ASCII after nameprep or if the
    decoded label does not encode back to the lower-cased input.
    """
    # Step 1: Check for ASCII
    needs_nameprep = False
    if not isinstance(label, str):
        try:
            label = label.encode("ascii")
        except UnicodeError:
            needs_nameprep = True

    if needs_nameprep:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if label.startswith(ace_prefix):
        # Steps 4 and 5: Remove ACE prefix, decode using PUNYCODE
        decoded = label[len(ace_prefix):].decode("punycode")

        # Step 6: Apply ToASCII
        round_trip = ToASCII(decoded)

        # Step 7: Compare the result of step 6 with the one of step 3
        # round_trip will already be in lower case.
        if label.lower() != round_trip:
            raise UnicodeError("IDNA does not round-trip", label, round_trip)

        # Step 8: return the result of step 5
        return decoded

    return unicode(label, "ascii")
| 143 | |
| 144 ### Codec APIs | |
| 145 | |
class Codec(codecs.Codec):
    """Stateless IDNA codec working on whole, dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode each label of *input* with ToASCII.

        Returns a tuple of the labels joined with "." and len(input).
        A trailing separator in the input is kept as a trailing ".".
        Raises UnicodeError for any *errors* value other than 'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        parts = dots.split(input)
        suffix = ''
        if parts and not parts[-1]:
            # An empty last piece means the input ended with a separator
            suffix = '.'
            parts.pop()

        encoded = [ToASCII(part) for part in parts]
        # Join with U+002E
        return ".".join(encoded) + suffix, len(input)

    def decode(self, input, errors='strict'):
        """Decode each label of *input* with ToUnicode.

        Returns a tuple of the labels joined with u"." and len(input).
        A trailing separator in the input is kept as a trailing u".".
        Raises UnicodeError for any *errors* value other than 'strict'.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            parts = dots.split(input)
        else:
            # Must be ASCII string; the unicode() call is made only for
            # the error it raises on non-ASCII data
            input = str(input)
            unicode(input, "ascii")
            parts = input.split(".")

        suffix = u''
        if parts and not parts[-1]:
            suffix = u'.'
            parts.pop()

        decoded = [ToUnicode(part) for part in parts]
        return u".".join(decoded) + suffix, len(input)
| 196 | |
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that only emits completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels found in *input*.

        Returns a tuple (encoded, size): the ToASCII form of each complete
        label joined with ".", and the count of input characters consumed.
        Unless *final* is true, a last label not followed by a separator
        is left unconsumed so that a later call can complete it.
        Raises UnicodeError for any *errors* value other than 'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return ("", 0)

        labels = dots.split(input)
        # Start from a byte string: a unicode '' here would promote the
        # joined result to unicode whenever no dot is appended, while
        # Codec.encode and the empty-input case above return byte strings.
        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            if size:
                # Account for the separator preceding this label
                size += 1
            size += len(label)

        # Join with U+002E
        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
| 230 | |
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that only emits completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels found in *input*.

        Returns a tuple (decoded, size): the ToUnicode form of each
        complete label joined with u".", and the consumed-input count.
        Unless *final* is true, a last label not followed by a separator
        is left unconsumed so that a later call can complete it.
        Raises UnicodeError for any *errors* value other than 'strict'.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return (u"", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            pieces = dots.split(input)
        else:
            # Must be ASCII string; the unicode() call is made only for
            # the error it raises on non-ASCII data
            input = str(input)
            unicode(input, "ascii")
            pieces = input.split(".")

        suffix = u''
        if pieces:
            if not pieces[-1]:
                # Input ended with a separator
                suffix = u'.'
                pieces.pop()
            elif not final:
                # Keep potentially unfinished label until the next call
                pieces.pop()
                if pieces:
                    suffix = u'.'

        decoded = []
        consumed = 0
        for piece in pieces:
            decoded.append(ToUnicode(piece))
            if consumed:
                # Account for the separator preceding this label
                consumed += 1
            consumed += len(piece)

        consumed += len(suffix)
        return (u".".join(decoded) + suffix, consumed)
| 270 | |
class StreamWriter(Codec,codecs.StreamWriter):
    # Combines this module's Codec with codecs.StreamWriter; nothing is
    # added or overridden here.
    pass
| 273 | |
class StreamReader(Codec,codecs.StreamReader):
    # Combines this module's Codec with codecs.StreamReader; nothing is
    # added or overridden here.
    pass
| 276 | |
| 277 ### encodings module API | |
| 278 | |
def getregentry():
    """Return the codecs.CodecInfo describing the 'idna' codec."""
    # Encoder and decoder are bound methods of two separate Codec instances
    encoder = Codec().encode
    decoder = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=encoder,
        decode=decoder,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
