|
160
|
1 /* Copyright libuv contributors. All rights reserved.
|
|
|
2 *
|
|
|
3 * Permission to use, copy, modify, and/or distribute this software for any
|
|
|
4 * purpose with or without fee is hereby granted, provided that the above
|
|
|
5 * copyright notice and this permission notice appear in all copies.
|
|
|
6 *
|
|
|
7 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
|
8 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
|
9 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
|
10 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
|
11 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
|
12 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
|
13 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
|
14 */
|
|
|
15
|
|
|
16 /* Derived from https://github.com/bnoordhuis/punycode
|
|
|
17 * but updated to support IDNA 2008.
|
|
|
18 */
|
|
|
19
|
|
|
20 #include "uv.h"
|
|
|
21 #include "uv-common.h"
|
|
|
22 #include "idna.h"
|
|
|
23 #include <assert.h>
|
|
|
24 #include <string.h>
|
|
|
25 #include <limits.h> /* UINT_MAX */
|
|
|
26
|
|
|
27
|
|
|
/* Decode one WTF-8 encoded code point from a NUL-terminated string.
 *
 * |*input| is only advanced while trail bytes are being examined: on return
 * it points at the last byte that was looked at (for an ASCII byte or a
 * rejected lead byte that is the byte it started on). The caller is
 * responsible for stepping past that byte.
 *
 * Returns the code point, or -1 when the bytes do not form an acceptable
 * sequence. Three-byte sequences are returned without a surrogate-range
 * check, so encoded surrogates are passed through.
 */
static int32_t uv__wtf8_decode1(const char** input) {
  uint32_t cp;
  uint8_t lead;
  uint8_t trail;

  lead = (uint8_t) **input;

  if (lead < 0x80)
    return lead;  /* Plain ASCII. */

  if (lead < 0xC2)
    return -1;  /* Continuation byte or 0xC0/0xC1 in lead position. */

  /* Second byte. */
  *input += 1;
  trail = (uint8_t) **input;
  if ((trail & 0xC0) != 0x80)
    return -1;  /* Not a continuation byte. */
  cp = ((uint32_t) lead << 6) | (trail & 0x3F);
  if (lead < 0xE0)
    return cp & 0x7FF;  /* Two-byte sequence. */

  /* Third byte. */
  *input += 1;
  trail = (uint8_t) **input;
  if ((trail & 0xC0) != 0x80)
    return -1;  /* Not a continuation byte. */
  cp = (cp << 6) | (trail & 0x3F);
  if (lead < 0xF0)
    return cp & 0xFFFF;  /* Three-byte sequence. */

  /* Fourth byte. */
  *input += 1;
  trail = (uint8_t) **input;
  if ((trail & 0xC0) != 0x80)
    return -1;  /* Not a continuation byte. */
  cp = (cp << 6) | (trail & 0x3F);

  if (lead > 0xF4)
    return -1;  /* Lead byte can only encode values above U+10FFFF. */

  cp &= 0x1FFFFF;
  if (cp > 0x10FFFF)
    return -1;  /* Code point too large. */

  return cp;  /* Four-byte sequence. */
}
|
|
|
69
|
|
|
70
|
|
|
/* Decode the rest of a multi-byte UTF-8 sequence.
 *
 * |a| is the lead byte, which the caller has already consumed; |*p| points
 * at the first trail byte and |pe| is one past the end of the input.
 *
 * Returns the decoded code point, or (unsigned) -1 when the sequence is
 * invalid: bad lead byte, truncated input, a trail byte that is not of the
 * form 10xxxxxx, an overlong encoding, a value above U+10FFFF or a
 * surrogate. On truncation |*p| is left untouched; on a bad trail byte it
 * points just past the offending byte; on success it points past the
 * sequence.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned min;
  unsigned b;
  int need;
  int i;

  if (a > 0xF7)
    return -1;  /* 0xF8..0xFF never start a sequence. */

  /* The lead byte alone determines how many trail bytes must follow. */
  if (a > 0xEF) {
    need = 3;
    min = 0x10000;
    a &= 7;
  } else if (a > 0xDF) {
    need = 2;
    min = 0x800;
    a &= 15;
  } else if (a > 0xBF) {
    need = 1;
    min = 0x80;
    a &= 31;
  } else {
    return -1;  /* Invalid continuation byte. */
  }

  /* A truncated sequence must not be reinterpreted as a shorter one. */
  if (pe - *p < need)
    return -1;

  for (i = 0; i < need; i++) {
    b = (unsigned char) *(*p)++;

    /* Every trail byte is validated on its own; combining them first
     * lets invalid bytes cancel each other out.
     */
    if ((b & 0xC0) != 0x80)
      return -1;  /* Invalid sequence. */

    a = (a << 6) | (b & 63);
  }

  if (a < min)
    return -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return -1;  /* Surrogate pair. */

  return a;
}
|
|
|
136
|
|
|
137
|
|
|
/* Decode one UTF-8 character from [*p, pe) and advance |*p|.
 *
 * The range must not be empty. ASCII bytes are returned directly; anything
 * else is handed, together with the already-consumed lead byte, to
 * uv__utf8_decode1_slow(), whose result is returned unchanged.
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  unsigned lead;

  assert(*p < pe);

  lead = (unsigned char) **p;
  *p += 1;

  /* ASCII is the common case and needs no further work. */
  return lead < 128 ? lead : uv__utf8_decode1_slow(p, pe, lead);
}
|
|
|
150
|
|
|
151
|
|
|
152 static int uv__idna_toascii_label(const char* s, const char* se,
|
|
|
153 char** d, char* de) {
|
|
|
154 static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
|
|
|
155 const char* ss;
|
|
|
156 unsigned c;
|
|
|
157 unsigned h;
|
|
|
158 unsigned k;
|
|
|
159 unsigned n;
|
|
|
160 unsigned m;
|
|
|
161 unsigned q;
|
|
|
162 unsigned t;
|
|
|
163 unsigned x;
|
|
|
164 unsigned y;
|
|
|
165 unsigned bias;
|
|
|
166 unsigned delta;
|
|
|
167 unsigned todo;
|
|
|
168 int first;
|
|
|
169
|
|
|
170 h = 0;
|
|
|
171 ss = s;
|
|
|
172 todo = 0;
|
|
|
173
|
|
|
174 /* Note: after this loop we've visited all UTF-8 characters and know
|
|
|
175 * they're legal so we no longer need to check for decode errors.
|
|
|
176 */
|
|
|
177 while (s < se) {
|
|
|
178 c = uv__utf8_decode1(&s, se);
|
|
|
179
|
|
|
180 if (c == UINT_MAX)
|
|
|
181 return UV_EINVAL;
|
|
|
182
|
|
|
183 if (c < 128)
|
|
|
184 h++;
|
|
|
185 else
|
|
|
186 todo++;
|
|
|
187 }
|
|
|
188
|
|
|
189 /* Only write "xn--" when there are non-ASCII characters. */
|
|
|
190 if (todo > 0) {
|
|
|
191 if (*d < de) *(*d)++ = 'x';
|
|
|
192 if (*d < de) *(*d)++ = 'n';
|
|
|
193 if (*d < de) *(*d)++ = '-';
|
|
|
194 if (*d < de) *(*d)++ = '-';
|
|
|
195 }
|
|
|
196
|
|
|
197 /* Write ASCII characters. */
|
|
|
198 x = 0;
|
|
|
199 s = ss;
|
|
|
200 while (s < se) {
|
|
|
201 c = uv__utf8_decode1(&s, se);
|
|
|
202 assert(c != UINT_MAX);
|
|
|
203
|
|
|
204 if (c > 127)
|
|
|
205 continue;
|
|
|
206
|
|
|
207 if (*d < de)
|
|
|
208 *(*d)++ = c;
|
|
|
209
|
|
|
210 if (++x == h)
|
|
|
211 break; /* Visited all ASCII characters. */
|
|
|
212 }
|
|
|
213
|
|
|
214 if (todo == 0)
|
|
|
215 return h;
|
|
|
216
|
|
|
217 /* Only write separator when we've written ASCII characters first. */
|
|
|
218 if (h > 0)
|
|
|
219 if (*d < de)
|
|
|
220 *(*d)++ = '-';
|
|
|
221
|
|
|
222 n = 128;
|
|
|
223 bias = 72;
|
|
|
224 delta = 0;
|
|
|
225 first = 1;
|
|
|
226
|
|
|
227 while (todo > 0) {
|
|
|
228 m = -1;
|
|
|
229 s = ss;
|
|
|
230
|
|
|
231 while (s < se) {
|
|
|
232 c = uv__utf8_decode1(&s, se);
|
|
|
233 assert(c != UINT_MAX);
|
|
|
234
|
|
|
235 if (c >= n)
|
|
|
236 if (c < m)
|
|
|
237 m = c;
|
|
|
238 }
|
|
|
239
|
|
|
240 x = m - n;
|
|
|
241 y = h + 1;
|
|
|
242
|
|
|
243 if (x > ~delta / y)
|
|
|
244 return UV_E2BIG; /* Overflow. */
|
|
|
245
|
|
|
246 delta += x * y;
|
|
|
247 n = m;
|
|
|
248
|
|
|
249 s = ss;
|
|
|
250 while (s < se) {
|
|
|
251 c = uv__utf8_decode1(&s, se);
|
|
|
252 assert(c != UINT_MAX);
|
|
|
253
|
|
|
254 if (c < n)
|
|
|
255 if (++delta == 0)
|
|
|
256 return UV_E2BIG; /* Overflow. */
|
|
|
257
|
|
|
258 if (c != n)
|
|
|
259 continue;
|
|
|
260
|
|
|
261 for (k = 36, q = delta; /* empty */; k += 36) {
|
|
|
262 t = 1;
|
|
|
263
|
|
|
264 if (k > bias)
|
|
|
265 t = k - bias;
|
|
|
266
|
|
|
267 if (t > 26)
|
|
|
268 t = 26;
|
|
|
269
|
|
|
270 if (q < t)
|
|
|
271 break;
|
|
|
272
|
|
|
273 /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore
|
|
|
274 * 10 <= y <= 35, we can optimize the long division
|
|
|
275 * into a table-based reciprocal multiplication.
|
|
|
276 */
|
|
|
277 x = q - t;
|
|
|
278 y = 36 - t; /* 10 <= y <= 35 since 1 <= t <= 26. */
|
|
|
279 q = x / y;
|
|
|
280 t = t + x % y; /* 1 <= t <= 35 because of y. */
|
|
|
281
|
|
|
282 if (*d < de)
|
|
|
283 *(*d)++ = alphabet[t];
|
|
|
284 }
|
|
|
285
|
|
|
286 if (*d < de)
|
|
|
287 *(*d)++ = alphabet[q];
|
|
|
288
|
|
|
289 delta /= 2;
|
|
|
290
|
|
|
291 if (first) {
|
|
|
292 delta /= 350;
|
|
|
293 first = 0;
|
|
|
294 }
|
|
|
295
|
|
|
296 /* No overflow check is needed because |delta| was just
|
|
|
297 * divided by 2 and |delta+delta >= delta + delta/h|.
|
|
|
298 */
|
|
|
299 h++;
|
|
|
300 delta += delta / h;
|
|
|
301
|
|
|
302 for (bias = 0; delta > 35 * 26 / 2; bias += 36)
|
|
|
303 delta /= 35;
|
|
|
304
|
|
|
305 bias += 36 * delta / (delta + 38);
|
|
|
306 delta = 0;
|
|
|
307 todo--;
|
|
|
308 }
|
|
|
309
|
|
|
310 delta++;
|
|
|
311 n++;
|
|
|
312 }
|
|
|
313
|
|
|
314 return 0;
|
|
|
315 }
|
|
|
316
|
|
|
317
|
|
|
318 ssize_t uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
|
|
|
319 const char* si;
|
|
|
320 const char* st;
|
|
|
321 unsigned c;
|
|
|
322 char* ds;
|
|
|
323 int rc;
|
|
|
324
|
|
|
325 if (s == se)
|
|
|
326 return UV_EINVAL;
|
|
|
327
|
|
|
328 ds = d;
|
|
|
329
|
|
|
330 si = s;
|
|
|
331 while (si < se) {
|
|
|
332 st = si;
|
|
|
333 c = uv__utf8_decode1(&si, se);
|
|
|
334
|
|
|
335 if (c == UINT_MAX)
|
|
|
336 return UV_EINVAL;
|
|
|
337
|
|
|
338 if (c != '.')
|
|
|
339 if (c != 0x3002) /* 。 */
|
|
|
340 if (c != 0xFF0E) /* . */
|
|
|
341 if (c != 0xFF61) /* 。 */
|
|
|
342 continue;
|
|
|
343
|
|
|
344 rc = uv__idna_toascii_label(s, st, &d, de);
|
|
|
345
|
|
|
346 if (rc < 0)
|
|
|
347 return rc;
|
|
|
348
|
|
|
349 if (d < de)
|
|
|
350 *d++ = '.';
|
|
|
351
|
|
|
352 s = si;
|
|
|
353 }
|
|
|
354
|
|
|
355 if (s < se) {
|
|
|
356 rc = uv__idna_toascii_label(s, se, &d, de);
|
|
|
357
|
|
|
358 if (rc < 0)
|
|
|
359 return rc;
|
|
|
360 }
|
|
|
361
|
|
|
362 if (d >= de)
|
|
|
363 return UV_EINVAL;
|
|
|
364
|
|
|
365 *d++ = '\0';
|
|
|
366 return d - ds; /* Number of bytes written. */
|
|
|
367 }
|
|
|
368
|
|
|
369
|
|
|
/* Count the UTF-16 code units needed to represent the NUL-terminated WTF-8
 * string |source_ptr|. The terminating NUL is decoded like any other
 * character and is therefore included in the count.
 *
 * Returns -1 when the decoder rejects the input.
 */
ssize_t uv_wtf8_length_as_utf16(const char* source_ptr) {
  size_t units;
  int32_t cp;
  char last;

  units = 0;

  for (;;) {
    cp = uv__wtf8_decode1(&source_ptr);
    if (cp < 0)
      return -1;

    /* Code points above the BMP take a surrogate pair. */
    units += (cp > 0xFFFF) ? 2 : 1;

    /* The decoder stops on the last byte it examined; step past it and
     * finish once that byte was the terminator.
     */
    last = *source_ptr;
    source_ptr++;
    if (last == '\0')
      break;
  }

  return units;
}
|
|
|
385
|
|
|
386
|
|
|
/* Convert the NUL-terminated WTF-8 string |source_ptr| to UTF-16, storing
 * the code units (including the terminating NUL) in |w_target|.
 *
 * |w_target_len| must be exactly the number of code units the conversion
 * produces, i.e. the value uv_wtf8_length_as_utf16() returned for the same
 * input. The input must already have been validated by that function;
 * violations are only detected by assertions.
 */
void uv_wtf8_to_utf16(const char* source_ptr,
                      uint16_t* w_target,
                      size_t w_target_len) {
  int32_t code_point;

  do {
    code_point = uv__wtf8_decode1(&source_ptr);
    /* uv_wtf8_length_as_utf16 should have been called and checked first. */
    assert(code_point >= 0);
    if (code_point > 0xFFFF) {
      /* U+10FFFF is the last valid code point and the decoder accepts it,
       * so the bound has to be inclusive.
       */
      assert(code_point <= 0x10FFFF);
      /* Catch an undersized buffer before writing past its end. */
      assert(w_target_len >= 2);
      *w_target++ = (((code_point - 0x10000) >> 10) + 0xD800);
      *w_target++ = ((code_point - 0x10000) & 0x3FF) + 0xDC00;
      w_target_len -= 2;
    } else {
      assert(w_target_len >= 1);
      *w_target++ = code_point;
      w_target_len -= 1;
    }
  } while (*source_ptr++);  /* Stop once the terminator has been copied. */

  (void)w_target_len;  /* Only read by the assertions. */
  assert(w_target_len == 0);
}
|
|
|
410
|
|
|
411
|
|
|
/* Return the code point that starts at |w_source_ptr[0]|.
 *
 * When the first unit is a high surrogate (0xD800..0xDBFF), |w_source_len|
 * is not 1, and the following unit is a low surrogate (0xDC00..0xDFFF),
 * the pair is combined into a supplementary code point. In every other
 * case the first unit is returned unchanged. The second unit is only read
 * in the high-surrogate case with |w_source_len| != 1.
 */
static int32_t uv__get_surrogate_value(const uint16_t* w_source_ptr,
                                       ssize_t w_source_len) {
  uint16_t lead;
  uint16_t trail;

  lead = w_source_ptr[0];

  if (lead < 0xD800 || lead > 0xDBFF)
    return lead;  /* Not a high surrogate. */

  if (w_source_len == 1)
    return lead;  /* No unit left that could complete the pair. */

  trail = w_source_ptr[1];
  if (trail < 0xDC00 || trail > 0xDFFF)
    return lead;  /* Unpaired high surrogate. */

  return 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00);
}
|
|
|
425
|
|
|
426
|
|
|
/* Compute how many bytes the UTF-16 input occupies when encoded as WTF-8.
 *
 * A positive |w_source_len| is the number of code units to examine. A
 * negative |w_source_len| means the input ends at the first NUL unit,
 * which is not counted.
 */
size_t uv_utf16_length_as_wtf8(const uint16_t* w_source_ptr,
                               ssize_t w_source_len) {
  size_t total;
  int32_t cp;
  int units;  /* Code units consumed by the current character. */

  total = 0;

  while (w_source_len != 0) {
    cp = uv__get_surrogate_value(w_source_ptr, w_source_len);
    /* Can be invalid UTF-8 but must be valid WTF-8. */
    assert(cp >= 0);

    if (w_source_len < 0 && cp == 0)
      break;  /* Terminator of NUL-terminated input. */

    units = 1;
    if (cp < 0x80) {
      total += 1;
    } else if (cp < 0x800) {
      total += 2;
    } else if (cp < 0x10000) {
      total += 3;
    } else {
      total += 4;
      units = 2;  /* A surrogate pair was combined. */
    }

    w_source_ptr += units;
    if (w_source_len > 0)
      w_source_len -= units;
  }

  return total;
}
|
|
|
458
|
|
|
459
|
|
|
460 int uv_utf16_to_wtf8(const uint16_t* w_source_ptr,
|
|
|
461 ssize_t w_source_len,
|
|
|
462 char** target_ptr,
|
|
|
463 size_t* target_len_ptr) {
|
|
|
464 size_t target_len;
|
|
|
465 char* target;
|
|
|
466 char* target_end;
|
|
|
467 int32_t code_point;
|
|
|
468
|
|
|
469 /* If *target_ptr is provided, then *target_len_ptr must be its length
|
|
|
470 * (excluding space for NUL), otherwise we will compute the target_len_ptr
|
|
|
471 * length and may return a new allocation in *target_ptr if target_ptr is
|
|
|
472 * provided. */
|
|
|
473 if (target_ptr == NULL || *target_ptr == NULL) {
|
|
|
474 target_len = uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);
|
|
|
475 if (target_len_ptr != NULL)
|
|
|
476 *target_len_ptr = target_len;
|
|
|
477 } else {
|
|
|
478 target_len = *target_len_ptr;
|
|
|
479 }
|
|
|
480
|
|
|
481 if (target_ptr == NULL)
|
|
|
482 return 0;
|
|
|
483
|
|
|
484 if (*target_ptr == NULL) {
|
|
|
485 target = uv__malloc(target_len + 1);
|
|
|
486 if (target == NULL) {
|
|
|
487 return UV_ENOMEM;
|
|
|
488 }
|
|
|
489 *target_ptr = target;
|
|
|
490 } else {
|
|
|
491 target = *target_ptr;
|
|
|
492 }
|
|
|
493
|
|
|
494 target_end = target + target_len;
|
|
|
495
|
|
|
496 while (target != target_end && w_source_len) {
|
|
|
497 code_point = uv__get_surrogate_value(w_source_ptr, w_source_len);
|
|
|
498 /* Can be invalid UTF-8 but must be valid WTF-8. */
|
|
|
499 assert(code_point >= 0);
|
|
|
500 if (w_source_len < 0 && code_point == 0) {
|
|
|
501 w_source_len = 0;
|
|
|
502 break;
|
|
|
503 }
|
|
|
504 if (code_point < 0x80) {
|
|
|
505 *target++ = code_point;
|
|
|
506 } else if (code_point < 0x800) {
|
|
|
507 *target++ = 0xC0 | (code_point >> 6);
|
|
|
508 if (target == target_end)
|
|
|
509 break;
|
|
|
510 *target++ = 0x80 | (code_point & 0x3F);
|
|
|
511 } else if (code_point < 0x10000) {
|
|
|
512 *target++ = 0xE0 | (code_point >> 12);
|
|
|
513 if (target == target_end)
|
|
|
514 break;
|
|
|
515 *target++ = 0x80 | ((code_point >> 6) & 0x3F);
|
|
|
516 if (target == target_end)
|
|
|
517 break;
|
|
|
518 *target++ = 0x80 | (code_point & 0x3F);
|
|
|
519 } else {
|
|
|
520 *target++ = 0xF0 | (code_point >> 18);
|
|
|
521 if (target == target_end)
|
|
|
522 break;
|
|
|
523 *target++ = 0x80 | ((code_point >> 12) & 0x3F);
|
|
|
524 if (target == target_end)
|
|
|
525 break;
|
|
|
526 *target++ = 0x80 | ((code_point >> 6) & 0x3F);
|
|
|
527 if (target == target_end)
|
|
|
528 break;
|
|
|
529 *target++ = 0x80 | (code_point & 0x3F);
|
|
|
530 /* uv__get_surrogate_value consumed 2 input characters */
|
|
|
531 w_source_ptr++;
|
|
|
532 if (w_source_len > 0)
|
|
|
533 w_source_len--;
|
|
|
534 }
|
|
|
535 target_len = target - *target_ptr;
|
|
|
536 w_source_ptr++;
|
|
|
537 if (w_source_len > 0)
|
|
|
538 w_source_len--;
|
|
|
539 }
|
|
|
540
|
|
|
541 if (target != target_end && target_len_ptr != NULL)
|
|
|
542 /* Did not fill all of the provided buffer, so update the target_len_ptr
|
|
|
543 * output with the space used. */
|
|
|
544 *target_len_ptr = target - *target_ptr;
|
|
|
545
|
|
|
546 /* Check if input fit into target exactly. */
|
|
|
547 if (w_source_len < 0 && target == target_end && w_source_ptr[0] == 0)
|
|
|
548 w_source_len = 0;
|
|
|
549
|
|
|
550 *target++ = '\0';
|
|
|
551
|
|
|
552 /* Characters remained after filling the buffer, compute the remaining length now. */
|
|
|
553 if (w_source_len) {
|
|
|
554 if (target_len_ptr != NULL)
|
|
|
555 *target_len_ptr = target_len + uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);
|
|
|
556 return UV_ENOBUFS;
|
|
|
557 }
|
|
|
558
|
|
|
559 return 0;
|
|
|
560 }
|