Mercurial
comparison third_party/libuv/src/idna.c @ 160:948de3f54cea
[ThirdParty] Added libuv
| author | June Park <parkjune1995@gmail.com> |
|---|---|
| date | Wed, 14 Jan 2026 19:39:52 -0800 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 159:05cf9467a1c3 | 160:948de3f54cea |
|---|---|
| 1 /* Copyright libuv contributors. All rights reserved. | |
| 2 * | |
| 3 * Permission to use, copy, modify, and/or distribute this software for any | |
| 4 * purpose with or without fee is hereby granted, provided that the above | |
| 5 * copyright notice and this permission notice appear in all copies. | |
| 6 * | |
| 7 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |
| 8 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
| 9 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | |
| 10 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
| 11 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
| 12 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
| 13 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
| 14 */ | |
| 15 | |
| 16 /* Derived from https://github.com/bnoordhuis/punycode | |
| 17 * but updated to support IDNA 2008. | |
| 18 */ | |
| 19 | |
| 20 #include "uv.h" | |
| 21 #include "uv-common.h" | |
| 22 #include "idna.h" | |
| 23 #include <assert.h> | |
| 24 #include <string.h> | |
| 25 #include <limits.h> /* UINT_MAX */ | |
| 26 | |
| 27 | |
/* Decode a single WTF-8 encoded code point beginning at |**input|.
 *
 * A one-byte (ASCII) code point is returned without moving |*input|.  For
 * longer sequences |*input| is stepped onto every byte that is examined, so
 * on success it rests on the last byte of the sequence (not one past it) and
 * on a continuation-byte failure it rests on the byte that was rejected.
 *
 * Returns the code point, or -1 when:
 *   - the first byte is in 0x80..0xC1,
 *   - an expected continuation byte is not of the form 10xxxxxx, or
 *   - the first byte is above 0xF4 or the four-byte value exceeds U+10FFFF.
 * No check for surrogate code points is made here.
 */
static int32_t uv__wtf8_decode1(const char** input) {
  uint32_t acc;
  uint8_t lead;
  uint8_t trail;

  lead = **input;
  if (lead < 0x80)
    return lead;  /* Plain ASCII. */
  if (lead < 0xC2)
    return -1;  /* Not a usable lead byte. */

  acc = lead;

  /* Second byte. */
  trail = *++*input;
  if ((trail & 0xC0) != 0x80)
    return -1;
  acc = (acc << 6) | (trail & 0x3F);
  if (lead < 0xE0)
    return acc & 0x7FF;  /* Two bytes: keep the 11 payload bits. */

  /* Third byte. */
  trail = *++*input;
  if ((trail & 0xC0) != 0x80)
    return -1;
  acc = (acc << 6) | (trail & 0x3F);
  if (lead < 0xF0)
    return acc & 0xFFFF;  /* Three bytes: keep the 16 payload bits. */

  /* Fourth byte. */
  trail = *++*input;
  if ((trail & 0xC0) != 0x80)
    return -1;
  acc = (acc << 6) | (trail & 0x3F);

  if (lead > 0xF4)
    return -1;  /* Lead byte can only produce values past U+10FFFF. */

  acc &= 0x1FFFFF;  /* Four bytes: keep the 21 payload bits. */
  if (acc > 0x10FFFF)
    return -1;  /* Beyond the last Unicode code point. */

  return acc;
}
| 69 | |
| 70 | |
/* Decode the remainder of a multi-byte UTF-8 sequence.
 *
 * |a| is the lead byte, already consumed by the caller; |*p| points at the
 * byte following it and |pe| is one past the end of the input.
 *
 * Returns the decoded code point, or (unsigned) -1 when the sequence is
 * invalid: a lead byte that cannot start a sequence, a sequence cut short by
 * the end of input, a byte that is not a continuation byte, an overlong
 * encoding, a value above U+10FFFF, or a surrogate code point.
 *
 * Pointer movement: nothing is consumed when the lead byte itself is
 * rejected; a truncated sequence consumes everything up to |pe|; otherwise
 * all continuation bytes required by the lead byte are consumed, even when
 * the sequence is subsequently rejected.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned b;
  unsigned c;
  unsigned d;
  unsigned min;
  int need;  /* Continuation bytes required by the lead byte. */

  if (a > 0xF7)
    return -1;  /* 0xF8..0xFF never start a sequence. */

  if (a > 0xEF)
    need = 3;
  else if (a > 0xDF)
    need = 2;
  else if (a > 0xBF)
    need = 1;
  else
    return -1;  /* Stray continuation byte. */

  /* A lead byte must never be reinterpreted as the start of a shorter
   * sequence just because the input ends early: that would turn a truncated
   * (invalid) sequence into a different, seemingly valid code point.
   */
  if (pe - *p < need) {
    *p = pe;
    return -1;  /* Truncated sequence. */
  }

  /* Normalize every form to "a, b, c, d" so the code below can handle all
   * lengths uniformly; slots not present in the input are synthesized as
   * continuation bytes carrying the lead byte's payload bits (or zero).
   */
  switch (need) {
  case 3:
    min = 0x10000;
    a = a & 7;
    b = (unsigned char) *(*p)++;
    c = (unsigned char) *(*p)++;
    d = (unsigned char) *(*p)++;
    break;
  case 2:
    min = 0x800;
    b = 0x80 | (a & 15);
    c = (unsigned char) *(*p)++;
    d = (unsigned char) *(*p)++;
    a = 0;
    break;
  default:
    min = 0x80;
    b = 0x80;
    c = 0x80 | (a & 31);
    d = (unsigned char) *(*p)++;
    a = 0;
    break;
  }

  /* Every byte must individually look like 10xxxxxx.  XOR-ing the bytes
   * together is not sufficient: two non-continuation bytes with identical
   * top bits cancel each other out.
   */
  if ((b & 0xC0) != 0x80 || (c & 0xC0) != 0x80 || (d & 0xC0) != 0x80)
    return -1;  /* Invalid sequence. */

  b &= 63;
  c &= 63;
  d &= 63;
  a = (a << 18) | (b << 12) | (c << 6) | d;

  if (a < min)
    return -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return -1;  /* Surrogate pair. */

  return a;
}
| 136 | |
| 137 | |
/* Decode one UTF-8 code point from the range [*p, pe) and advance |*p|.
 *
 * The caller must guarantee that at least one byte is available; this is
 * asserted.  A single-byte (ASCII) value is returned directly; anything else
 * is handed to uv__utf8_decode1_slow() together with the already consumed
 * lead byte, and that function's result is returned unchanged.
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  unsigned lead;

  assert(*p < pe);

  lead = (unsigned char) **p;
  *p += 1;

  /* ASCII is the common case and needs no further work. */
  return lead < 128 ? lead : uv__utf8_decode1_slow(p, pe, lead);
}
| 150 | |
| 151 | |
| 152 static int uv__idna_toascii_label(const char* s, const char* se, | |
| 153 char** d, char* de) { | |
| 154 static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789"; | |
| 155 const char* ss; | |
| 156 unsigned c; | |
| 157 unsigned h; | |
| 158 unsigned k; | |
| 159 unsigned n; | |
| 160 unsigned m; | |
| 161 unsigned q; | |
| 162 unsigned t; | |
| 163 unsigned x; | |
| 164 unsigned y; | |
| 165 unsigned bias; | |
| 166 unsigned delta; | |
| 167 unsigned todo; | |
| 168 int first; | |
| 169 | |
| 170 h = 0; | |
| 171 ss = s; | |
| 172 todo = 0; | |
| 173 | |
| 174 /* Note: after this loop we've visited all UTF-8 characters and know | |
| 175 * they're legal so we no longer need to check for decode errors. | |
| 176 */ | |
| 177 while (s < se) { | |
| 178 c = uv__utf8_decode1(&s, se); | |
| 179 | |
| 180 if (c == UINT_MAX) | |
| 181 return UV_EINVAL; | |
| 182 | |
| 183 if (c < 128) | |
| 184 h++; | |
| 185 else | |
| 186 todo++; | |
| 187 } | |
| 188 | |
| 189 /* Only write "xn--" when there are non-ASCII characters. */ | |
| 190 if (todo > 0) { | |
| 191 if (*d < de) *(*d)++ = 'x'; | |
| 192 if (*d < de) *(*d)++ = 'n'; | |
| 193 if (*d < de) *(*d)++ = '-'; | |
| 194 if (*d < de) *(*d)++ = '-'; | |
| 195 } | |
| 196 | |
| 197 /* Write ASCII characters. */ | |
| 198 x = 0; | |
| 199 s = ss; | |
| 200 while (s < se) { | |
| 201 c = uv__utf8_decode1(&s, se); | |
| 202 assert(c != UINT_MAX); | |
| 203 | |
| 204 if (c > 127) | |
| 205 continue; | |
| 206 | |
| 207 if (*d < de) | |
| 208 *(*d)++ = c; | |
| 209 | |
| 210 if (++x == h) | |
| 211 break; /* Visited all ASCII characters. */ | |
| 212 } | |
| 213 | |
| 214 if (todo == 0) | |
| 215 return h; | |
| 216 | |
| 217 /* Only write separator when we've written ASCII characters first. */ | |
| 218 if (h > 0) | |
| 219 if (*d < de) | |
| 220 *(*d)++ = '-'; | |
| 221 | |
| 222 n = 128; | |
| 223 bias = 72; | |
| 224 delta = 0; | |
| 225 first = 1; | |
| 226 | |
| 227 while (todo > 0) { | |
| 228 m = -1; | |
| 229 s = ss; | |
| 230 | |
| 231 while (s < se) { | |
| 232 c = uv__utf8_decode1(&s, se); | |
| 233 assert(c != UINT_MAX); | |
| 234 | |
| 235 if (c >= n) | |
| 236 if (c < m) | |
| 237 m = c; | |
| 238 } | |
| 239 | |
| 240 x = m - n; | |
| 241 y = h + 1; | |
| 242 | |
| 243 if (x > ~delta / y) | |
| 244 return UV_E2BIG; /* Overflow. */ | |
| 245 | |
| 246 delta += x * y; | |
| 247 n = m; | |
| 248 | |
| 249 s = ss; | |
| 250 while (s < se) { | |
| 251 c = uv__utf8_decode1(&s, se); | |
| 252 assert(c != UINT_MAX); | |
| 253 | |
| 254 if (c < n) | |
| 255 if (++delta == 0) | |
| 256 return UV_E2BIG; /* Overflow. */ | |
| 257 | |
| 258 if (c != n) | |
| 259 continue; | |
| 260 | |
| 261 for (k = 36, q = delta; /* empty */; k += 36) { | |
| 262 t = 1; | |
| 263 | |
| 264 if (k > bias) | |
| 265 t = k - bias; | |
| 266 | |
| 267 if (t > 26) | |
| 268 t = 26; | |
| 269 | |
| 270 if (q < t) | |
| 271 break; | |
| 272 | |
| 273 /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore | |
| 274 * 10 <= y <= 35, we can optimize the long division | |
| 275 * into a table-based reciprocal multiplication. | |
| 276 */ | |
| 277 x = q - t; | |
| 278 y = 36 - t; /* 10 <= y <= 35 since 1 <= t <= 26. */ | |
| 279 q = x / y; | |
| 280 t = t + x % y; /* 1 <= t <= 35 because of y. */ | |
| 281 | |
| 282 if (*d < de) | |
| 283 *(*d)++ = alphabet[t]; | |
| 284 } | |
| 285 | |
| 286 if (*d < de) | |
| 287 *(*d)++ = alphabet[q]; | |
| 288 | |
| 289 delta /= 2; | |
| 290 | |
| 291 if (first) { | |
| 292 delta /= 350; | |
| 293 first = 0; | |
| 294 } | |
| 295 | |
| 296 /* No overflow check is needed because |delta| was just | |
| 297 * divided by 2 and |delta+delta >= delta + delta/h|. | |
| 298 */ | |
| 299 h++; | |
| 300 delta += delta / h; | |
| 301 | |
| 302 for (bias = 0; delta > 35 * 26 / 2; bias += 36) | |
| 303 delta /= 35; | |
| 304 | |
| 305 bias += 36 * delta / (delta + 38); | |
| 306 delta = 0; | |
| 307 todo--; | |
| 308 } | |
| 309 | |
| 310 delta++; | |
| 311 n++; | |
| 312 } | |
| 313 | |
| 314 return 0; | |
| 315 } | |
| 316 | |
| 317 | |
| 318 ssize_t uv__idna_toascii(const char* s, const char* se, char* d, char* de) { | |
| 319 const char* si; | |
| 320 const char* st; | |
| 321 unsigned c; | |
| 322 char* ds; | |
| 323 int rc; | |
| 324 | |
| 325 if (s == se) | |
| 326 return UV_EINVAL; | |
| 327 | |
| 328 ds = d; | |
| 329 | |
| 330 si = s; | |
| 331 while (si < se) { | |
| 332 st = si; | |
| 333 c = uv__utf8_decode1(&si, se); | |
| 334 | |
| 335 if (c == UINT_MAX) | |
| 336 return UV_EINVAL; | |
| 337 | |
| 338 if (c != '.') | |
| 339 if (c != 0x3002) /* 。 */ | |
| 340 if (c != 0xFF0E) /* . */ | |
| 341 if (c != 0xFF61) /* 。 */ | |
| 342 continue; | |
| 343 | |
| 344 rc = uv__idna_toascii_label(s, st, &d, de); | |
| 345 | |
| 346 if (rc < 0) | |
| 347 return rc; | |
| 348 | |
| 349 if (d < de) | |
| 350 *d++ = '.'; | |
| 351 | |
| 352 s = si; | |
| 353 } | |
| 354 | |
| 355 if (s < se) { | |
| 356 rc = uv__idna_toascii_label(s, se, &d, de); | |
| 357 | |
| 358 if (rc < 0) | |
| 359 return rc; | |
| 360 } | |
| 361 | |
| 362 if (d >= de) | |
| 363 return UV_EINVAL; | |
| 364 | |
| 365 *d++ = '\0'; | |
| 366 return d - ds; /* Number of bytes written. */ | |
| 367 } | |
| 368 | |
| 369 | |
/* Count the UTF-16 code units needed to represent the NUL-terminated WTF-8
 * string |source_ptr|.  The terminating NUL is included in the count.
 *
 * Returns -1 as soon as uv__wtf8_decode1() rejects a sequence.
 */
ssize_t uv_wtf8_length_as_utf16(const char* source_ptr) {
  size_t units = 0;
  int32_t cp;

  for (;;) {
    cp = uv__wtf8_decode1(&source_ptr);
    if (cp < 0)
      return -1;

    /* Code points above the BMP take a surrogate pair. */
    units += (cp > 0xFFFF) ? 2 : 1;

    /* The decoder leaves the pointer on the last byte it used; that byte is
     * NUL only when the terminator itself has just been decoded.
     */
    if (*source_ptr++ == '\0')
      break;
  }

  return units;
}
| 385 | |
| 386 | |
/* Convert the NUL-terminated WTF-8 string |source_ptr| to UTF-16, storing
 * the code units (terminating NUL included) at |w_target|.
 *
 * |w_target_len| must be the exact number of code units that will be
 * produced, i.e. the value uv_wtf8_length_as_utf16() returns for the same
 * input; this is only verified through assertions.  The input must already
 * be known to decode without errors for the same reason.
 */
void uv_wtf8_to_utf16(const char* source_ptr,
                      uint16_t* w_target,
                      size_t w_target_len) {
  int32_t code_point;

  do {
    code_point = uv__wtf8_decode1(&source_ptr);
    /* uv_wtf8_length_as_utf16 should have been called and checked first. */
    assert(code_point >= 0);
    if (code_point > 0xFFFF) {
      /* U+10FFFF is the last valid code point and the decoder does return
       * it, so the bound has to be inclusive.
       */
      assert(code_point <= 0x10FFFF);
      /* Split into a high and a low surrogate. */
      *w_target++ = (((code_point - 0x10000) >> 10) + 0xD800);
      *w_target++ = ((code_point - 0x10000) & 0x3FF) + 0xDC00;
      w_target_len -= 2;
    } else {
      *w_target++ = code_point;
      w_target_len -= 1;
    }
  } while (*source_ptr++);

  /* Keeps the parameter "used" when assertions are compiled out. */
  (void)w_target_len;
  assert(w_target_len == 0);
}
| 410 | |
| 411 | |
/* Read the code point that starts at |w_source_ptr[0]|.
 *
 * When the first unit is a high surrogate (0xD800..0xDBFF), |w_source_len|
 * is not 1, and the next unit is a low surrogate (0xDC00..0xDFFF), the two
 * are combined into a supplementary code point.  In every other case the
 * first unit is returned as-is, unpaired surrogates included.
 */
static int32_t uv__get_surrogate_value(const uint16_t* w_source_ptr,
                                       ssize_t w_source_len) {
  uint16_t lead;
  uint16_t trail;

  lead = w_source_ptr[0];

  if (lead < 0xD800 || lead > 0xDBFF)
    return lead;  /* Not a high surrogate. */

  if (w_source_len == 1)
    return lead;  /* No following unit to pair with. */

  trail = w_source_ptr[1];
  if (trail < 0xDC00 || trail > 0xDFFF)
    return lead;  /* Unpaired high surrogate. */

  return 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00);
}
| 425 | |
| 426 | |
/* Compute how many bytes the UTF-16 input occupies once encoded as WTF-8,
 * not counting a terminating NUL.
 *
 * A positive |w_source_len| is the number of code units to look at.  A
 * negative |w_source_len| means the input ends at the first zero code unit.
 */
size_t uv_utf16_length_as_wtf8(const uint16_t* w_source_ptr,
                               ssize_t w_source_len) {
  size_t total;
  int32_t cp;
  int units;  /* Code units used up by the current code point. */

  total = 0;
  while (w_source_len != 0) {
    cp = uv__get_surrogate_value(w_source_ptr, w_source_len);
    /* Can be invalid UTF-8 but must be valid WTF-8. */
    assert(cp >= 0);

    if (w_source_len < 0 && cp == 0)
      break;  /* Terminator of a NUL-terminated input. */

    units = 1;
    if (cp < 0x80) {
      total += 1;
    } else if (cp < 0x800) {
      total += 2;
    } else if (cp < 0x10000) {
      total += 3;
    } else {
      total += 4;
      units = 2;  /* A surrogate pair was combined. */
    }

    w_source_ptr += units;
    /* Only an explicit length is counted down.  A pair is never combined
     * when exactly one unit is left, so this cannot go below zero.
     */
    if (w_source_len > 0)
      w_source_len -= units;
  }

  return total;
}
| 458 | |
| 459 | |
| 460 int uv_utf16_to_wtf8(const uint16_t* w_source_ptr, | |
| 461 ssize_t w_source_len, | |
| 462 char** target_ptr, | |
| 463 size_t* target_len_ptr) { | |
| 464 size_t target_len; | |
| 465 char* target; | |
| 466 char* target_end; | |
| 467 int32_t code_point; | |
| 468 | |
| 469 /* If *target_ptr is provided, then *target_len_ptr must be its length | |
| 470 * (excluding space for NUL), otherwise we will compute the target_len_ptr | |
| 471 * length and may return a new allocation in *target_ptr if target_ptr is | |
| 472 * provided. */ | |
| 473 if (target_ptr == NULL || *target_ptr == NULL) { | |
| 474 target_len = uv_utf16_length_as_wtf8(w_source_ptr, w_source_len); | |
| 475 if (target_len_ptr != NULL) | |
| 476 *target_len_ptr = target_len; | |
| 477 } else { | |
| 478 target_len = *target_len_ptr; | |
| 479 } | |
| 480 | |
| 481 if (target_ptr == NULL) | |
| 482 return 0; | |
| 483 | |
| 484 if (*target_ptr == NULL) { | |
| 485 target = uv__malloc(target_len + 1); | |
| 486 if (target == NULL) { | |
| 487 return UV_ENOMEM; | |
| 488 } | |
| 489 *target_ptr = target; | |
| 490 } else { | |
| 491 target = *target_ptr; | |
| 492 } | |
| 493 | |
| 494 target_end = target + target_len; | |
| 495 | |
| 496 while (target != target_end && w_source_len) { | |
| 497 code_point = uv__get_surrogate_value(w_source_ptr, w_source_len); | |
| 498 /* Can be invalid UTF-8 but must be valid WTF-8. */ | |
| 499 assert(code_point >= 0); | |
| 500 if (w_source_len < 0 && code_point == 0) { | |
| 501 w_source_len = 0; | |
| 502 break; | |
| 503 } | |
| 504 if (code_point < 0x80) { | |
| 505 *target++ = code_point; | |
| 506 } else if (code_point < 0x800) { | |
| 507 *target++ = 0xC0 | (code_point >> 6); | |
| 508 if (target == target_end) | |
| 509 break; | |
| 510 *target++ = 0x80 | (code_point & 0x3F); | |
| 511 } else if (code_point < 0x10000) { | |
| 512 *target++ = 0xE0 | (code_point >> 12); | |
| 513 if (target == target_end) | |
| 514 break; | |
| 515 *target++ = 0x80 | ((code_point >> 6) & 0x3F); | |
| 516 if (target == target_end) | |
| 517 break; | |
| 518 *target++ = 0x80 | (code_point & 0x3F); | |
| 519 } else { | |
| 520 *target++ = 0xF0 | (code_point >> 18); | |
| 521 if (target == target_end) | |
| 522 break; | |
| 523 *target++ = 0x80 | ((code_point >> 12) & 0x3F); | |
| 524 if (target == target_end) | |
| 525 break; | |
| 526 *target++ = 0x80 | ((code_point >> 6) & 0x3F); | |
| 527 if (target == target_end) | |
| 528 break; | |
| 529 *target++ = 0x80 | (code_point & 0x3F); | |
| 530 /* uv__get_surrogate_value consumed 2 input characters */ | |
| 531 w_source_ptr++; | |
| 532 if (w_source_len > 0) | |
| 533 w_source_len--; | |
| 534 } | |
| 535 target_len = target - *target_ptr; | |
| 536 w_source_ptr++; | |
| 537 if (w_source_len > 0) | |
| 538 w_source_len--; | |
| 539 } | |
| 540 | |
| 541 if (target != target_end && target_len_ptr != NULL) | |
| 542 /* Did not fill all of the provided buffer, so update the target_len_ptr | |
| 543 * output with the space used. */ | |
| 544 *target_len_ptr = target - *target_ptr; | |
| 545 | |
| 546 /* Check if input fit into target exactly. */ | |
| 547 if (w_source_len < 0 && target == target_end && w_source_ptr[0] == 0) | |
| 548 w_source_len = 0; | |
| 549 | |
| 550 *target++ = '\0'; | |
| 551 | |
| 552 /* Characters remained after filling the buffer, compute the remaining length now. */ | |
| 553 if (w_source_len) { | |
| 554 if (target_len_ptr != NULL) | |
| 555 *target_len_ptr = target_len + uv_utf16_length_as_wtf8(w_source_ptr, w_source_len); | |
| 556 return UV_ENOBUFS; | |
| 557 } | |
| 558 | |
| 559 return 0; | |
| 560 } |