Mercurial comparison: third_party/libuv/src/unix/linux.c @ 160:948de3f54cea

[ThirdParty] Added libuv

author: June Park <parkjune1995@gmail.com>
date:   Wed, 14 Jan 2026 19:39:52 -0800
comparing revisions 159:05cf9467a1c3 and 160:948de3f54cea
/* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their
 * EPOLL* counterparts. We use the POLL* variants in this file because that
 * is what libuv uses elsewhere.
 */

#include "uv.h"
#include "internal.h"

#include <inttypes.h>
#include <stdatomic.h>
#include <stddef.h>  /* offsetof */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <errno.h>

#include <fcntl.h>
#include <ifaddrs.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netpacket/packet.h>
#include <sys/epoll.h>
#include <sys/inotify.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <time.h>
#include <unistd.h>

#ifndef __NR_io_uring_setup
# define __NR_io_uring_setup 425
#endif

#ifndef __NR_io_uring_enter
# define __NR_io_uring_enter 426
#endif

#ifndef __NR_io_uring_register
# define __NR_io_uring_register 427
#endif

#ifndef __NR_copy_file_range
# if defined(__x86_64__)
#  define __NR_copy_file_range 326
# elif defined(__i386__)
#  define __NR_copy_file_range 377
# elif defined(__s390__)
#  define __NR_copy_file_range 375
# elif defined(__arm__)
#  define __NR_copy_file_range 391
# elif defined(__aarch64__)
#  define __NR_copy_file_range 285
# elif defined(__powerpc__)
#  define __NR_copy_file_range 379
# elif defined(__arc__)
#  define __NR_copy_file_range 285
# elif defined(__riscv)
#  define __NR_copy_file_range 285
# endif
#endif /* __NR_copy_file_range */
#ifndef __NR_statx
# if defined(__x86_64__)
#  define __NR_statx 332
# elif defined(__i386__)
#  define __NR_statx 383
# elif defined(__aarch64__)
#  define __NR_statx 291
# elif defined(__arm__)
#  define __NR_statx 397
# elif defined(__ppc__)
#  define __NR_statx 383
# elif defined(__s390__)
#  define __NR_statx 379
# elif defined(__riscv)
#  define __NR_statx 291
# endif
#endif /* __NR_statx */

#ifndef __NR_getrandom
# if defined(__x86_64__)
#  define __NR_getrandom 318
# elif defined(__i386__)
#  define __NR_getrandom 355
# elif defined(__aarch64__)
#  define __NR_getrandom 278
# elif defined(__arm__)
#  define __NR_getrandom 384
# elif defined(__ppc__)
#  define __NR_getrandom 359
# elif defined(__s390__)
#  define __NR_getrandom 349
# elif defined(__riscv)
#  define __NR_getrandom 278
# endif
#endif /* __NR_getrandom */

enum {
  UV__IORING_SETUP_SQPOLL = 2u,
  UV__IORING_SETUP_NO_SQARRAY = 0x10000u,
};

enum {
  UV__IORING_FEAT_SINGLE_MMAP = 1u,
  UV__IORING_FEAT_NODROP = 2u,
  UV__IORING_FEAT_RSRC_TAGS = 1024u,  /* linux v5.13 */
};

enum {
  UV__IORING_OP_READV = 1,
  UV__IORING_OP_WRITEV = 2,
  UV__IORING_OP_FSYNC = 3,
  UV__IORING_OP_OPENAT = 18,
  UV__IORING_OP_CLOSE = 19,
  UV__IORING_OP_STATX = 21,
  UV__IORING_OP_EPOLL_CTL = 29,
  UV__IORING_OP_RENAMEAT = 35,
  UV__IORING_OP_UNLINKAT = 36,
  UV__IORING_OP_MKDIRAT = 37,
  UV__IORING_OP_SYMLINKAT = 38,
  UV__IORING_OP_LINKAT = 39,
  UV__IORING_OP_FTRUNCATE = 55,
};

enum {
  UV__IORING_ENTER_GETEVENTS = 1u,
  UV__IORING_ENTER_SQ_WAKEUP = 2u,
};

enum {
  UV__IORING_SQ_NEED_WAKEUP = 1u,
  UV__IORING_SQ_CQ_OVERFLOW = 2u,
};

struct uv__io_cqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t overflow;
  uint32_t cqes;
  uint64_t reserved0;
  uint64_t reserved1;
};

STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));

struct uv__io_sqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t flags;
  uint32_t dropped;
  uint32_t array;
  uint32_t reserved0;
  uint64_t reserved1;
};

STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));

struct uv__io_uring_cqe {
  uint64_t user_data;
  int32_t res;
  uint32_t flags;
};

STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));

struct uv__io_uring_sqe {
  uint8_t opcode;
  uint8_t flags;
  uint16_t ioprio;
  int32_t fd;
  union {
    uint64_t off;
    uint64_t addr2;
  };
  union {
    uint64_t addr;
  };
  uint32_t len;
  union {
    uint32_t rw_flags;
    uint32_t fsync_flags;
    uint32_t open_flags;
    uint32_t statx_flags;
  };
  uint64_t user_data;
  union {
    uint16_t buf_index;
    uint64_t pad[3];
  };
};

STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));

struct uv__io_uring_params {
  uint32_t sq_entries;
  uint32_t cq_entries;
  uint32_t flags;
  uint32_t sq_thread_cpu;
  uint32_t sq_thread_idle;
  uint32_t features;
  uint32_t reserved[4];
  struct uv__io_sqring_offsets sq_off;  /* 40 bytes */
  struct uv__io_cqring_offsets cq_off;  /* 40 bytes */
};

STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));

STATIC_ASSERT(EPOLL_CTL_ADD < 4);
STATIC_ASSERT(EPOLL_CTL_DEL < 4);
STATIC_ASSERT(EPOLL_CTL_MOD < 4);
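
/* Editor's note: the three asserts above guarantee that an epoll op fits in
 * the low two bits of an io_uring user_data word; uv__epoll_ctl_prep() below
 * relies on that when it packs "op | slot << 2 | fd << 32" into each
 * submission.
 */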

struct watcher_list {
  RB_ENTRY(watcher_list) entry;
  struct uv__queue watchers;
  int iterating;
  char* path;
  int wd;
};

struct watcher_root {
  struct watcher_list* rbh_root;
};

static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
static void uv__inotify_read(uv_loop_t* loop,
                             uv__io_t* w,
                             unsigned int revents);
static int compare_watchers(const struct watcher_list* a,
                            const struct watcher_list* b);
static void maybe_free_watcher_list(struct watcher_list* w,
                                    uv_loop_t* loop);

static void uv__epoll_ctl_flush(int epollfd,
                                struct uv__iou* ctl,
                                struct epoll_event (*events)[256]);

static void uv__epoll_ctl_prep(int epollfd,
                               struct uv__iou* ctl,
                               struct epoll_event (*events)[256],
                               int op,
                               int fd,
                               struct epoll_event* e);

RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)


static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
  /* This cast works because watcher_root is a struct with a pointer as its
   * sole member. Such type punning is unsafe in the presence of strict
   * pointer aliasing (and is just plain nasty) but that is why libuv
   * is compiled with -fno-strict-aliasing.
   */
  return (struct watcher_root*) &loop->inotify_watchers;
}


unsigned uv__kernel_version(void) {
  static _Atomic unsigned cached_version;
  struct utsname u;
  unsigned version;
  unsigned major;
  unsigned minor;
  unsigned patch;
  char v_sig[256];
  char* needle;

  version = atomic_load_explicit(&cached_version, memory_order_relaxed);
  if (version != 0)
    return version;

  /* Check /proc/version_signature first as it's the way to get the mainline
   * kernel version in Ubuntu. The format is:
   *   Ubuntu ubuntu_kernel_version mainline_kernel_version
   * For example:
   *   Ubuntu 5.15.0-79.86-generic 5.15.111
   */
  if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig)))
    if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch))
      goto calculate_version;

  if (-1 == uname(&u))
    return 0;

  /* In Debian we need to check `version` instead of `release` to extract
   * the mainline kernel version. This is an example of what it looks like:
   *   #1 SMP Debian 5.10.46-4 (2021-08-03)
   */
  needle = strstr(u.version, "Debian ");
  if (needle != NULL)
    if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch))
      goto calculate_version;

  if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
    return 0;

  /* Handle it when the process runs under the UNAME26 personality:
   *
   * - kernels >= 3.x identify as 2.6.40+x
   * - kernels >= 4.x identify as 2.6.60+x
   *
   * UNAME26 is a poorly conceived hack that doesn't let us distinguish
   * between 4.x kernels and 5.x/6.x kernels so we conservatively assume
   * that 2.6.60+x means 4.x.
   *
   * Fun fact of the day: it's technically possible to observe the actual
   * kernel version for a brief moment because uname() first copies out the
   * real release string before overwriting it with the backcompat string.
   */
  if (major == 2 && minor == 6) {
    if (patch >= 60) {
      major = 4;
      minor = patch - 60;
      patch = 0;
    } else if (patch >= 40) {
      major = 3;
      minor = patch - 40;
      patch = 0;
    }
  }

calculate_version:
  version = major * 65536 + minor * 256 + patch;
  atomic_store_explicit(&cached_version, version, memory_order_relaxed);

  return version;
}
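
/* Worked example of the encoding above: "5.15.111" becomes
 * 5 * 65536 + 15 * 256 + 111 == 0x050F6F, so encoded versions compare
 * correctly against hex constants like 0x060133 (6.1.51) used elsewhere
 * in this file.
 */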


ssize_t
uv__fs_copy_file_range(int fd_in,
                       off_t* off_in,
                       int fd_out,
                       off_t* off_out,
                       size_t len,
                       unsigned int flags)
{
#ifdef __NR_copy_file_range
  return syscall(__NR_copy_file_range,
                 fd_in,
                 off_in,
                 fd_out,
                 off_out,
                 len,
                 flags);
#else
  return errno = ENOSYS, -1;
#endif
}


int uv__statx(int dirfd,
              const char* path,
              int flags,
              unsigned int mask,
              struct uv__statx* statxbuf) {
#if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
  return errno = ENOSYS, -1;
#else
  int rc;

  rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
  if (rc >= 0)
    uv__msan_unpoison(statxbuf, sizeof(*statxbuf));

  return rc;
#endif
}


ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
#if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28
  return errno = ENOSYS, -1;
#else
  ssize_t rc;

  rc = syscall(__NR_getrandom, buf, buflen, flags);
  if (rc >= 0)
    uv__msan_unpoison(buf, buflen);

  return rc;
#endif
}


int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
  return syscall(__NR_io_uring_setup, entries, params);
}


int uv__io_uring_enter(int fd,
                       unsigned to_submit,
                       unsigned min_complete,
                       unsigned flags) {
  /* io_uring_enter used to take a sigset_t but it's unused
   * in newer kernels unless IORING_ENTER_EXT_ARG is set,
   * in which case it takes a struct io_uring_getevents_arg.
   */
  return syscall(__NR_io_uring_enter,
                 fd,
                 to_submit,
                 min_complete,
                 flags,
                 NULL,
                 0L);
}


int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
  return syscall(__NR_io_uring_register, fd, opcode, arg, nargs);
}
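
/* Editor's note: the three wrappers above are thin equivalents of
 * io_uring_setup(2), io_uring_enter(2) and io_uring_register(2), invoked
 * through syscall(2) so libuv does not depend on liburing. A minimal setup
 * sketch, error handling omitted:
 *
 *   struct uv__io_uring_params p;
 *   memset(&p, 0, sizeof(p));
 *   int ringfd = uv__io_uring_setup(64, &p);  // 64 submission queue entries
 */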


static int uv__use_io_uring(uint32_t flags) {
#if defined(__ANDROID_API__)
  return 0;  /* Possibly available but blocked by seccomp. */
#elif defined(__arm__) && __SIZEOF_POINTER__ == 4
  /* See https://github.com/libuv/libuv/issues/4158. */
  return 0;  /* All 32-bit kernels appear buggy. */
#elif defined(__powerpc64__) || defined(__ppc64__)
  /* See https://github.com/libuv/libuv/issues/4283. */
  return 0;  /* Random SIGSEGV in signal handler. */
#else
  /* Ternary: unknown=0, yes=1, no=-1 */
  static _Atomic int use_io_uring;
  char* val;
  int use;

#if defined(__hppa__)
  /* io_uring first supported on parisc in 6.1, functional in .51
   * https://lore.kernel.org/all/[email protected]/
   */
  if (uv__kernel_version() < /* 6.1.51 */ 0x060133)
    return 0;
#endif

  /* SQPOLL is all kinds of buggy but epoll batching should work fine. */
  if (0 == (flags & UV__IORING_SETUP_SQPOLL))
    return 1;

  /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */
  if (uv__kernel_version() < /* 5.10.186 */ 0x050ABA)
    return 0;

  use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);

  if (use == 0) {
    val = getenv("UV_USE_IO_URING");
    use = val != NULL && atoi(val) > 0 ? 1 : -1;
    atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
  }

  return use > 0;
#endif
}
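
/* Net effect of the checks above: epoll-batching rings are always allowed
 * on supported platforms, while SQPOLL rings additionally need a kernel at
 * or above 5.10.186 and UV_USE_IO_URING set to a positive number, e.g.:
 *
 *   $ UV_USE_IO_URING=1 ./app
 */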


static void uv__iou_init(int epollfd,
                         struct uv__iou* iou,
                         uint32_t entries,
                         uint32_t flags) {
  struct uv__io_uring_params params;
  struct epoll_event e;
  size_t cqlen;
  size_t sqlen;
  size_t maxlen;
  size_t sqelen;
  unsigned kernel_version;
  uint32_t* sqarray;
  uint32_t i;
  char* sq;
  char* sqe;
  int ringfd;
  int no_sqarray;

  sq = MAP_FAILED;
  sqe = MAP_FAILED;

  if (!uv__use_io_uring(flags))
    return;

  kernel_version = uv__kernel_version();
  no_sqarray =
      UV__IORING_SETUP_NO_SQARRAY * (kernel_version >= /* 6.6 */ 0x060600);

  /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
   * Mostly academic because we check for a v5.13 kernel afterwards anyway.
   */
  memset(&params, 0, sizeof(params));
  params.flags = flags | no_sqarray;

  if (flags & UV__IORING_SETUP_SQPOLL)
    params.sq_thread_idle = 10;  /* milliseconds */

  /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
  ringfd = uv__io_uring_setup(entries, &params);
  if (ringfd == -1)
    return;

  /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
   * actually detecting is whether IORING_OP_STATX works with SQPOLL.
   */
  if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
    goto fail;

  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
  if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
    goto fail;

  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
  if (!(params.features & UV__IORING_FEAT_NODROP))
    goto fail;

  sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
  cqlen =
      params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
  maxlen = sqlen < cqlen ? cqlen : sqlen;
  sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);

  sq = mmap(0,
            maxlen,
            PROT_READ | PROT_WRITE,
            MAP_SHARED | MAP_POPULATE,
            ringfd,
            0);  /* IORING_OFF_SQ_RING */

  sqe = mmap(0,
             sqelen,
             PROT_READ | PROT_WRITE,
             MAP_SHARED | MAP_POPULATE,
             ringfd,
             0x10000000ull);  /* IORING_OFF_SQES */

  if (sq == MAP_FAILED || sqe == MAP_FAILED)
    goto fail;

  if (flags & UV__IORING_SETUP_SQPOLL) {
    /* Only interested in completion events. To get notified when
     * the kernel pulls items from the submission ring, add POLLOUT.
     */
    memset(&e, 0, sizeof(e));
    e.events = POLLIN;
    e.data.fd = ringfd;

    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
      goto fail;
  }

  iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
  iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
  iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
  iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
  iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
  iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
  iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
  iou->sq = sq;
  iou->cqe = sq + params.cq_off.cqes;
  iou->sqe = sqe;
  iou->sqlen = sqlen;
  iou->cqlen = cqlen;
  iou->maxlen = maxlen;
  iou->sqelen = sqelen;
  iou->ringfd = ringfd;
  iou->in_flight = 0;

  if (no_sqarray)
    return;

  sqarray = (uint32_t*) (sq + params.sq_off.array);
  for (i = 0; i <= iou->sqmask; i++)
    sqarray[i] = i;  /* Slot -> sqe identity mapping. */

  return;

fail:
  if (sq != MAP_FAILED)
    munmap(sq, maxlen);

  if (sqe != MAP_FAILED)
    munmap(sqe, sqelen);

  uv__close(ringfd);
}
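
/* Resulting memory layout (IORING_FEAT_SINGLE_MMAP is guaranteed above):
 * "sq" is a single mapping of maxlen bytes holding both the SQ and CQ
 * control words plus the CQE array, and "sqe" is a separate mapping with
 * sq_entries submission queue entries.
 */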


static void uv__iou_delete(struct uv__iou* iou) {
  if (iou->ringfd > -1) {
    munmap(iou->sq, iou->maxlen);
    munmap(iou->sqe, iou->sqelen);
    uv__close(iou->ringfd);
    iou->ringfd = -1;
  }
}


int uv__platform_loop_init(uv_loop_t* loop) {
  uv__loop_internal_fields_t* lfields;

  lfields = uv__get_internal_fields(loop);
  lfields->ctl.ringfd = -1;
  lfields->iou.ringfd = -2;  /* "uninitialized" */

  loop->inotify_watchers = NULL;
  loop->inotify_fd = -1;
  loop->backend_fd = epoll_create1(O_CLOEXEC);

  if (loop->backend_fd == -1)
    return UV__ERR(errno);

  uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);

  return 0;
}


int uv__io_fork(uv_loop_t* loop) {
  int err;
  struct watcher_list* root;

  root = uv__inotify_watchers(loop)->rbh_root;

  uv__close(loop->backend_fd);
  loop->backend_fd = -1;

  /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
  uv__platform_loop_delete(loop);

  err = uv__platform_loop_init(loop);
  if (err)
    return err;

  return uv__inotify_fork(loop, root);
}


void uv__platform_loop_delete(uv_loop_t* loop) {
  uv__loop_internal_fields_t* lfields;

  lfields = uv__get_internal_fields(loop);
  uv__iou_delete(&lfields->ctl);
  uv__iou_delete(&lfields->iou);

  if (loop->inotify_fd != -1) {
    uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
    uv__close(loop->inotify_fd);
    loop->inotify_fd = -1;
  }
}


struct uv__invalidate {
  struct epoll_event (*prep)[256];
  struct epoll_event* events;
  int nfds;
};


void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
  uv__loop_internal_fields_t* lfields;
  struct uv__invalidate* inv;
  struct epoll_event dummy;
  int i;

  lfields = uv__get_internal_fields(loop);
  inv = lfields->inv;

  /* Invalidate events with same file descriptor */
  if (inv != NULL)
    for (i = 0; i < inv->nfds; i++)
      if (inv->events[i].data.fd == fd)
        inv->events[i].data.fd = -1;

  /* Remove the file descriptor from the epoll.
   * This avoids a problem where the same file description remains open
   * in another process, causing repeated junk epoll events.
   *
   * Perform EPOLL_CTL_DEL immediately instead of going through
   * io_uring's submit queue, otherwise the file descriptor may
   * be closed by the time the kernel starts the operation.
   *
   * We pass in a dummy epoll_event, to work around a bug in old kernels.
   *
   * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
   * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
   */
  memset(&dummy, 0, sizeof(dummy));
  epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
}


int uv__io_check_fd(uv_loop_t* loop, int fd) {
  struct epoll_event e;
  int rc;

  memset(&e, 0, sizeof(e));
  e.events = POLLIN;
  e.data.fd = -1;

  rc = 0;
  if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
    if (errno != EEXIST)
      rc = UV__ERR(errno);

  if (rc == 0)
    if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
      abort();

  return rc;
}


/* Caller must initialize SQE and call uv__iou_submit(). */
static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
                                                uv_loop_t* loop,
                                                uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  uint32_t head;
  uint32_t tail;
  uint32_t mask;
  uint32_t slot;

  /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
   * initialization failed. Anything else is a valid ring file descriptor.
   */
  if (iou->ringfd == -2) {
    /* By default, the SQPOLL ring is not created. Enable only if the loop
     * is configured with UV_LOOP_USE_IO_URING_SQPOLL and the UV_USE_IO_URING
     * environment variable is set to a positive number.
     */
    if (loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL)
      if (uv__use_io_uring(UV__IORING_SETUP_SQPOLL))
        uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);

    if (iou->ringfd == -2)
      iou->ringfd = -1;  /* "failed" */
  }

  if (iou->ringfd == -1)
    return NULL;

  head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
                              memory_order_acquire);
  tail = *iou->sqtail;
  mask = iou->sqmask;

  if ((head & mask) == ((tail + 1) & mask))
    return NULL;  /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */

  slot = tail & mask;
  sqe = iou->sqe;
  sqe = &sqe[slot];
  memset(sqe, 0, sizeof(*sqe));
  sqe->user_data = (uintptr_t) req;

  /* Pacify uv_cancel(). */
  req->work_req.loop = loop;
  req->work_req.work = NULL;
  req->work_req.done = NULL;
  uv__queue_init(&req->work_req.wq);

  uv__req_register(loop);
  iou->in_flight++;

  return sqe;
}
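
/* Ring-full example for the check above: with 64 entries, sqmask == 63, so
 * head == 0 and tail == 63 gives (0 & 63) == ((63 + 1) & 63) and the last
 * slot is deliberately left unused; the caller then falls back to the
 * thread pool.
 */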


static void uv__iou_submit(struct uv__iou* iou) {
  uint32_t flags;

  atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
                        *iou->sqtail + 1,
                        memory_order_release);

  flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
                               memory_order_acquire);

  if (flags & UV__IORING_SQ_NEED_WAKEUP)
    if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
      if (errno != EOWNERDEAD)  /* Kernel bug. Harmless, ignore. */
        perror("libuv: io_uring_enter(wakeup)");  /* Can't happen. */
}
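
/* Typical submission sequence, as used by the uv__iou_fs_*() helpers below:
 *
 *   sqe = uv__iou_get_sqe(iou, loop, req);  // reserve and zero a slot
 *   if (sqe == NULL)
 *     return 0;                             // ring unavailable or full
 *   sqe->opcode = ...;                      // fill in the operation
 *   uv__iou_submit(iou);                    // publish tail, wake SQPOLL
 *   return 1;                               // request now owned by the ring
 */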


int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;
  int kv;

  kv = uv__kernel_version();
  /* Work around a poorly understood bug in older kernels where closing a file
   * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
   * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
   * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
   * but good candidates are the several data race fixes. Interestingly, it
   * seems to manifest only when running under Docker so the possibility of
   * a Docker bug can't be completely ruled out either. Yay, computers.
   * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and
   * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be
   * solved.
   */
  if (kv < /* 5.15.90 */ 0x050F5A)
    return 0;

  if (kv >= /* 5.16.0 */ 0x051000 && kv < /* 6.1.0 */ 0x060100)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->fd = req->file;
  sqe->opcode = UV__IORING_OP_CLOSE;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_ftruncate(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  if (uv__kernel_version() < /* 6.9 */ 0x060900)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->fd = req->file;
  sqe->off = req->off;  /* The kernel reads the new length from sqe->off. */
  sqe->opcode = UV__IORING_OP_FTRUNCATE;
  uv__iou_submit(iou);

  return 1;
}

int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
                                  uv_fs_t* req,
                                  uint32_t fsync_flags) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  /* Little known fact: setting sqe->off and sqe->len turns
   * it into an asynchronous sync_file_range() operation.
   */
  sqe->fd = req->file;
  sqe->fsync_flags = fsync_flags;
  sqe->opcode = UV__IORING_OP_FSYNC;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  if (uv__kernel_version() < /* 5.15.0 */ 0x050F00)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->addr2 = (uintptr_t) req->new_path;
  sqe->len = AT_FDCWD;
  sqe->opcode = UV__IORING_OP_LINKAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  if (uv__kernel_version() < /* 5.15.0 */ 0x050F00)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->len = req->mode;
  sqe->opcode = UV__IORING_OP_MKDIRAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->len = req->mode;
  sqe->opcode = UV__IORING_OP_OPENAT;
  sqe->open_flags = req->flags | O_CLOEXEC;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->addr2 = (uintptr_t) req->new_path;
  sqe->len = AT_FDCWD;
  sqe->opcode = UV__IORING_OP_RENAMEAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  if (uv__kernel_version() < /* 5.15.0 */ 0x050F00)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->addr2 = (uintptr_t) req->new_path;
  sqe->opcode = UV__IORING_OP_SYMLINKAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->opcode = UV__IORING_OP_UNLINKAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_read_or_write(uv_loop_t* loop,
                             uv_fs_t* req,
                             int is_read) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fall
   * back to the thread pool on writes. */
  if (req->nbufs > IOV_MAX) {
    if (is_read)
      req->nbufs = IOV_MAX;
    else
      return 0;
  }

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->bufs;
  sqe->fd = req->file;
  sqe->len = req->nbufs;
  sqe->off = req->off < 0 ? -1 : req->off;
  sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_statx(uv_loop_t* loop,
                     uv_fs_t* req,
                     int is_fstat,
                     int is_lstat) {
  struct uv__io_uring_sqe* sqe;
  struct uv__statx* statxbuf;
  struct uv__iou* iou;

  statxbuf = uv__malloc(sizeof(*statxbuf));
  if (statxbuf == NULL)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL) {
    uv__free(statxbuf);
    return 0;
  }

  req->ptr = statxbuf;

  sqe->addr = (uintptr_t) req->path;
  sqe->addr2 = (uintptr_t) statxbuf;
  sqe->fd = AT_FDCWD;
  sqe->len = 0xFFF;  /* STATX_BASIC_STATS + STATX_BTIME */
  sqe->opcode = UV__IORING_OP_STATX;

  if (is_fstat) {
    sqe->addr = (uintptr_t) "";
    sqe->fd = req->file;
    sqe->statx_flags |= 0x1000;  /* AT_EMPTY_PATH */
  }

  if (is_lstat)
    sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;

  uv__iou_submit(iou);

  return 1;
}
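
/* The 0xFFF mask above is STATX_BASIC_STATS (0x7FF) | STATX_BTIME (0x800):
 * everything uv__statx_to_stat() consumes, including the birth time.
 */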


void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
  buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
  buf->st_mode = statxbuf->stx_mode;
  buf->st_nlink = statxbuf->stx_nlink;
  buf->st_uid = statxbuf->stx_uid;
  buf->st_gid = statxbuf->stx_gid;
  buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
  buf->st_ino = statxbuf->stx_ino;
  buf->st_size = statxbuf->stx_size;
  buf->st_blksize = statxbuf->stx_blksize;
  buf->st_blocks = statxbuf->stx_blocks;
  buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
  buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
  buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
  buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
  buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
  buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
  buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
  buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
  buf->st_flags = 0;
  buf->st_gen = 0;
}


static void uv__iou_fs_statx_post(uv_fs_t* req) {
  struct uv__statx* statxbuf;
  uv_stat_t* buf;

  buf = &req->statbuf;
  statxbuf = req->ptr;
  req->ptr = NULL;

  if (req->result == 0) {
    uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
    uv__statx_to_stat(statxbuf, buf);
    req->ptr = buf;
  }

  uv__free(statxbuf);
}


static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
  struct uv__io_uring_cqe* cqe;
  struct uv__io_uring_cqe* e;
  uv_fs_t* req;
  uint32_t head;
  uint32_t tail;
  uint32_t mask;
  uint32_t i;
  uint32_t flags;
  int nevents;
  int rc;

  head = *iou->cqhead;
  tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
                              memory_order_acquire);
  mask = iou->cqmask;
  cqe = iou->cqe;
  nevents = 0;

  for (i = head; i != tail; i++) {
    e = &cqe[i & mask];

    req = (uv_fs_t*) (uintptr_t) e->user_data;
    assert(req->type == UV_FS);

    uv__req_unregister(loop);
    iou->in_flight--;

    /* If the op is not supported by the kernel retry using the thread pool */
    if (e->res == -EOPNOTSUPP) {
      uv__fs_post(loop, req);
      continue;
    }

    /* io_uring stores error codes as negative numbers, same as libuv. */
    req->result = e->res;

    switch (req->fs_type) {
      case UV_FS_FSTAT:
      case UV_FS_LSTAT:
      case UV_FS_STAT:
        uv__iou_fs_statx_post(req);
        break;
      default:  /* Squelch -Wswitch warnings. */
        break;
    }

    uv__metrics_update_idle_time(loop);
    req->cb(req);
    nevents++;
  }

  atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
                        tail,
                        memory_order_release);

  /* Check whether CQE's overflowed, if so enter the kernel to make them
   * available. Don't grab them immediately but in the next loop iteration to
   * avoid loop starvation. */
  flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
                               memory_order_acquire);

  if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
    do
      rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
    while (rc == -1 && errno == EINTR);

    if (rc < 0)
      perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */
  }

  uv__metrics_inc_events(loop, nevents);
  if (uv__get_internal_fields(loop)->current_timeout == 0)
    uv__metrics_inc_events_waiting(loop, nevents);
}


/* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
 * executed immediately, otherwise the file descriptor may have been closed
 * by the time the kernel starts the operation.
 */
static void uv__epoll_ctl_prep(int epollfd,
                               struct uv__iou* ctl,
                               struct epoll_event (*events)[256],
                               int op,
                               int fd,
                               struct epoll_event* e) {
  struct uv__io_uring_sqe* sqe;
  struct epoll_event* pe;
  uint32_t mask;
  uint32_t slot;

  assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
  assert(ctl->ringfd != -1);

  mask = ctl->sqmask;
  slot = (*ctl->sqtail)++ & mask;

  pe = &(*events)[slot];
  *pe = *e;

  sqe = ctl->sqe;
  sqe = &sqe[slot];

  memset(sqe, 0, sizeof(*sqe));
  sqe->addr = (uintptr_t) pe;
  sqe->fd = epollfd;
  sqe->len = op;
  sqe->off = fd;
  sqe->opcode = UV__IORING_OP_EPOLL_CTL;
  sqe->user_data = op | slot << 2 | (int64_t) fd << 32;

  if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
    uv__epoll_ctl_flush(epollfd, ctl, events);
}
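
/* user_data bit layout, mirrored by the decode in uv__epoll_ctl_flush():
 *
 *   bits  0-1   epoll op (EPOLL_CTL_* are all < 4, see the STATIC_ASSERTs)
 *   bits  2-9   ring slot (0-255)
 *   bits 32-63  target file descriptor
 */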


static void uv__epoll_ctl_flush(int epollfd,
                                struct uv__iou* ctl,
                                struct epoll_event (*events)[256]) {
  struct epoll_event oldevents[256];
  struct uv__io_uring_cqe* cqe;
  uint32_t oldslot;
  uint32_t slot;
  uint32_t n;
  int fd;
  int op;
  int rc;

  STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
  assert(ctl->ringfd != -1);
  assert(*ctl->sqhead != *ctl->sqtail);

  n = *ctl->sqtail - *ctl->sqhead;
  do
    rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
  while (rc == -1 && errno == EINTR);

  if (rc < 0)
    perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */

  if (rc != (int) n)
    abort();

  assert(*ctl->sqhead == *ctl->sqtail);

  memcpy(oldevents, *events, sizeof(*events));

  /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
   * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
   * that we are already watching. Ignore the former and retry the latter
   * with EPOLL_CTL_MOD.
   */
  while (*ctl->cqhead != *ctl->cqtail) {
    slot = (*ctl->cqhead)++ & ctl->cqmask;

    cqe = ctl->cqe;
    cqe = &cqe[slot];

    if (cqe->res == 0)
      continue;

    fd = cqe->user_data >> 32;
    op = 3 & cqe->user_data;
    oldslot = 255 & (cqe->user_data >> 2);

    if (op == EPOLL_CTL_DEL)
      continue;

    if (op != EPOLL_CTL_ADD)
      abort();

    if (cqe->res != -EEXIST)
      abort();

    uv__epoll_ctl_prep(epollfd,
                       ctl,
                       events,
                       EPOLL_CTL_MOD,
                       fd,
                       &oldevents[oldslot]);
  }
}


void uv__io_poll(uv_loop_t* loop, int timeout) {
  uv__loop_internal_fields_t* lfields;
  struct epoll_event events[1024];
  struct epoll_event prep[256];
  struct uv__invalidate inv;
  struct epoll_event* pe;
  struct epoll_event e;
  struct uv__iou* ctl;
  struct uv__iou* iou;
  int real_timeout;
  struct uv__queue* q;
  uv__io_t* w;
  sigset_t* sigmask;
  sigset_t sigset;
  uint64_t base;
  int have_iou_events;
  int have_signals;
  int nevents;
  int epollfd;
  int count;
  int nfds;
  int fd;
  int op;
  int i;
  int user_timeout;
  int reset_timeout;

  lfields = uv__get_internal_fields(loop);
  ctl = &lfields->ctl;
  iou = &lfields->iou;

  sigmask = NULL;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGPROF);
    sigmask = &sigset;
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48;  /* Benchmarks suggest this gives the best throughput. */
  real_timeout = timeout;

  if (lfields->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
    user_timeout = 0;
  }

  epollfd = loop->backend_fd;

  memset(&e, 0, sizeof(e));

  while (!uv__queue_empty(&loop->watcher_queue)) {
    q = uv__queue_head(&loop->watcher_queue);
    w = uv__queue_data(q, uv__io_t, watcher_queue);
    uv__queue_remove(q);
    uv__queue_init(q);

    op = EPOLL_CTL_MOD;
    if (w->events == 0)
      op = EPOLL_CTL_ADD;

    w->events = w->pevents;
    e.events = w->pevents;
    e.data.fd = w->fd;
    fd = w->fd;

    if (ctl->ringfd != -1) {
      uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e);
      continue;
    }

    if (!epoll_ctl(epollfd, op, fd, &e))
      continue;

    assert(op == EPOLL_CTL_ADD);
    assert(errno == EEXIST);

    /* File descriptor that's been watched before, update event mask. */
    if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e))
      abort();
  }

  inv.events = events;
  inv.prep = &prep;
  inv.nfds = -1;

  for (;;) {
    if (loop->nfds == 0)
      if (iou->in_flight == 0)
        break;

    /* All event mask mutations should be visible to the kernel before
     * we enter epoll_pwait().
     */
    if (ctl->ringfd != -1)
      while (*ctl->sqhead != *ctl->sqtail)
        uv__epoll_ctl_flush(epollfd, ctl, &prep);

    /* Only need to set the provider_entry_time if timeout != 0. The function
     * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
     */
    if (timeout != 0)
      uv__metrics_set_provider_entry_time(loop);

    /* Store the current timeout in a location that's globally accessible so
     * other locations like uv__work_done() can determine whether the queue
     * of events in the callback were waiting when poll was called.
     */
    lfields->current_timeout = timeout;

    nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);

    /* Update loop->time unconditionally. It's tempting to skip the update when
     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
     * operating system didn't reschedule our process while in the syscall.
     */
    SAVE_ERRNO(uv__update_time(loop));

    if (nfds == -1)
      assert(errno == EINTR);
    else if (nfds == 0)
      /* Unlimited timeout should only return with events or signal. */
      assert(timeout != -1);

    if (nfds == 0 || nfds == -1) {
      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      } else if (nfds == 0) {
        return;
      }

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_iou_events = 0;
    have_signals = 0;
    nevents = 0;

    inv.nfds = nfds;
    lfields->inv = &inv;

    for (i = 0; i < nfds; i++) {
      pe = events + i;
      fd = pe->data.fd;

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;

      if (fd == iou->ringfd) {
        uv__poll_io_uring(loop, iou);
        have_iou_events = 1;
        continue;
      }

      assert(fd >= 0);
      assert((unsigned) fd < loop->nwatchers);

      w = loop->watchers[fd];

      if (w == NULL) {
        /* File descriptor that we've stopped watching, disarm it.
         *
         * Ignore all errors because we may be racing with another thread
         * when the file descriptor is closed.
         *
         * Perform EPOLL_CTL_DEL immediately instead of going through
         * io_uring's submit queue, otherwise the file descriptor may
         * be closed by the time the kernel starts the operation.
         */
        epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
        continue;
      }

      /* Give users only events they're interested in. Prevents spurious
       * callbacks when a previous callback invocation in this loop has
       * stopped the current watcher. Also filters out events that the user
       * has not requested us to watch.
       */
| 1536 pe->events &= w->pevents | POLLERR | POLLHUP; | |
| 1537 | |
| 1538 /* Work around an epoll quirk where it sometimes reports just the | |
| 1539 * EPOLLERR or EPOLLHUP event. In order to force the event loop to | |
| 1540 * move forward, we merge in the read/write events that the watcher | |
| 1541 * is interested in; uv__read() and uv__write() will then deal with | |
| 1542 * the error or hangup in the usual fashion. | |
| 1543 * | |
| 1544 * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user | |
| 1545 * reads the available data, calls uv_read_stop(), then sometime later | |
| 1546 * calls uv_read_start() again. By then, libuv has forgotten about the | |
| 1547 * hangup and the kernel won't report EPOLLIN again because there's | |
| 1548 * nothing left to read. If anything, libuv is to blame here. The | |
| 1549 * current hack is just a quick bandaid; to properly fix it, libuv | |
| 1550 * needs to remember the error/hangup event. We should get that for | |
| 1551 * free when we switch over to edge-triggered I/O. | |
| 1552 */ | |
| 1553 if (pe->events == POLLERR || pe->events == POLLHUP) | |
| 1554 pe->events |= | |
| 1555 w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI); | |
| 1556 | |
| 1557 if (pe->events != 0) { | |
| 1558 /* Run signal watchers last. This also affects child process watchers | |
| 1559 * because those are implemented in terms of signal watchers. | |
| 1560 */ | |
| 1561 if (w == &loop->signal_io_watcher) { | |
| 1562 have_signals = 1; | |
| 1563 } else { | |
| 1564 uv__metrics_update_idle_time(loop); | |
| 1565 w->cb(loop, w, pe->events); | |
| 1566 } | |
| 1567 | |
| 1568 nevents++; | |
| 1569 } | |
| 1570 } | |
| 1571 | |
| 1572 uv__metrics_inc_events(loop, nevents); | |
| 1573 if (reset_timeout != 0) { | |
| 1574 timeout = user_timeout; | |
| 1575 reset_timeout = 0; | |
| 1576 uv__metrics_inc_events_waiting(loop, nevents); | |
| 1577 } | |
| 1578 | |
| 1579 if (have_signals != 0) { | |
| 1580 uv__metrics_update_idle_time(loop); | |
| 1581 loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN); | |
| 1582 } | |
| 1583 | |
| 1584 lfields->inv = NULL; | |
| 1585 | |
| 1586 if (have_iou_events != 0) | |
| 1587 break; /* Event loop should cycle now so don't poll again. */ | |
| 1588 | |
| 1589 if (have_signals != 0) | |
| 1590 break; /* Event loop should cycle now so don't poll again. */ | |
| 1591 | |
| 1592 if (nevents != 0) { | |
| 1593 if (nfds == ARRAY_SIZE(events) && --count != 0) { | |
| 1594 /* Poll for more events but don't block this time. */ | |
| 1595 timeout = 0; | |
| 1596 continue; | |
| 1597 } | |
| 1598 break; | |
| 1599 } | |
| 1600 | |
| 1601 update_timeout: | |
| 1602 if (timeout == 0) | |
| 1603 break; | |
| 1604 | |
| 1605 if (timeout == -1) | |
| 1606 continue; | |
| 1607 | |
| 1608 assert(timeout > 0); | |
| 1609 | |
| 1610 real_timeout -= (loop->time - base); | |
| 1611 if (real_timeout <= 0) | |
| 1612 break; | |
| 1613 | |
| 1614 timeout = real_timeout; | |
| 1615 } | |
| 1616 | |
| 1617 if (ctl->ringfd != -1) | |
| 1618 while (*ctl->sqhead != *ctl->sqtail) | |
| 1619 uv__epoll_ctl_flush(epollfd, ctl, &prep); | |
| 1620 } | |
| 1621 | |
| 1622 uint64_t uv__hrtime(uv_clocktype_t type) { | |
| 1623 static _Atomic clock_t fast_clock_id = -1; | |
| 1624 struct timespec t; | |
| 1625 clock_t clock_id; | |
| 1626 | |
| 1627 /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has | |
| 1628 * millisecond granularity or better. CLOCK_MONOTONIC_COARSE is | |
| 1629 * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may | |
| 1630 * decide to make a costly system call. | |
| 1631 */ | |
| 1632 /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE | |
| 1633 * when it has microsecond granularity or better (unlikely). | |
| 1634 */ | |
| 1635 clock_id = CLOCK_MONOTONIC; | |
| 1636 if (type != UV_CLOCK_FAST) | |
| 1637 goto done; | |
| 1638 | |
| 1639 clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed); | |
| 1640 if (clock_id != -1) | |
| 1641 goto done; | |
| 1642 | |
| 1643 clock_id = CLOCK_MONOTONIC; | |
| 1644 if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t)) | |
| 1645 if (t.tv_nsec <= 1 * 1000 * 1000) | |
| 1646 clock_id = CLOCK_MONOTONIC_COARSE; | |
| 1647 | |
| 1648 atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed); | |
| 1649 | |
| 1650 done: | |
| 1651 | |
| 1652 if (clock_gettime(clock_id, &t)) | |
| 1653 return 0; /* Not really possible. */ | |
| 1654 | |
| 1655 return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec; | |
| 1656 } | |


int uv_resident_set_memory(size_t* rss) {
  char buf[1024];
  const char* s;
  long val;
  int rc;
  int i;

  /* rss: 24th element */
  rc = uv__slurp("/proc/self/stat", buf, sizeof(buf));
  if (rc < 0)
    return rc;

  /* find the last ')' */
  s = strrchr(buf, ')');
  if (s == NULL)
    goto err;

  for (i = 1; i <= 22; i++) {
    s = strchr(s + 1, ' ');
    if (s == NULL)
      goto err;
  }

  errno = 0;
  val = strtol(s, NULL, 10);
  if (val < 0 || errno != 0)
    goto err;

  *rss = val * getpagesize();
  return 0;

err:
  return UV_EINVAL;
}
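
/* Worked example (illustrative): a /proc/self/stat line looks like
 *
 *   1234 (comm name) S 1 1234 1234 0 -1 4194560 ... <rss> ...
 *
 * The comm field may itself contain spaces and parentheses, which is why
 * the parser above anchors on the *last* ')' and then skips 22
 * space-separated fields to land on rss, the 24th field of the line. */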

int uv_uptime(double* uptime) {
  struct timespec now;
  char buf[128];

  /* Consult /proc/uptime when present (common case), or fall back to
   * clock_gettime.  Why not always clock_gettime?  It doesn't always return
   * the right result under OpenVZ and possibly other containerized
   * environments.
   */
  if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf)))
    if (1 == sscanf(buf, "%lf", uptime))
      return 0;

  if (clock_gettime(CLOCK_BOOTTIME, &now))
    return UV__ERR(errno);

  *uptime = now.tv_sec;
  return 0;
}
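
/* Illustrative: /proc/uptime contains "<uptime> <idle>" in seconds, e.g.
 *
 *   3613.52 27104.13
 *
 * so the single %lf conversion above picks up just the first number. */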


int uv_cpu_info(uv_cpu_info_t** ci, int* count) {
#if defined(__PPC__)
  static const char model_marker[] = "cpu\t\t: ";
  static const char model_marker2[] = "";
#elif defined(__arm__)
  static const char model_marker[] = "model name\t: ";
  static const char model_marker2[] = "Processor\t: ";
#elif defined(__aarch64__)
  static const char model_marker[] = "CPU part\t: ";
  static const char model_marker2[] = "";
#elif defined(__mips__)
  static const char model_marker[] = "cpu model\t\t: ";
  static const char model_marker2[] = "";
#elif defined(__loongarch__)
  static const char model_marker[] = "cpu family\t\t: ";
  static const char model_marker2[] = "";
#else
  static const char model_marker[] = "model name\t: ";
  static const char model_marker2[] = "";
#endif
  static const char parts[] =
#ifdef __aarch64__
    "0x811\nARM810\n" "0x920\nARM920\n" "0x922\nARM922\n"
    "0x926\nARM926\n" "0x940\nARM940\n" "0x946\nARM946\n"
    "0x966\nARM966\n" "0xa20\nARM1020\n" "0xa22\nARM1022\n"
    "0xa26\nARM1026\n" "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n"
    "0xb56\nARM1156\n" "0xb76\nARM1176\n" "0xc05\nCortex-A5\n"
    "0xc07\nCortex-A7\n" "0xc08\nCortex-A8\n" "0xc09\nCortex-A9\n"
    "0xc0d\nCortex-A17\n" /* Originally A12 */
    "0xc0f\nCortex-A15\n" "0xc0e\nCortex-A17\n" "0xc14\nCortex-R4\n"
    "0xc15\nCortex-R5\n" "0xc17\nCortex-R7\n" "0xc18\nCortex-R8\n"
    "0xc20\nCortex-M0\n" "0xc21\nCortex-M1\n" "0xc23\nCortex-M3\n"
    "0xc24\nCortex-M4\n" "0xc27\nCortex-M7\n" "0xc60\nCortex-M0+\n"
    "0xd01\nCortex-A32\n" "0xd03\nCortex-A53\n" "0xd04\nCortex-A35\n"
    "0xd05\nCortex-A55\n" "0xd06\nCortex-A65\n" "0xd07\nCortex-A57\n"
    "0xd08\nCortex-A72\n" "0xd09\nCortex-A73\n" "0xd0a\nCortex-A75\n"
    "0xd0b\nCortex-A76\n" "0xd0c\nNeoverse-N1\n" "0xd0d\nCortex-A77\n"
    "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n" "0xd20\nCortex-M23\n"
    "0xd21\nCortex-M33\n" "0xd41\nCortex-A78\n" "0xd42\nCortex-A78AE\n"
    "0xd4a\nNeoverse-E1\n" "0xd4b\nCortex-A78C\n"
#endif
    "";
  struct cpu {
    unsigned long long freq, user, nice, sys, idle, irq;
    unsigned model;
  };
  FILE* fp;
  char* p;
  int found;
  int n;
  unsigned i;
  unsigned cpu;
  unsigned maxcpu;
  unsigned size;
  unsigned long long skip;
  struct cpu (*cpus)[8192];  /* Kernel maximum. */
  struct cpu* c;
  struct cpu t;
  char (*model)[64];
  unsigned char bitmap[ARRAY_SIZE(*cpus) / 8];
  /* Assumption: even big.LITTLE systems will have only a handful
   * of different CPU models. Most systems will just have one.
   */
  char models[8][64];
  char buf[1024];

  memset(bitmap, 0, sizeof(bitmap));
  memset(models, 0, sizeof(models));
  snprintf(*models, sizeof(*models), "unknown");
  maxcpu = 0;

  cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus));
  if (cpus == NULL)
    return UV_ENOMEM;

  fp = uv__open_file("/proc/stat");
  if (fp == NULL) {
    uv__free(cpus);
    return UV__ERR(errno);
  }

  if (NULL == fgets(buf, sizeof(buf), fp))
    abort();

  for (;;) {
    memset(&t, 0, sizeof(t));

    n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu",
               &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq);

    if (n != 7)
      break;

    if (NULL == fgets(buf, sizeof(buf), fp))
      abort();

    if (cpu >= ARRAY_SIZE(*cpus))
      continue;

    (*cpus)[cpu] = t;

    bitmap[cpu >> 3] |= 1 << (cpu & 7);

    if (cpu >= maxcpu)
      maxcpu = cpu + 1;
  }

  fclose(fp);

  fp = uv__open_file("/proc/cpuinfo");
  if (fp == NULL)
    goto nocpuinfo;

  for (;;) {
    if (1 != fscanf(fp, "processor\t: %u\n", &cpu))
      break;  /* Parse error. */

    while (fgets(buf, sizeof(buf), fp)) {
      if (!strncmp(buf, model_marker, sizeof(model_marker) - 1)) {
        p = buf + sizeof(model_marker) - 1;
        goto parts;
      }
      if (!*model_marker2)
        continue;
      if (!strncmp(buf, model_marker2, sizeof(model_marker2) - 1)) {
        p = buf + sizeof(model_marker2) - 1;
        goto parts;
      }
    }

    goto next;  /* Not found. */

parts:
    n = (int) strcspn(p, "\n");

    /* arm64: translate CPU part code to model name. */
    if (*parts) {
      p = memmem(parts, sizeof(parts) - 1, p, n + 1);
      if (p == NULL)
        p = "unknown";
      else
        p += n + 1;
      n = (int) strcspn(p, "\n");
    }

    found = 0;
    for (model = models; !found && model < ARRAY_END(models); model++)
      found = !strncmp(p, *model, strlen(*model));

    if (!found)
      goto next;

    if (**model == '\0')
      snprintf(*model, sizeof(*model), "%.*s", n, p);

    if (cpu < maxcpu)
      (*cpus)[cpu].model = model - models;

next:
    while (fgets(buf, sizeof(buf), fp))
      if (*buf == '\n')
        break;
  }

  fclose(fp);
  fp = NULL;

nocpuinfo:

  n = 0;
  for (cpu = 0; cpu < maxcpu; cpu++) {
    if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
      continue;

    n++;
    snprintf(buf, sizeof(buf),
             "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu);

    fp = uv__open_file(buf);
    if (fp == NULL)
      continue;

    if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq))
      abort();
    fclose(fp);
    fp = NULL;
  }

  size = n * sizeof(**ci) + sizeof(models);
  *ci = uv__malloc(size);
  *count = 0;

  if (*ci == NULL) {
    uv__free(cpus);
    return UV_ENOMEM;
  }

  *count = n;
  p = memcpy(*ci + n, models, sizeof(models));

  i = 0;
  for (cpu = 0; cpu < maxcpu; cpu++) {
    if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
      continue;

    c = *cpus + cpu;

    (*ci)[i++] = (uv_cpu_info_t) {
      .model = p + c->model * sizeof(*model),
      .speed = c->freq / 1000,
      /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz,
       * therefore the multiplier is always 1000/100 = 10.
       */
      .cpu_times = (struct uv_cpu_times_s) {
        .user = 10 * c->user,
        .nice = 10 * c->nice,
        .sys = 10 * c->sys,
        .idle = 10 * c->idle,
        .irq = 10 * c->irq,
      },
    };
  }

  uv__free(cpus);

  return 0;
}
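
/* Usage sketch (illustrative, mirrors the public API contract):
 *
 *   uv_cpu_info_t* cpus;
 *   int ncpu;
 *   if (0 == uv_cpu_info(&cpus, &ncpu)) {
 *     for (int i = 0; i < ncpu; i++)
 *       printf("%s @ %d MHz\n", cpus[i].model, cpus[i].speed);
 *     uv_free_cpu_info(cpus, ncpu);
 *   }
 *
 * The per-CPU times come from /proc/stat lines of the form
 * "cpuN user nice system idle iowait irq ..." (in jiffies); field 5
 * (iowait) is read into `skip` and discarded, and jiffies are converted
 * to milliseconds with the fixed 10x multiplier noted above. */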


static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
  if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)))
    return 1;
  if (ent->ifa_addr == NULL)
    return 1;
  /*
   * On Linux getifaddrs returns information related to the raw underlying
   * devices.  We're not interested in this information yet.
   */
  if (ent->ifa_addr->sa_family == PF_PACKET)
    return exclude_type;
  return !exclude_type;
}
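
/* Illustrative note: the return convention above is "nonzero means
 * exclude".  Called with UV__EXCLUDE_IFADDR (as in the address-gathering
 * passes below) it drops the packet-family entries and keeps everything
 * else; called with UV__EXCLUDE_IFPHYS it inverts, keeping only the
 * packet-family entries that carry the link-layer (MAC) address. */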

/* TODO(bnoordhuis) share with bsd-ifaddrs.c */
int uv_interface_addresses(uv_interface_address_t** addresses, int* count) {
  uv_interface_address_t* address;
  struct sockaddr_ll* sll;
  struct ifaddrs* addrs;
  struct ifaddrs* ent;
  size_t namelen;
  char* name;
  int i;

  *count = 0;
  *addresses = NULL;

  if (getifaddrs(&addrs))
    return UV__ERR(errno);

  /* Count the number of interfaces */
  namelen = 0;
  for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
    if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
      continue;

    namelen += strlen(ent->ifa_name) + 1;
    (*count)++;
  }

  if (*count == 0) {
    freeifaddrs(addrs);
    return 0;
  }

  /* Make sure the memory is initialized to zero using calloc() */
  *addresses = uv__calloc(1, *count * sizeof(**addresses) + namelen);
  if (*addresses == NULL) {
    freeifaddrs(addrs);
    return UV_ENOMEM;
  }

  name = (char*) &(*addresses)[*count];
  address = *addresses;

  for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
    if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
      continue;

    namelen = strlen(ent->ifa_name) + 1;
    address->name = memcpy(name, ent->ifa_name, namelen);
    name += namelen;

    if (ent->ifa_addr->sa_family == AF_INET6) {
      address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
    } else {
      address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
    }

    if (ent->ifa_netmask->sa_family == AF_INET6) {
      address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
    } else {
      address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
    }

    address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);

    address++;
  }

  /* Fill in physical addresses for each interface */
  for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
    if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS))
      continue;

    address = *addresses;

    for (i = 0; i < (*count); i++) {
      size_t namelen = strlen(ent->ifa_name);
      /* Alias interfaces share the same physical address */
      if (strncmp(address->name, ent->ifa_name, namelen) == 0 &&
          (address->name[namelen] == 0 || address->name[namelen] == ':')) {
        sll = (struct sockaddr_ll*) ent->ifa_addr;
        memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
      }
      address++;
    }
  }

  freeifaddrs(addrs);

  return 0;
}
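
/* Usage sketch (illustrative):
 *
 *   uv_interface_address_t* ifaces;
 *   int n;
 *   if (0 == uv_interface_addresses(&ifaces, &n)) {
 *     for (int i = 0; i < n; i++)
 *       printf("%s%s\n", ifaces[i].name,
 *              ifaces[i].is_internal ? " (loopback)" : "");
 *     uv_free_interface_addresses(ifaces, n);
 *   }
 *
 * Note the single-allocation layout: the name strings live in the same
 * uv__calloc() block, right after the array of n address records, which
 * is why uv_free_interface_addresses() needs only one uv__free(). */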


/* TODO(bnoordhuis) share with bsd-ifaddrs.c */
void uv_free_interface_addresses(uv_interface_address_t* addresses,
                                 int count) {
  uv__free(addresses);
}


void uv__set_process_title(const char* title) {
#if defined(PR_SET_NAME)
  prctl(PR_SET_NAME, title);  /* Only copies first 16 characters. */
#endif
}
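
/* Illustrative note: PR_SET_NAME writes the kernel's per-task comm field,
 * which is TASK_COMM_LEN (16) bytes including the terminating NUL, so at
 * most 15 characters of the title survive; longer titles are silently
 * truncated. */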


static uint64_t uv__read_proc_meminfo(const char* what) {
  uint64_t rc;
  char* p;
  char buf[4096];  /* Large enough to hold all of /proc/meminfo. */

  if (uv__slurp("/proc/meminfo", buf, sizeof(buf)))
    return 0;

  p = strstr(buf, what);

  if (p == NULL)
    return 0;

  p += strlen(what);

  rc = 0;
  sscanf(p, "%" PRIu64 " kB", &rc);

  return rc * 1024;
}
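
/* Illustrative: /proc/meminfo lines look like
 *
 *   MemTotal:       16384000 kB
 *   MemAvailable:   12345678 kB
 *
 * The caller passes the label including the colon (e.g. "MemAvailable:");
 * the value is parsed in kB and scaled to bytes by the final * 1024. */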


uint64_t uv_get_free_memory(void) {
  struct sysinfo info;
  uint64_t rc;

  rc = uv__read_proc_meminfo("MemAvailable:");

  if (rc != 0)
    return rc;

  if (0 == sysinfo(&info))
    return (uint64_t) info.freeram * info.mem_unit;

  return 0;
}


uint64_t uv_get_total_memory(void) {
  struct sysinfo info;
  uint64_t rc;

  rc = uv__read_proc_meminfo("MemTotal:");

  if (rc != 0)
    return rc;

  if (0 == sysinfo(&info))
    return (uint64_t) info.totalram * info.mem_unit;

  return 0;
}


static uint64_t uv__read_uint64(const char* filename) {
  char buf[32];  /* Large enough to hold an encoded uint64_t. */
  uint64_t rc;

  rc = 0;
  if (0 == uv__slurp(filename, buf, sizeof(buf)))
    if (1 != sscanf(buf, "%" PRIu64, &rc))
      if (0 == strcmp(buf, "max\n"))
        rc = UINT64_MAX;

  return rc;
}
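
/* Illustrative: cgroup v2 limit files contain either a number or the
 * literal string "max", e.g.
 *
 *   $ cat /sys/fs/cgroup/<group>/memory.max
 *   max
 *
 * which this helper maps to UINT64_MAX ("no limit"); a read or parse
 * failure yields 0, which callers treat as "not available". */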


/* Given a buffer with the contents of a cgroup1 /proc/self/cgroup,
 * finds the location and length of the memory controller mount path.
 * This disregards the leading / for easy concatenation of paths.
 * Returns NULL if the memory controller wasn't found. */
static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
                                                int* n) {
  char* p;

  /* Seek to the memory controller line. */
  p = strchr(buf, ':');
  while (p != NULL && strncmp(p, ":memory:", 8)) {
    p = strchr(p, '\n');
    if (p != NULL)
      p = strchr(p, ':');
  }

  if (p != NULL) {
    /* Determine the length of the mount path. */
    p = p + strlen(":memory:/");
    *n = (int) strcspn(p, "\n");
  }

  return p;
}
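
/* Worked example (illustrative): given a cgroup1 /proc/self/cgroup entry
 *
 *   9:memory:/user.slice
 *
 * the helper above returns a pointer to "user.slice" and sets *n to 10,
 * so callers can splice it into "/sys/fs/cgroup/memory/%.*s/...". */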

static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  char* p;
  int n;
  uint64_t cgroup1_max;

  /* Find out where the controller is mounted. */
  p = uv__cgroup1_find_memory_controller(buf, &n);
  if (p != NULL) {
    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", n, p);
    *high = uv__read_uint64(filename);

    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", n, p);
    *max = uv__read_uint64(filename);

    /* If the controller wasn't mounted, the reads above will have failed,
     * as indicated by uv__read_uint64 returning 0.
     */
    if (*high != 0 && *max != 0)
      goto update_limits;
  }

  /* Fall back to the limits of the global memory controller. */
  *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
  *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");

  /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
   * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
   */
update_limits:
  cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
  if (*high == cgroup1_max)
    *high = UINT64_MAX;
  if (*max == cgroup1_max)
    *max = UINT64_MAX;
}

static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  char* p;
  int n;

  /* Find out where the controller is mounted. */
  p = buf + strlen("0::/");
  n = (int) strcspn(p, "\n");

  /* Read the memory limits of the controller. */
  snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max", n, p);
  *max = uv__read_uint64(filename);
  snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high", n, p);
  *high = uv__read_uint64(filename);
}
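
/* Worked example (illustrative): with a cgroup v2 /proc/self/cgroup of
 *
 *   0::/mygroup
 *
 * the reads above target /sys/fs/cgroup/mygroup/memory.max and
 * /sys/fs/cgroup/mygroup/memory.high; the smaller of the two is then
 * reported as the constrained-memory figure below. */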

static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
  uint64_t high;
  uint64_t max;

  /* In the case of cgroupv2, we'll only have a single entry. */
  if (strncmp(buf, "0::/", 4))
    uv__get_cgroup1_memory_limits(buf, &high, &max);
  else
    uv__get_cgroup2_memory_limits(buf, &high, &max);

  if (high == 0 || max == 0)
    return 0;

  return high < max ? high : max;
}

uint64_t uv_get_constrained_memory(void) {
  char buf[1024];

  if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
    return 0;

  return uv__get_cgroup_constrained_memory(buf);
}


static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
  char filename[4097];
  uint64_t current;
  char* p;
  int n;

  /* Find out where the controller is mounted. */
  p = uv__cgroup1_find_memory_controller(buf, &n);
  if (p != NULL) {
    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", n, p);
    current = uv__read_uint64(filename);

    /* If the controller wasn't mounted, the reads above will have failed,
     * as indicated by uv__read_uint64 returning 0.
     */
    if (current != 0)
      return current;
  }

  /* Fall back to the usage of the global memory controller. */
  return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
}

static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
  char filename[4097];
  char* p;
  int n;

  /* Find out where the controller is mounted. */
  p = buf + strlen("0::/");
  n = (int) strcspn(p, "\n");

  snprintf(filename, sizeof(filename),
           "/sys/fs/cgroup/%.*s/memory.current", n, p);
  return uv__read_uint64(filename);
}

uint64_t uv_get_available_memory(void) {
  char buf[1024];
  uint64_t constrained;
  uint64_t current;
  uint64_t total;

  if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
    return 0;

  constrained = uv__get_cgroup_constrained_memory(buf);
  if (constrained == 0)
    return uv_get_free_memory();

  total = uv_get_total_memory();
  if (constrained > total)
    return uv_get_free_memory();

  /* In the case of cgroupv2, we'll only have a single entry. */
  if (strncmp(buf, "0::/", 4))
    current = uv__get_cgroup1_current_memory(buf);
  else
    current = uv__get_cgroup2_current_memory(buf);

  /* memory usage can be higher than the limit (for short bursts of time) */
  if (constrained < current)
    return 0;

  return constrained - current;
}
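
/* Worked example (illustrative): inside a container with memory.max set
 * to 512 MiB and memory.current at 128 MiB, the function returns
 * 512 - 128 = 384 MiB.  When no cgroup limit applies (or the limit
 * exceeds physical RAM), it falls back to uv_get_free_memory(). */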


static int uv__get_cgroupv2_constrained_cpu(const char* cgroup,
                                            long long* quota) {
  static const char cgroup_mount[] = "/sys/fs/cgroup";
  const char* cgroup_trimmed;
  char buf[1024];
  char full_path[256];
  char path[256];
  char quota_buf[16];
  char* last_slash;
  int cgroup_size;
  long long limit;
  long long min_quota;
  long long period;

  if (strncmp(cgroup, "0::/", 4) != 0)
    return UV_EINVAL;

  /* Skip the "0::/" prefix and measure the path up to the trailing \n. */
  cgroup_trimmed = cgroup + sizeof("0::/") - 1;
  cgroup_size = (int) strcspn(cgroup_trimmed, "\n");
  min_quota = LLONG_MAX;

  /* Construct the path to the cgroup.controllers file. */
  snprintf(path, sizeof(path), "%s/%.*s/cgroup.controllers", cgroup_mount,
           cgroup_size, cgroup_trimmed);

  /* If the controllers file doesn't exist, this isn't really a cgroup. */
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  snprintf(path, sizeof(path), "%s/%.*s", cgroup_mount, cgroup_size,
           cgroup_trimmed);

  /*
   * Traverse up the cgroup v2 hierarchy, starting from the current cgroup
   * path.  At each level, attempt to read the "cpu.max" file, which defines
   * the CPU quota and period.
   *
   * This reflects how Linux applies cgroup limits hierarchically.
   *
   * e.g. given a path like /sys/fs/cgroup/foo/bar/baz, we check:
   *   - /sys/fs/cgroup/foo/bar/baz/cpu.max
   *   - /sys/fs/cgroup/foo/bar/cpu.max
   *   - /sys/fs/cgroup/foo/cpu.max
   *   - /sys/fs/cgroup/cpu.max
   */
  while (strncmp(path, cgroup_mount, strlen(cgroup_mount)) == 0) {
    snprintf(full_path, sizeof(full_path), "%s/cpu.max", path);

    /* Silently ignore and continue if the file does not exist. */
    if (uv__slurp(full_path, quota_buf, sizeof(quota_buf)) < 0)
      goto next;

    /* No limit at this level, move on. */
    if (strncmp(quota_buf, "max", 3) == 0)
      goto next;

    /* Read cpu.max: "<quota> <period>". */
    if (sscanf(quota_buf, "%lld %lld", &limit, &period) != 2)
      goto next;

    /* Can't divide by 0. */
    if (period == 0)
      goto next;

    *quota = limit / period;
    if (*quota < min_quota)
      min_quota = *quota;

next:
    /* Move up one level in the cgroup hierarchy by trimming the last path
     * component.  The loop ends once we reach the cgroup root mount point.
     */
    last_slash = strrchr(path, '/');
    if (last_slash == NULL || strcmp(path, cgroup_mount) == 0)
      break;
    *last_slash = '\0';
  }

  /* Report the smallest quota seen anywhere in the hierarchy; without this,
   * *quota would be left at the value from the topmost limited ancestor.
   */
  if (min_quota != LLONG_MAX)
    *quota = min_quota;

  return 0;
}
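
/* Worked example (illustrative): a cpu.max containing
 *
 *   200000 100000
 *
 * grants 200 ms of CPU time per 100 ms period, i.e. a quota of 2 CPUs.
 * Note the integer division: "150000 100000" (1.5 CPUs) also yields 1.
 * If an ancestor group says "100000 100000", the hierarchy walk above
 * settles on the smaller value, 1. */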

static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
                                             int* cgroup_size) {
  /* Seek to the cpu controller line. */
  char* cgroup_cpu = strstr(cgroup, ":cpu,");

  if (cgroup_cpu != NULL) {
    /* Skip the controller prefix to the start of the cgroup path. */
    cgroup_cpu += sizeof(":cpu,") - 1;
    /* Determine the length of the cgroup path, excluding the newline. */
    *cgroup_size = (int) strcspn(cgroup_cpu, "\n");
  }

  return cgroup_cpu;
}

static int uv__get_cgroupv1_constrained_cpu(const char* cgroup,
                                            long long* quota) {
  char path[256];
  char buf[1024];
  int cgroup_size;
  char* cgroup_cpu;
  long long period_length;
  long long quota_per_period;

  cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size);

  if (cgroup_cpu == NULL)
    return UV_EIO;

  /* Construct the path to the cpu.cfs_quota_us file. */
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us",
           cgroup_size, cgroup_cpu);

  /* Read cpu.cfs_quota_us. */
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  if (sscanf(buf, "%lld", &quota_per_period) != 1)
    return UV_EINVAL;

  /* Construct the path to the cpu.cfs_period_us file. */
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us",
           cgroup_size, cgroup_cpu);

  /* Read cpu.cfs_period_us. */
  if (uv__slurp(path, buf, sizeof(buf)) < 0)
    return UV_EIO;

  if (sscanf(buf, "%lld", &period_length) != 1)
    return UV_EINVAL;

  /* Can't divide by 0. */
  if (period_length == 0)
    return UV_EINVAL;

  *quota = quota_per_period / period_length;

  return 0;
}
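
/* Worked example (illustrative): cgroup v1 keeps the two numbers in
 * separate files, e.g.
 *
 *   cpu.cfs_quota_us:  150000
 *   cpu.cfs_period_us: 100000
 *
 * 150000 / 100000 truncates to 1, so fractional CPU allocations round
 * down here as well.  A cfs_quota_us of -1 means "unlimited" in v1. */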

int uv__get_constrained_cpu(long long* quota) {
  char cgroup[1024];

  /* Read the cgroup from /proc/self/cgroup. */
  if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0)
    return UV_EIO;

  /* Check if the system is using cgroup v2 by examining /proc/self/cgroup.
   * The entry for cgroup v2 is always in the format "0::$PATH",
   * see https://docs.kernel.org/admin-guide/cgroup-v2.html */
  if (strncmp(cgroup, "0::/", 4) == 0)
    return uv__get_cgroupv2_constrained_cpu(cgroup, quota);
  else
    return uv__get_cgroupv1_constrained_cpu(cgroup, quota);
}


void uv_loadavg(double avg[3]) {
  struct sysinfo info;
  char buf[128];  /* Large enough to hold all of /proc/loadavg. */

  if (0 == uv__slurp("/proc/loadavg", buf, sizeof(buf)))
    if (3 == sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]))
      return;

  if (sysinfo(&info) < 0)
    return;

  avg[0] = (double) info.loads[0] / 65536.0;
  avg[1] = (double) info.loads[1] / 65536.0;
  avg[2] = (double) info.loads[2] / 65536.0;
}
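
/* Illustrative: /proc/loadavg reads like
 *
 *   0.52 0.58 0.59 1/467 12345
 *
 * and only the first three numbers (the 1/5/15-minute averages) are
 * consumed.  The sysinfo() fallback reports the same averages as
 * fixed-point values scaled by 2^16, hence the division by 65536.0. */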


static int compare_watchers(const struct watcher_list* a,
                            const struct watcher_list* b) {
  if (a->wd < b->wd) return -1;
  if (a->wd > b->wd) return 1;
  return 0;
}


static int init_inotify(uv_loop_t* loop) {
  int err;
  int fd;

  if (loop->inotify_fd != -1)
    return 0;

  fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
  if (fd < 0)
    return UV__ERR(errno);

  err = uv__io_init_start(loop, &loop->inotify_read_watcher, uv__inotify_read,
                          fd, POLLIN);
  if (err) {
    uv__close(fd);
    return err;
  }

  loop->inotify_fd = fd;
  return 0;
}


static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
  /* Open the inotify_fd, and re-arm all the inotify watchers. */
  int err;
  struct watcher_list* tmp_watcher_list_iter;
  struct watcher_list* watcher_list;
  struct watcher_list tmp_watcher_list;
  struct uv__queue queue;
  struct uv__queue* q;
  uv_fs_event_t* handle;
  char* tmp_path;

  if (root == NULL)
    return 0;

  /* We must restore the old watcher list to be able to close items
   * out of it.
   */
  loop->inotify_watchers = root;

  uv__queue_init(&tmp_watcher_list.watchers);
  /* Note that the queue we use is shared with the start and stop()
   * functions, making uv__queue_foreach unsafe to use. So we use the
   * uv__queue_move trick to safely iterate. Also don't free the watcher
   * list until we're done iterating. c.f. uv__inotify_read.
   */
  RB_FOREACH_SAFE(watcher_list, watcher_root,
                  uv__inotify_watchers(loop), tmp_watcher_list_iter) {
    watcher_list->iterating = 1;
    uv__queue_move(&watcher_list->watchers, &queue);
    while (!uv__queue_empty(&queue)) {
      q = uv__queue_head(&queue);
      handle = uv__queue_data(q, uv_fs_event_t, watchers);
      /* It's critical to keep a copy of path here, because it
       * will be set to NULL by stop() and then deallocated by
       * maybe_free_watcher_list
       */
      tmp_path = uv__strdup(handle->path);
      assert(tmp_path != NULL);
      uv__queue_remove(q);
      uv__queue_insert_tail(&watcher_list->watchers, q);
      uv_fs_event_stop(handle);

      uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
      handle->path = tmp_path;
    }
    watcher_list->iterating = 0;
    maybe_free_watcher_list(watcher_list, loop);
  }

  uv__queue_move(&tmp_watcher_list.watchers, &queue);
  while (!uv__queue_empty(&queue)) {
    q = uv__queue_head(&queue);
    uv__queue_remove(q);
    handle = uv__queue_data(q, uv_fs_event_t, watchers);
    tmp_path = handle->path;
    handle->path = NULL;
    err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
    uv__free(tmp_path);
    if (err)
      return err;
  }

  return 0;
}


static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) {
  struct watcher_list w;
  w.wd = wd;
  return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w);
}


static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) {
  /* if the watcher_list->watchers is being iterated over, we can't free it. */
  if ((!w->iterating) && uv__queue_empty(&w->watchers)) {
    /* No watchers left for this path. Clean up. */
    RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w);
    inotify_rm_watch(loop->inotify_fd, w->wd);
    uv__free(w);
  }
}


static void uv__inotify_read(uv_loop_t* loop,
                             uv__io_t* dummy,
                             unsigned int events) {
  const struct inotify_event* e;
  struct watcher_list* w;
  uv_fs_event_t* h;
  struct uv__queue queue;
  struct uv__queue* q;
  const char* path;
  ssize_t size;
  const char* p;
  /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
  char buf[4096];

  for (;;) {
    do
      size = read(loop->inotify_fd, buf, sizeof(buf));
    while (size == -1 && errno == EINTR);

    if (size == -1) {
      assert(errno == EAGAIN || errno == EWOULDBLOCK);
      break;
    }

    assert(size > 0);  /* pre-2.6.21 thing, size=0 == read buffer too small */

    /* Now we have one or more inotify_event structs. */
    for (p = buf; p < buf + size; p += sizeof(*e) + e->len) {
      e = (const struct inotify_event*) p;

      events = 0;
      if (e->mask & (IN_ATTRIB|IN_MODIFY))
        events |= UV_CHANGE;
      if (e->mask & ~(IN_ATTRIB|IN_MODIFY))
        events |= UV_RENAME;

      w = find_watcher(loop, e->wd);
      if (w == NULL)
        continue;  /* Stale event, no watchers left. */

      /* inotify does not return the filename when monitoring a single file
       * for modifications. Repurpose the filename for API compatibility.
       * I'm not convinced this is a good thing, maybe it should go.
       */
      path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path);

      /* We're about to iterate over the queue and call user's callbacks.
       * What can go wrong?
       * A callback could call uv_fs_event_stop()
       * and the queue can change under our feet.
       * So, we use uv__queue_move() trick to safely iterate over the queue.
       * And we don't free the watcher_list until we're done iterating.
       *
       * First,
       * tell uv_fs_event_stop() (that could be called from a user's callback)
       * not to free watcher_list.
       */
      w->iterating = 1;
      uv__queue_move(&w->watchers, &queue);
      while (!uv__queue_empty(&queue)) {
        q = uv__queue_head(&queue);
        h = uv__queue_data(q, uv_fs_event_t, watchers);

        uv__queue_remove(q);
        uv__queue_insert_tail(&w->watchers, q);

        h->cb(h, path, events, 0);
      }
      /* done iterating, time to (maybe) free empty watcher_list */
      w->iterating = 0;
      maybe_free_watcher_list(w, loop);
    }
  }
}
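
/* Illustrative note: the read() above returns a packed sequence of
 * variable-length records, each a struct inotify_event immediately
 * followed by e->len bytes of (NUL-padded) filename:
 *
 *   | wd mask cookie len | name... | wd mask cookie len | name... | ...
 *
 * which is why the inner loop advances by sizeof(*e) + e->len rather
 * than by a fixed stride. */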


int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
  uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
  return 0;
}


int uv_fs_event_start(uv_fs_event_t* handle,
                      uv_fs_event_cb cb,
                      const char* path,
                      unsigned int flags) {
  struct watcher_list* w;
  uv_loop_t* loop;
  size_t len;
  int events;
  int err;
  int wd;

  if (uv__is_active(handle))
    return UV_EINVAL;

  loop = handle->loop;

  err = init_inotify(loop);
  if (err)
    return err;

  events = IN_ATTRIB
         | IN_CREATE
         | IN_MODIFY
         | IN_DELETE
         | IN_DELETE_SELF
         | IN_MOVE_SELF
         | IN_MOVED_FROM
         | IN_MOVED_TO;

  wd = inotify_add_watch(loop->inotify_fd, path, events);
  if (wd == -1)
    return UV__ERR(errno);

  w = find_watcher(loop, wd);
  if (w)
    goto no_insert;

  len = strlen(path) + 1;
  w = uv__malloc(sizeof(*w) + len);
  if (w == NULL)
    return UV_ENOMEM;

  w->wd = wd;
  w->path = memcpy(w + 1, path, len);
  uv__queue_init(&w->watchers);
  w->iterating = 0;
  RB_INSERT(watcher_root, uv__inotify_watchers(loop), w);

no_insert:
  uv__handle_start(handle);
  uv__queue_insert_tail(&w->watchers, &handle->watchers);
  handle->path = w->path;
  handle->cb = cb;
  handle->wd = wd;

  return 0;
}
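
/* Usage sketch (illustrative; on_change is a hypothetical callback):
 *
 *   static void on_change(uv_fs_event_t* h, const char* fname,
 *                         int events, int status) {
 *     if (events & UV_RENAME) { ... }
 *     if (events & UV_CHANGE) { ... }
 *   }
 *
 *   uv_fs_event_t fev;
 *   uv_fs_event_init(loop, &fev);
 *   uv_fs_event_start(&fev, on_change, "/some/path", 0);
 *
 * Watches on the same path share one watcher_list keyed by the inotify
 * watch descriptor, so multiple handles for one path reuse a single wd. */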


int uv_fs_event_stop(uv_fs_event_t* handle) {
  struct watcher_list* w;

  if (!uv__is_active(handle))
    return 0;

  w = find_watcher(handle->loop, handle->wd);
  assert(w != NULL);

  handle->wd = -1;
  handle->path = NULL;
  uv__handle_stop(handle);
  uv__queue_remove(&handle->watchers);

  maybe_free_watcher_list(w, handle->loop);

  return 0;
}


void uv__fs_event_close(uv_fs_event_t* handle) {
  uv_fs_event_stop(handle);
}