diff third_party/libuv/src/unix/linux.c @ 160:948de3f54cea
[ThirdParty] Added libuv
| author | June Park <parkjune1995@gmail.com> |
|---|---|
| date | Wed, 14 Jan 2026 19:39:52 -0800 |
| parents | |
| children | |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/third_party/libuv/src/unix/linux.c Wed Jan 14 19:39:52 2026 -0800 @@ -0,0 +1,2761 @@ +/* Copyright Joyent, Inc. and other Node contributors. All rights reserved. + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their + * EPOLL* counterparts. We use the POLL* variants in this file because that + * is what libuv uses elsewhere. + */ + +#include "uv.h" +#include "internal.h" + +#include <inttypes.h> +#include <stdatomic.h> +#include <stddef.h> /* offsetof */ +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <errno.h> + +#include <fcntl.h> +#include <ifaddrs.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netpacket/packet.h> +#include <sys/epoll.h> +#include <sys/inotify.h> +#include <sys/mman.h> +#include <sys/param.h> +#include <sys/prctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/sysinfo.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/utsname.h> +#include <time.h> +#include <unistd.h> + +#ifndef __NR_io_uring_setup +# define __NR_io_uring_setup 425 +#endif + +#ifndef __NR_io_uring_enter +# define __NR_io_uring_enter 426 +#endif + +#ifndef __NR_io_uring_register +# define __NR_io_uring_register 427 +#endif + +#ifndef __NR_copy_file_range +# if defined(__x86_64__) +# define __NR_copy_file_range 326 +# elif defined(__i386__) +# define __NR_copy_file_range 377 +# elif defined(__s390__) +# define __NR_copy_file_range 375 +# elif defined(__arm__) +# define __NR_copy_file_range 391 +# elif defined(__aarch64__) +# define __NR_copy_file_range 285 +# elif defined(__powerpc__) +# define __NR_copy_file_range 379 +# elif defined(__arc__) +# define __NR_copy_file_range 285 +# elif defined(__riscv) +# define __NR_copy_file_range 285 +# endif +#endif /* __NR_copy_file_range */ + +#ifndef __NR_statx +# if defined(__x86_64__) +# define __NR_statx 332 +# elif defined(__i386__) +# define __NR_statx 383 +# elif defined(__aarch64__) +# define __NR_statx 397 +# elif defined(__arm__) +# define __NR_statx 397 +# elif defined(__ppc__) +# define __NR_statx 383 +# elif defined(__s390__) +# define __NR_statx 379 +# elif defined(__riscv) +# define __NR_statx 291 +# endif +#endif /* __NR_statx */ + +#ifndef __NR_getrandom +# if defined(__x86_64__) +# define __NR_getrandom 318 +# elif defined(__i386__) +# define __NR_getrandom 355 +# 
elif defined(__aarch64__) +# define __NR_getrandom 384 +# elif defined(__arm__) +# define __NR_getrandom 384 +# elif defined(__ppc__) +# define __NR_getrandom 359 +# elif defined(__s390__) +# define __NR_getrandom 349 +# elif defined(__riscv) +# define __NR_getrandom 278 +# endif +#endif /* __NR_getrandom */ + +enum { + UV__IORING_SETUP_SQPOLL = 2u, + UV__IORING_SETUP_NO_SQARRAY = 0x10000u, +}; + +enum { + UV__IORING_FEAT_SINGLE_MMAP = 1u, + UV__IORING_FEAT_NODROP = 2u, + UV__IORING_FEAT_RSRC_TAGS = 1024u, /* linux v5.13 */ +}; + +enum { + UV__IORING_OP_READV = 1, + UV__IORING_OP_WRITEV = 2, + UV__IORING_OP_FSYNC = 3, + UV__IORING_OP_OPENAT = 18, + UV__IORING_OP_CLOSE = 19, + UV__IORING_OP_STATX = 21, + UV__IORING_OP_EPOLL_CTL = 29, + UV__IORING_OP_RENAMEAT = 35, + UV__IORING_OP_UNLINKAT = 36, + UV__IORING_OP_MKDIRAT = 37, + UV__IORING_OP_SYMLINKAT = 38, + UV__IORING_OP_LINKAT = 39, + UV__IORING_OP_FTRUNCATE = 55, +}; + +enum { + UV__IORING_ENTER_GETEVENTS = 1u, + UV__IORING_ENTER_SQ_WAKEUP = 2u, +}; + +enum { + UV__IORING_SQ_NEED_WAKEUP = 1u, + UV__IORING_SQ_CQ_OVERFLOW = 2u, +}; + +struct uv__io_cqring_offsets { + uint32_t head; + uint32_t tail; + uint32_t ring_mask; + uint32_t ring_entries; + uint32_t overflow; + uint32_t cqes; + uint64_t reserved0; + uint64_t reserved1; +}; + +STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets)); + +struct uv__io_sqring_offsets { + uint32_t head; + uint32_t tail; + uint32_t ring_mask; + uint32_t ring_entries; + uint32_t flags; + uint32_t dropped; + uint32_t array; + uint32_t reserved0; + uint64_t reserved1; +}; + +STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets)); + +struct uv__io_uring_cqe { + uint64_t user_data; + int32_t res; + uint32_t flags; +}; + +STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe)); + +struct uv__io_uring_sqe { + uint8_t opcode; + uint8_t flags; + uint16_t ioprio; + int32_t fd; + union { + uint64_t off; + uint64_t addr2; + }; + union { + uint64_t addr; + }; + uint32_t len; + union { + uint32_t rw_flags; + uint32_t fsync_flags; + uint32_t open_flags; + uint32_t statx_flags; + }; + uint64_t user_data; + union { + uint16_t buf_index; + uint64_t pad[3]; + }; +}; + +STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe)); +STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode)); +STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags)); +STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio)); +STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd)); +STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off)); +STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr)); +STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len)); +STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags)); +STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data)); +STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index)); + +struct uv__io_uring_params { + uint32_t sq_entries; + uint32_t cq_entries; + uint32_t flags; + uint32_t sq_thread_cpu; + uint32_t sq_thread_idle; + uint32_t features; + uint32_t reserved[4]; + struct uv__io_sqring_offsets sq_off; /* 40 bytes */ + struct uv__io_cqring_offsets cq_off; /* 40 bytes */ +}; + +STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params)); +STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off)); +STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off)); + +STATIC_ASSERT(EPOLL_CTL_ADD < 4); +STATIC_ASSERT(EPOLL_CTL_DEL < 4); +STATIC_ASSERT(EPOLL_CTL_MOD < 4); + +struct watcher_list { + RB_ENTRY(watcher_list) entry; + struct 
uv__queue watchers; + int iterating; + char* path; + int wd; +}; + +struct watcher_root { + struct watcher_list* rbh_root; +}; + +static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root); +static void uv__inotify_read(uv_loop_t* loop, + uv__io_t* w, + unsigned int revents); +static int compare_watchers(const struct watcher_list* a, + const struct watcher_list* b); +static void maybe_free_watcher_list(struct watcher_list* w, + uv_loop_t* loop); + +static void uv__epoll_ctl_flush(int epollfd, + struct uv__iou* ctl, + struct epoll_event (*events)[256]); + +static void uv__epoll_ctl_prep(int epollfd, + struct uv__iou* ctl, + struct epoll_event (*events)[256], + int op, + int fd, + struct epoll_event* e); + +RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers) + + +static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) { + /* This cast works because watcher_root is a struct with a pointer as its + * sole member. Such type punning is unsafe in the presence of strict + * pointer aliasing (and is just plain nasty) but that is why libuv + * is compiled with -fno-strict-aliasing. + */ + return (struct watcher_root*) &loop->inotify_watchers; +} + + +unsigned uv__kernel_version(void) { + static _Atomic unsigned cached_version; + struct utsname u; + unsigned version; + unsigned major; + unsigned minor; + unsigned patch; + char v_sig[256]; + char* needle; + + version = atomic_load_explicit(&cached_version, memory_order_relaxed); + if (version != 0) + return version; + + /* Check /proc/version_signature first as it's the way to get the mainline + * kernel version in Ubuntu. The format is: + * Ubuntu ubuntu_kernel_version mainline_kernel_version + * For example: + * Ubuntu 5.15.0-79.86-generic 5.15.111 + */ + if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig))) + if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch)) + goto calculate_version; + + if (-1 == uname(&u)) + return 0; + + /* In Debian we need to check `version` instead of `release` to extract the + * mainline kernel version. This is an example of how it looks like: + * #1 SMP Debian 5.10.46-4 (2021-08-03) + */ + needle = strstr(u.version, "Debian "); + if (needle != NULL) + if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch)) + goto calculate_version; + + if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch)) + return 0; + + /* Handle it when the process runs under the UNAME26 personality: + * + * - kernels >= 3.x identify as 2.6.40+x + * - kernels >= 4.x identify as 2.6.60+x + * + * UNAME26 is a poorly conceived hack that doesn't let us distinguish + * between 4.x kernels and 5.x/6.x kernels so we conservatively assume + * that 2.6.60+x means 4.x. + * + * Fun fact of the day: it's technically possible to observe the actual + * kernel version for a brief moment because uname() first copies out the + * real release string before overwriting it with the backcompat string. 
+ */ + if (major == 2 && minor == 6) { + if (patch >= 60) { + major = 4; + minor = patch - 60; + patch = 0; + } else if (patch >= 40) { + major = 3; + minor = patch - 40; + patch = 0; + } + } + +calculate_version: + version = major * 65536 + minor * 256 + patch; + atomic_store_explicit(&cached_version, version, memory_order_relaxed); + + return version; +} + + +ssize_t +uv__fs_copy_file_range(int fd_in, + off_t* off_in, + int fd_out, + off_t* off_out, + size_t len, + unsigned int flags) +{ +#ifdef __NR_copy_file_range + return syscall(__NR_copy_file_range, + fd_in, + off_in, + fd_out, + off_out, + len, + flags); +#else + return errno = ENOSYS, -1; +#endif +} + + +int uv__statx(int dirfd, + const char* path, + int flags, + unsigned int mask, + struct uv__statx* statxbuf) { +#if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30 + return errno = ENOSYS, -1; +#else + int rc; + + rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf); + if (rc >= 0) + uv__msan_unpoison(statxbuf, sizeof(*statxbuf)); + + return rc; +#endif +} + + +ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) { +#if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28 + return errno = ENOSYS, -1; +#else + ssize_t rc; + + rc = syscall(__NR_getrandom, buf, buflen, flags); + if (rc >= 0) + uv__msan_unpoison(buf, buflen); + + return rc; +#endif +} + + +int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) { + return syscall(__NR_io_uring_setup, entries, params); +} + + +int uv__io_uring_enter(int fd, + unsigned to_submit, + unsigned min_complete, + unsigned flags) { + /* io_uring_enter used to take a sigset_t but it's unused + * in newer kernels unless IORING_ENTER_EXT_ARG is set, + * in which case it takes a struct io_uring_getevents_arg. + */ + return syscall(__NR_io_uring_enter, + fd, + to_submit, + min_complete, + flags, + NULL, + 0L); +} + + +int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) { + return syscall(__NR_io_uring_register, fd, opcode, arg, nargs); +} + + +static int uv__use_io_uring(uint32_t flags) { +#if defined(__ANDROID_API__) + return 0; /* Possibly available but blocked by seccomp. */ +#elif defined(__arm__) && __SIZEOF_POINTER__ == 4 + /* See https://github.com/libuv/libuv/issues/4158. */ + return 0; /* All 32 bits kernels appear buggy. */ +#elif defined(__powerpc64__) || defined(__ppc64__) + /* See https://github.com/libuv/libuv/issues/4283. */ + return 0; /* Random SIGSEGV in signal handler. */ +#else + /* Ternary: unknown=0, yes=1, no=-1 */ + static _Atomic int use_io_uring; + char* val; + int use; + +#if defined(__hppa__) + /* io_uring first supported on parisc in 6.1, functional in .51 + * https://lore.kernel.org/all/[email protected]/ + */ + if (uv__kernel_version() < /*6.1.51*/0x060133) + return 0; +#endif + + /* SQPOLL is all kinds of buggy but epoll batching should work fine. */ + if (0 == (flags & UV__IORING_SETUP_SQPOLL)) + return 1; + + /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */ + if (uv__kernel_version() < /*5.10.186*/0x050ABA) + return 0; + + use = atomic_load_explicit(&use_io_uring, memory_order_relaxed); + + if (use == 0) { + val = getenv("UV_USE_IO_URING"); + use = val != NULL && atoi(val) > 0 ? 
1 : -1; + atomic_store_explicit(&use_io_uring, use, memory_order_relaxed); + } + + return use > 0; +#endif +} + + +static void uv__iou_init(int epollfd, + struct uv__iou* iou, + uint32_t entries, + uint32_t flags) { + struct uv__io_uring_params params; + struct epoll_event e; + size_t cqlen; + size_t sqlen; + size_t maxlen; + size_t sqelen; + unsigned kernel_version; + uint32_t* sqarray; + uint32_t i; + char* sq; + char* sqe; + int ringfd; + int no_sqarray; + + sq = MAP_FAILED; + sqe = MAP_FAILED; + + if (!uv__use_io_uring(flags)) + return; + + kernel_version = uv__kernel_version(); + no_sqarray = + UV__IORING_SETUP_NO_SQARRAY * (kernel_version >= /* 6.6 */0x060600); + + /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement. + * Mostly academic because we check for a v5.13 kernel afterwards anyway. + */ + memset(¶ms, 0, sizeof(params)); + params.flags = flags | no_sqarray; + + if (flags & UV__IORING_SETUP_SQPOLL) + params.sq_thread_idle = 10; /* milliseconds */ + + /* Kernel returns a file descriptor with O_CLOEXEC flag set. */ + ringfd = uv__io_uring_setup(entries, ¶ms); + if (ringfd == -1) + return; + + /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're + * actually detecting is whether IORING_OP_STATX works with SQPOLL. + */ + if (!(params.features & UV__IORING_FEAT_RSRC_TAGS)) + goto fail; + + /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */ + if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP)) + goto fail; + + /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */ + if (!(params.features & UV__IORING_FEAT_NODROP)) + goto fail; + + sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t); + cqlen = + params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe); + maxlen = sqlen < cqlen ? cqlen : sqlen; + sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe); + + sq = mmap(0, + maxlen, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, + ringfd, + 0); /* IORING_OFF_SQ_RING */ + + sqe = mmap(0, + sqelen, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, + ringfd, + 0x10000000ull); /* IORING_OFF_SQES */ + + if (sq == MAP_FAILED || sqe == MAP_FAILED) + goto fail; + + if (flags & UV__IORING_SETUP_SQPOLL) { + /* Only interested in completion events. To get notified when + * the kernel pulls items from the submission ring, add POLLOUT. + */ + memset(&e, 0, sizeof(e)); + e.events = POLLIN; + e.data.fd = ringfd; + + if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e)) + goto fail; + } + + iou->sqhead = (uint32_t*) (sq + params.sq_off.head); + iou->sqtail = (uint32_t*) (sq + params.sq_off.tail); + iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask); + iou->sqflags = (uint32_t*) (sq + params.sq_off.flags); + iou->cqhead = (uint32_t*) (sq + params.cq_off.head); + iou->cqtail = (uint32_t*) (sq + params.cq_off.tail); + iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask); + iou->sq = sq; + iou->cqe = sq + params.cq_off.cqes; + iou->sqe = sqe; + iou->sqlen = sqlen; + iou->cqlen = cqlen; + iou->maxlen = maxlen; + iou->sqelen = sqelen; + iou->ringfd = ringfd; + iou->in_flight = 0; + + if (no_sqarray) + return; + + sqarray = (uint32_t*) (sq + params.sq_off.array); + for (i = 0; i <= iou->sqmask; i++) + sqarray[i] = i; /* Slot -> sqe identity mapping. 
*/ + + return; + +fail: + if (sq != MAP_FAILED) + munmap(sq, maxlen); + + if (sqe != MAP_FAILED) + munmap(sqe, sqelen); + + uv__close(ringfd); +} + + +static void uv__iou_delete(struct uv__iou* iou) { + if (iou->ringfd > -1) { + munmap(iou->sq, iou->maxlen); + munmap(iou->sqe, iou->sqelen); + uv__close(iou->ringfd); + iou->ringfd = -1; + } +} + + +int uv__platform_loop_init(uv_loop_t* loop) { + uv__loop_internal_fields_t* lfields; + + lfields = uv__get_internal_fields(loop); + lfields->ctl.ringfd = -1; + lfields->iou.ringfd = -2; /* "uninitialized" */ + + loop->inotify_watchers = NULL; + loop->inotify_fd = -1; + loop->backend_fd = epoll_create1(O_CLOEXEC); + + if (loop->backend_fd == -1) + return UV__ERR(errno); + + uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0); + + return 0; +} + + +int uv__io_fork(uv_loop_t* loop) { + int err; + struct watcher_list* root; + + root = uv__inotify_watchers(loop)->rbh_root; + + uv__close(loop->backend_fd); + loop->backend_fd = -1; + + /* TODO(bnoordhuis) Loses items from the submission and completion rings. */ + uv__platform_loop_delete(loop); + + err = uv__platform_loop_init(loop); + if (err) + return err; + + return uv__inotify_fork(loop, root); +} + + +void uv__platform_loop_delete(uv_loop_t* loop) { + uv__loop_internal_fields_t* lfields; + + lfields = uv__get_internal_fields(loop); + uv__iou_delete(&lfields->ctl); + uv__iou_delete(&lfields->iou); + + if (loop->inotify_fd != -1) { + uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN); + uv__close(loop->inotify_fd); + loop->inotify_fd = -1; + } +} + + +struct uv__invalidate { + struct epoll_event (*prep)[256]; + struct epoll_event* events; + int nfds; +}; + + +void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) { + uv__loop_internal_fields_t* lfields; + struct uv__invalidate* inv; + struct epoll_event dummy; + int i; + + lfields = uv__get_internal_fields(loop); + inv = lfields->inv; + + /* Invalidate events with same file descriptor */ + if (inv != NULL) + for (i = 0; i < inv->nfds; i++) + if (inv->events[i].data.fd == fd) + inv->events[i].data.fd = -1; + + /* Remove the file descriptor from the epoll. + * This avoids a problem where the same file description remains open + * in another process, causing repeated junk epoll events. + * + * Perform EPOLL_CTL_DEL immediately instead of going through + * io_uring's submit queue, otherwise the file descriptor may + * be closed by the time the kernel starts the operation. + * + * We pass in a dummy epoll_event, to work around a bug in old kernels. + * + * Work around a bug in kernels 3.10 to 3.19 where passing a struct that + * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings. + */ + memset(&dummy, 0, sizeof(dummy)); + epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy); +} + + +int uv__io_check_fd(uv_loop_t* loop, int fd) { + struct epoll_event e; + int rc; + + memset(&e, 0, sizeof(e)); + e.events = POLLIN; + e.data.fd = -1; + + rc = 0; + if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e)) + if (errno != EEXIST) + rc = UV__ERR(errno); + + if (rc == 0) + if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e)) + abort(); + + return rc; +} + + +/* Caller must initialize SQE and call uv__iou_submit(). */ +static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou, + uv_loop_t* loop, + uv_fs_t* req) { + struct uv__io_uring_sqe* sqe; + uint32_t head; + uint32_t tail; + uint32_t mask; + uint32_t slot; + + /* Lazily create the ring. State machine: -2 means uninitialized, -1 means + * initialization failed. 
Anything else is a valid ring file descriptor. + */ + if (iou->ringfd == -2) { + /* By default, the SQPOLL is not created. Enable only if the loop is + * configured with UV_LOOP_USE_IO_URING_SQPOLL and the UV_USE_IO_URING + * environment variable is unset or a positive number. + */ + if (loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL) + if (uv__use_io_uring(UV__IORING_SETUP_SQPOLL)) + uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL); + + if (iou->ringfd == -2) + iou->ringfd = -1; /* "failed" */ + } + + if (iou->ringfd == -1) + return NULL; + + head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead, + memory_order_acquire); + tail = *iou->sqtail; + mask = iou->sqmask; + + if ((head & mask) == ((tail + 1) & mask)) + return NULL; /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */ + + slot = tail & mask; + sqe = iou->sqe; + sqe = &sqe[slot]; + memset(sqe, 0, sizeof(*sqe)); + sqe->user_data = (uintptr_t) req; + + /* Pacify uv_cancel(). */ + req->work_req.loop = loop; + req->work_req.work = NULL; + req->work_req.done = NULL; + uv__queue_init(&req->work_req.wq); + + uv__req_register(loop); + iou->in_flight++; + + return sqe; +} + + +static void uv__iou_submit(struct uv__iou* iou) { + uint32_t flags; + + atomic_store_explicit((_Atomic uint32_t*) iou->sqtail, + *iou->sqtail + 1, + memory_order_release); + + flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags, + memory_order_acquire); + + if (flags & UV__IORING_SQ_NEED_WAKEUP) + if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP)) + if (errno != EOWNERDEAD) /* Kernel bug. Harmless, ignore. */ + perror("libuv: io_uring_enter(wakeup)"); /* Can't happen. */ +} + + +int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) { + struct uv__io_uring_sqe* sqe; + struct uv__iou* iou; + int kv; + + kv = uv__kernel_version(); + /* Work around a poorly understood bug in older kernels where closing a file + * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to + * execve("/foo/bar") later on. The bug seems to have been fixed somewhere + * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit + * but good candidates are the several data race fixes. Interestingly, it + * seems to manifest only when running under Docker so the possibility of + * a Docker bug can't be completely ruled out either. Yay, computers. + * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and + * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be + * solved. 
+ */ + if (kv < /* 5.15.90 */ 0x050F5A) + return 0; + + if (kv >= /* 5.16.0 */ 0x050A00 && kv < /* 6.1.0 */ 0x060100) + return 0; + + + iou = &uv__get_internal_fields(loop)->iou; + + sqe = uv__iou_get_sqe(iou, loop, req); + if (sqe == NULL) + return 0; + + sqe->fd = req->file; + sqe->opcode = UV__IORING_OP_CLOSE; + + uv__iou_submit(iou); + + return 1; +} + + +int uv__iou_fs_ftruncate(uv_loop_t* loop, uv_fs_t* req) { + struct uv__io_uring_sqe* sqe; + struct uv__iou* iou; + + if (uv__kernel_version() < /* 6.9 */0x060900) + return 0; + + iou = &uv__get_internal_fields(loop)->iou; + sqe = uv__iou_get_sqe(iou, loop, req); + if (sqe == NULL) + return 0; + + sqe->fd = req->file; + sqe->len = req->off; + sqe->opcode = UV__IORING_OP_FTRUNCATE; + uv__iou_submit(iou); + + return 1; +} + +int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop, + uv_fs_t* req, + uint32_t fsync_flags) { + struct uv__io_uring_sqe* sqe; + struct uv__iou* iou; + + iou = &uv__get_internal_fields(loop)->iou; + + sqe = uv__iou_get_sqe(iou, loop, req); + if (sqe == NULL) + return 0; + + /* Little known fact: setting seq->off and seq->len turns + * it into an asynchronous sync_file_range() operation. + */ + sqe->fd = req->file; + sqe->fsync_flags = fsync_flags; + sqe->opcode = UV__IORING_OP_FSYNC; + + uv__iou_submit(iou); + + return 1; +} + + +int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) { + struct uv__io_uring_sqe* sqe; + struct uv__iou* iou; + + if (uv__kernel_version() < /* 5.15.0 */0x050F00) + return 0; + + iou = &uv__get_internal_fields(loop)->iou; + sqe = uv__iou_get_sqe(iou, loop, req); + if (sqe == NULL) + return 0; + + sqe->addr = (uintptr_t) req->path; + sqe->fd = AT_FDCWD; + sqe->addr2 = (uintptr_t) req->new_path; + sqe->len = AT_FDCWD; + sqe->opcode = UV__IORING_OP_LINKAT; + + uv__iou_submit(iou); + + return 1; +} + + +int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) { + struct uv__io_uring_sqe* sqe; + struct uv__iou* iou; + + if (uv__kernel_version() < /* 5.15.0 */0x050F00) + return 0; + + iou = &uv__get_internal_fields(loop)->iou; + sqe = uv__iou_get_sqe(iou, loop, req); + if (sqe == NULL) + return 0; + + sqe->addr = (uintptr_t) req->path; + sqe->fd = AT_FDCWD; + sqe->len = req->mode; + sqe->opcode = UV__IORING_OP_MKDIRAT; + + uv__iou_submit(iou); + + return 1; +} + + +int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) { + struct uv__io_uring_sqe* sqe; + struct uv__iou* iou; + + iou = &uv__get_internal_fields(loop)->iou; + + sqe = uv__iou_get_sqe(iou, loop, req); + if (sqe == NULL) + return 0; + + sqe->addr = (uintptr_t) req->path; + sqe->fd = AT_FDCWD; + sqe->len = req->mode; + sqe->opcode = UV__IORING_OP_OPENAT; + sqe->open_flags = req->flags | O_CLOEXEC; + + uv__iou_submit(iou); + + return 1; +} + + +int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) { + struct uv__io_uring_sqe* sqe; + struct uv__iou* iou; + + iou = &uv__get_internal_fields(loop)->iou; + + sqe = uv__iou_get_sqe(iou, loop, req); + if (sqe == NULL) + return 0; + + sqe->addr = (uintptr_t) req->path; + sqe->fd = AT_FDCWD; + sqe->addr2 = (uintptr_t) req->new_path; + sqe->len = AT_FDCWD; + sqe->opcode = UV__IORING_OP_RENAMEAT; + + uv__iou_submit(iou); + + return 1; +} + + +int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) { + struct uv__io_uring_sqe* sqe; + struct uv__iou* iou; + + if (uv__kernel_version() < /* 5.15.0 */0x050F00) + return 0; + + iou = &uv__get_internal_fields(loop)->iou; + sqe = uv__iou_get_sqe(iou, loop, req); + if (sqe == NULL) + return 0; + + sqe->addr = (uintptr_t) req->path; + sqe->fd = AT_FDCWD; + sqe->addr2 = 
(uintptr_t) req->new_path; + sqe->opcode = UV__IORING_OP_SYMLINKAT; + + uv__iou_submit(iou); + + return 1; +} + + +int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) { + struct uv__io_uring_sqe* sqe; + struct uv__iou* iou; + + iou = &uv__get_internal_fields(loop)->iou; + + sqe = uv__iou_get_sqe(iou, loop, req); + if (sqe == NULL) + return 0; + + sqe->addr = (uintptr_t) req->path; + sqe->fd = AT_FDCWD; + sqe->opcode = UV__IORING_OP_UNLINKAT; + + uv__iou_submit(iou); + + return 1; +} + + +int uv__iou_fs_read_or_write(uv_loop_t* loop, + uv_fs_t* req, + int is_read) { + struct uv__io_uring_sqe* sqe; + struct uv__iou* iou; + + /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fallback + * to the threadpool on writes */ + if (req->nbufs > IOV_MAX) { + if (is_read) + req->nbufs = IOV_MAX; + else + return 0; + } + + iou = &uv__get_internal_fields(loop)->iou; + + sqe = uv__iou_get_sqe(iou, loop, req); + if (sqe == NULL) + return 0; + + sqe->addr = (uintptr_t) req->bufs; + sqe->fd = req->file; + sqe->len = req->nbufs; + sqe->off = req->off < 0 ? -1 : req->off; + sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV; + + uv__iou_submit(iou); + + return 1; +} + + +int uv__iou_fs_statx(uv_loop_t* loop, + uv_fs_t* req, + int is_fstat, + int is_lstat) { + struct uv__io_uring_sqe* sqe; + struct uv__statx* statxbuf; + struct uv__iou* iou; + + statxbuf = uv__malloc(sizeof(*statxbuf)); + if (statxbuf == NULL) + return 0; + + iou = &uv__get_internal_fields(loop)->iou; + + sqe = uv__iou_get_sqe(iou, loop, req); + if (sqe == NULL) { + uv__free(statxbuf); + return 0; + } + + req->ptr = statxbuf; + + sqe->addr = (uintptr_t) req->path; + sqe->addr2 = (uintptr_t) statxbuf; + sqe->fd = AT_FDCWD; + sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */ + sqe->opcode = UV__IORING_OP_STATX; + + if (is_fstat) { + sqe->addr = (uintptr_t) ""; + sqe->fd = req->file; + sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */ + } + + if (is_lstat) + sqe->statx_flags |= AT_SYMLINK_NOFOLLOW; + + uv__iou_submit(iou); + + return 1; +} + + +void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) { + buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor); + buf->st_mode = statxbuf->stx_mode; + buf->st_nlink = statxbuf->stx_nlink; + buf->st_uid = statxbuf->stx_uid; + buf->st_gid = statxbuf->stx_gid; + buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor); + buf->st_ino = statxbuf->stx_ino; + buf->st_size = statxbuf->stx_size; + buf->st_blksize = statxbuf->stx_blksize; + buf->st_blocks = statxbuf->stx_blocks; + buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec; + buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec; + buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec; + buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec; + buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec; + buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec; + buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec; + buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec; + buf->st_flags = 0; + buf->st_gen = 0; +} + + +static void uv__iou_fs_statx_post(uv_fs_t* req) { + struct uv__statx* statxbuf; + uv_stat_t* buf; + + buf = &req->statbuf; + statxbuf = req->ptr; + req->ptr = NULL; + + if (req->result == 0) { + uv__msan_unpoison(statxbuf, sizeof(*statxbuf)); + uv__statx_to_stat(statxbuf, buf); + req->ptr = buf; + } + + uv__free(statxbuf); +} + + +static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) { + struct uv__io_uring_cqe* cqe; + struct uv__io_uring_cqe* e; + uv_fs_t* req; + 
uint32_t head; + uint32_t tail; + uint32_t mask; + uint32_t i; + uint32_t flags; + int nevents; + int rc; + + head = *iou->cqhead; + tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail, + memory_order_acquire); + mask = iou->cqmask; + cqe = iou->cqe; + nevents = 0; + + for (i = head; i != tail; i++) { + e = &cqe[i & mask]; + + req = (uv_fs_t*) (uintptr_t) e->user_data; + assert(req->type == UV_FS); + + uv__req_unregister(loop); + iou->in_flight--; + + /* If the op is not supported by the kernel retry using the thread pool */ + if (e->res == -EOPNOTSUPP) { + uv__fs_post(loop, req); + continue; + } + + /* io_uring stores error codes as negative numbers, same as libuv. */ + req->result = e->res; + + switch (req->fs_type) { + case UV_FS_FSTAT: + case UV_FS_LSTAT: + case UV_FS_STAT: + uv__iou_fs_statx_post(req); + break; + default: /* Squelch -Wswitch warnings. */ + break; + } + + uv__metrics_update_idle_time(loop); + req->cb(req); + nevents++; + } + + atomic_store_explicit((_Atomic uint32_t*) iou->cqhead, + tail, + memory_order_release); + + /* Check whether CQE's overflowed, if so enter the kernel to make them + * available. Don't grab them immediately but in the next loop iteration to + * avoid loop starvation. */ + flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags, + memory_order_acquire); + + if (flags & UV__IORING_SQ_CQ_OVERFLOW) { + do + rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS); + while (rc == -1 && errno == EINTR); + + if (rc < 0) + perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */ + } + + uv__metrics_inc_events(loop, nevents); + if (uv__get_internal_fields(loop)->current_timeout == 0) + uv__metrics_inc_events_waiting(loop, nevents); +} + + +/* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be + * executed immediately, otherwise the file descriptor may have been closed + * by the time the kernel starts the operation. + */ +static void uv__epoll_ctl_prep(int epollfd, + struct uv__iou* ctl, + struct epoll_event (*events)[256], + int op, + int fd, + struct epoll_event* e) { + struct uv__io_uring_sqe* sqe; + struct epoll_event* pe; + uint32_t mask; + uint32_t slot; + + assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD); + assert(ctl->ringfd != -1); + + mask = ctl->sqmask; + slot = (*ctl->sqtail)++ & mask; + + pe = &(*events)[slot]; + *pe = *e; + + sqe = ctl->sqe; + sqe = &sqe[slot]; + + memset(sqe, 0, sizeof(*sqe)); + sqe->addr = (uintptr_t) pe; + sqe->fd = epollfd; + sqe->len = op; + sqe->off = fd; + sqe->opcode = UV__IORING_OP_EPOLL_CTL; + sqe->user_data = op | slot << 2 | (int64_t) fd << 32; + + if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask)) + uv__epoll_ctl_flush(epollfd, ctl, events); +} + + +static void uv__epoll_ctl_flush(int epollfd, + struct uv__iou* ctl, + struct epoll_event (*events)[256]) { + struct epoll_event oldevents[256]; + struct uv__io_uring_cqe* cqe; + uint32_t oldslot; + uint32_t slot; + uint32_t n; + int fd; + int op; + int rc; + + STATIC_ASSERT(sizeof(oldevents) == sizeof(*events)); + assert(ctl->ringfd != -1); + assert(*ctl->sqhead != *ctl->sqtail); + + n = *ctl->sqtail - *ctl->sqhead; + do + rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS); + while (rc == -1 && errno == EINTR); + + if (rc < 0) + perror("libuv: io_uring_enter(getevents)"); /* Can't happen. 
*/ + + if (rc != (int) n) + abort(); + + assert(*ctl->sqhead == *ctl->sqtail); + + memcpy(oldevents, *events, sizeof(*events)); + + /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors + * that have been closed, or EPOLL_CTL_ADD commands for file descriptors + * that we are already watching. Ignore the former and retry the latter + * with EPOLL_CTL_MOD. + */ + while (*ctl->cqhead != *ctl->cqtail) { + slot = (*ctl->cqhead)++ & ctl->cqmask; + + cqe = ctl->cqe; + cqe = &cqe[slot]; + + if (cqe->res == 0) + continue; + + fd = cqe->user_data >> 32; + op = 3 & cqe->user_data; + oldslot = 255 & (cqe->user_data >> 2); + + if (op == EPOLL_CTL_DEL) + continue; + + if (op != EPOLL_CTL_ADD) + abort(); + + if (cqe->res != -EEXIST) + abort(); + + uv__epoll_ctl_prep(epollfd, + ctl, + events, + EPOLL_CTL_MOD, + fd, + &oldevents[oldslot]); + } +} + + +void uv__io_poll(uv_loop_t* loop, int timeout) { + uv__loop_internal_fields_t* lfields; + struct epoll_event events[1024]; + struct epoll_event prep[256]; + struct uv__invalidate inv; + struct epoll_event* pe; + struct epoll_event e; + struct uv__iou* ctl; + struct uv__iou* iou; + int real_timeout; + struct uv__queue* q; + uv__io_t* w; + sigset_t* sigmask; + sigset_t sigset; + uint64_t base; + int have_iou_events; + int have_signals; + int nevents; + int epollfd; + int count; + int nfds; + int fd; + int op; + int i; + int user_timeout; + int reset_timeout; + + lfields = uv__get_internal_fields(loop); + ctl = &lfields->ctl; + iou = &lfields->iou; + + sigmask = NULL; + if (loop->flags & UV_LOOP_BLOCK_SIGPROF) { + sigemptyset(&sigset); + sigaddset(&sigset, SIGPROF); + sigmask = &sigset; + } + + assert(timeout >= -1); + base = loop->time; + count = 48; /* Benchmarks suggest this gives the best throughput. */ + real_timeout = timeout; + + if (lfields->flags & UV_METRICS_IDLE_TIME) { + reset_timeout = 1; + user_timeout = timeout; + timeout = 0; + } else { + reset_timeout = 0; + user_timeout = 0; + } + + epollfd = loop->backend_fd; + + memset(&e, 0, sizeof(e)); + + while (!uv__queue_empty(&loop->watcher_queue)) { + q = uv__queue_head(&loop->watcher_queue); + w = uv__queue_data(q, uv__io_t, watcher_queue); + uv__queue_remove(q); + uv__queue_init(q); + + op = EPOLL_CTL_MOD; + if (w->events == 0) + op = EPOLL_CTL_ADD; + + w->events = w->pevents; + e.events = w->pevents; + e.data.fd = w->fd; + fd = w->fd; + + if (ctl->ringfd != -1) { + uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e); + continue; + } + + if (!epoll_ctl(epollfd, op, fd, &e)) + continue; + + assert(op == EPOLL_CTL_ADD); + assert(errno == EEXIST); + + /* File descriptor that's been watched before, update event mask. */ + if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e)) + abort(); + } + + inv.events = events; + inv.prep = &prep; + inv.nfds = -1; + + for (;;) { + if (loop->nfds == 0) + if (iou->in_flight == 0) + break; + + /* All event mask mutations should be visible to the kernel before + * we enter epoll_pwait(). + */ + if (ctl->ringfd != -1) + while (*ctl->sqhead != *ctl->sqtail) + uv__epoll_ctl_flush(epollfd, ctl, &prep); + + /* Only need to set the provider_entry_time if timeout != 0. The function + * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME. + */ + if (timeout != 0) + uv__metrics_set_provider_entry_time(loop); + + /* Store the current timeout in a location that's globally accessible so + * other locations like uv__work_done() can determine whether the queue + * of events in the callback were waiting when poll was called. 
+ */ + lfields->current_timeout = timeout; + + nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask); + + /* Update loop->time unconditionally. It's tempting to skip the update when + * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the + * operating system didn't reschedule our process while in the syscall. + */ + SAVE_ERRNO(uv__update_time(loop)); + + if (nfds == -1) + assert(errno == EINTR); + else if (nfds == 0) + /* Unlimited timeout should only return with events or signal. */ + assert(timeout != -1); + + if (nfds == 0 || nfds == -1) { + if (reset_timeout != 0) { + timeout = user_timeout; + reset_timeout = 0; + } else if (nfds == 0) { + return; + } + + /* Interrupted by a signal. Update timeout and poll again. */ + goto update_timeout; + } + + have_iou_events = 0; + have_signals = 0; + nevents = 0; + + inv.nfds = nfds; + lfields->inv = &inv; + + for (i = 0; i < nfds; i++) { + pe = events + i; + fd = pe->data.fd; + + /* Skip invalidated events, see uv__platform_invalidate_fd */ + if (fd == -1) + continue; + + if (fd == iou->ringfd) { + uv__poll_io_uring(loop, iou); + have_iou_events = 1; + continue; + } + + assert(fd >= 0); + assert((unsigned) fd < loop->nwatchers); + + w = loop->watchers[fd]; + + if (w == NULL) { + /* File descriptor that we've stopped watching, disarm it. + * + * Ignore all errors because we may be racing with another thread + * when the file descriptor is closed. + * + * Perform EPOLL_CTL_DEL immediately instead of going through + * io_uring's submit queue, otherwise the file descriptor may + * be closed by the time the kernel starts the operation. + */ + epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe); + continue; + } + + /* Give users only events they're interested in. Prevents spurious + * callbacks when previous callback invocation in this loop has stopped + * the current watcher. Also, filters out events that users has not + * requested us to watch. + */ + pe->events &= w->pevents | POLLERR | POLLHUP; + + /* Work around an epoll quirk where it sometimes reports just the + * EPOLLERR or EPOLLHUP event. In order to force the event loop to + * move forward, we merge in the read/write events that the watcher + * is interested in; uv__read() and uv__write() will then deal with + * the error or hangup in the usual fashion. + * + * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user + * reads the available data, calls uv_read_stop(), then sometime later + * calls uv_read_start() again. By then, libuv has forgotten about the + * hangup and the kernel won't report EPOLLIN again because there's + * nothing left to read. If anything, libuv is to blame here. The + * current hack is just a quick bandaid; to properly fix it, libuv + * needs to remember the error/hangup event. We should get that for + * free when we switch over to edge-triggered I/O. + */ + if (pe->events == POLLERR || pe->events == POLLHUP) + pe->events |= + w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI); + + if (pe->events != 0) { + /* Run signal watchers last. This also affects child process watchers + * because those are implemented in terms of signal watchers. 
+ */ + if (w == &loop->signal_io_watcher) { + have_signals = 1; + } else { + uv__metrics_update_idle_time(loop); + w->cb(loop, w, pe->events); + } + + nevents++; + } + } + + uv__metrics_inc_events(loop, nevents); + if (reset_timeout != 0) { + timeout = user_timeout; + reset_timeout = 0; + uv__metrics_inc_events_waiting(loop, nevents); + } + + if (have_signals != 0) { + uv__metrics_update_idle_time(loop); + loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN); + } + + lfields->inv = NULL; + + if (have_iou_events != 0) + break; /* Event loop should cycle now so don't poll again. */ + + if (have_signals != 0) + break; /* Event loop should cycle now so don't poll again. */ + + if (nevents != 0) { + if (nfds == ARRAY_SIZE(events) && --count != 0) { + /* Poll for more events but don't block this time. */ + timeout = 0; + continue; + } + break; + } + +update_timeout: + if (timeout == 0) + break; + + if (timeout == -1) + continue; + + assert(timeout > 0); + + real_timeout -= (loop->time - base); + if (real_timeout <= 0) + break; + + timeout = real_timeout; + } + + if (ctl->ringfd != -1) + while (*ctl->sqhead != *ctl->sqtail) + uv__epoll_ctl_flush(epollfd, ctl, &prep); +} + +uint64_t uv__hrtime(uv_clocktype_t type) { + static _Atomic clock_t fast_clock_id = -1; + struct timespec t; + clock_t clock_id; + + /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has + * millisecond granularity or better. CLOCK_MONOTONIC_COARSE is + * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may + * decide to make a costly system call. + */ + /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE + * when it has microsecond granularity or better (unlikely). + */ + clock_id = CLOCK_MONOTONIC; + if (type != UV_CLOCK_FAST) + goto done; + + clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed); + if (clock_id != -1) + goto done; + + clock_id = CLOCK_MONOTONIC; + if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t)) + if (t.tv_nsec <= 1 * 1000 * 1000) + clock_id = CLOCK_MONOTONIC_COARSE; + + atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed); + +done: + + if (clock_gettime(clock_id, &t)) + return 0; /* Not really possible. */ + + return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec; +} + + +int uv_resident_set_memory(size_t* rss) { + char buf[1024]; + const char* s; + long val; + int rc; + int i; + + /* rss: 24th element */ + rc = uv__slurp("/proc/self/stat", buf, sizeof(buf)); + if (rc < 0) + return rc; + + /* find the last ')' */ + s = strrchr(buf, ')'); + if (s == NULL) + goto err; + + for (i = 1; i <= 22; i++) { + s = strchr(s + 1, ' '); + if (s == NULL) + goto err; + } + + errno = 0; + val = strtol(s, NULL, 10); + if (val < 0 || errno != 0) + goto err; + + *rss = val * getpagesize(); + return 0; + +err: + return UV_EINVAL; +} + +int uv_uptime(double* uptime) { + struct timespec now; + char buf[128]; + + /* Consult /proc/uptime when present (common case), or fall back to + * clock_gettime. Why not always clock_gettime? It doesn't always return the + * right result under OpenVZ and possibly other containerized environments. 
+ */ + if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf))) + if (1 == sscanf(buf, "%lf", uptime)) + return 0; + + if (clock_gettime(CLOCK_BOOTTIME, &now)) + return UV__ERR(errno); + + *uptime = now.tv_sec; + return 0; +} + + +int uv_cpu_info(uv_cpu_info_t** ci, int* count) { +#if defined(__PPC__) + static const char model_marker[] = "cpu\t\t: "; + static const char model_marker2[] = ""; +#elif defined(__arm__) + static const char model_marker[] = "model name\t: "; + static const char model_marker2[] = "Processor\t: "; +#elif defined(__aarch64__) + static const char model_marker[] = "CPU part\t: "; + static const char model_marker2[] = ""; +#elif defined(__mips__) + static const char model_marker[] = "cpu model\t\t: "; + static const char model_marker2[] = ""; +#elif defined(__loongarch__) + static const char model_marker[] = "cpu family\t\t: "; + static const char model_marker2[] = ""; +#else + static const char model_marker[] = "model name\t: "; + static const char model_marker2[] = ""; +#endif + static const char parts[] = +#ifdef __aarch64__ + "0x811\nARM810\n" "0x920\nARM920\n" "0x922\nARM922\n" + "0x926\nARM926\n" "0x940\nARM940\n" "0x946\nARM946\n" + "0x966\nARM966\n" "0xa20\nARM1020\n" "0xa22\nARM1022\n" + "0xa26\nARM1026\n" "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n" + "0xb56\nARM1156\n" "0xb76\nARM1176\n" "0xc05\nCortex-A5\n" + "0xc07\nCortex-A7\n" "0xc08\nCortex-A8\n" "0xc09\nCortex-A9\n" + "0xc0d\nCortex-A17\n" /* Originally A12 */ + "0xc0f\nCortex-A15\n" "0xc0e\nCortex-A17\n" "0xc14\nCortex-R4\n" + "0xc15\nCortex-R5\n" "0xc17\nCortex-R7\n" "0xc18\nCortex-R8\n" + "0xc20\nCortex-M0\n" "0xc21\nCortex-M1\n" "0xc23\nCortex-M3\n" + "0xc24\nCortex-M4\n" "0xc27\nCortex-M7\n" "0xc60\nCortex-M0+\n" + "0xd01\nCortex-A32\n" "0xd03\nCortex-A53\n" "0xd04\nCortex-A35\n" + "0xd05\nCortex-A55\n" "0xd06\nCortex-A65\n" "0xd07\nCortex-A57\n" + "0xd08\nCortex-A72\n" "0xd09\nCortex-A73\n" "0xd0a\nCortex-A75\n" + "0xd0b\nCortex-A76\n" "0xd0c\nNeoverse-N1\n" "0xd0d\nCortex-A77\n" + "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n" "0xd20\nCortex-M23\n" + "0xd21\nCortex-M33\n" "0xd41\nCortex-A78\n" "0xd42\nCortex-A78AE\n" + "0xd4a\nNeoverse-E1\n" "0xd4b\nCortex-A78C\n" +#endif + ""; + struct cpu { + unsigned long long freq, user, nice, sys, idle, irq; + unsigned model; + }; + FILE* fp; + char* p; + int found; + int n; + unsigned i; + unsigned cpu; + unsigned maxcpu; + unsigned size; + unsigned long long skip; + struct cpu (*cpus)[8192]; /* Kernel maximum. */ + struct cpu* c; + struct cpu t; + char (*model)[64]; + unsigned char bitmap[ARRAY_SIZE(*cpus) / 8]; + /* Assumption: even big.LITTLE systems will have only a handful + * of different CPU models. Most systems will just have one. 
+ */ + char models[8][64]; + char buf[1024]; + + memset(bitmap, 0, sizeof(bitmap)); + memset(models, 0, sizeof(models)); + snprintf(*models, sizeof(*models), "unknown"); + maxcpu = 0; + + cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus)); + if (cpus == NULL) + return UV_ENOMEM; + + fp = uv__open_file("/proc/stat"); + if (fp == NULL) { + uv__free(cpus); + return UV__ERR(errno); + } + + if (NULL == fgets(buf, sizeof(buf), fp)) + abort(); + + for (;;) { + memset(&t, 0, sizeof(t)); + + n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu", + &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq); + + if (n != 7) + break; + + if (NULL == fgets(buf, sizeof(buf), fp)) + abort(); + + if (cpu >= ARRAY_SIZE(*cpus)) + continue; + + (*cpus)[cpu] = t; + + bitmap[cpu >> 3] |= 1 << (cpu & 7); + + if (cpu >= maxcpu) + maxcpu = cpu + 1; + } + + fclose(fp); + + fp = uv__open_file("/proc/cpuinfo"); + if (fp == NULL) + goto nocpuinfo; + + for (;;) { + if (1 != fscanf(fp, "processor\t: %u\n", &cpu)) + break; /* Parse error. */ + + while (fgets(buf, sizeof(buf), fp)) { + if (!strncmp(buf, model_marker, sizeof(model_marker) - 1)) { + p = buf + sizeof(model_marker) - 1; + goto parts; + } + if (!*model_marker2) + continue; + if (!strncmp(buf, model_marker2, sizeof(model_marker2) - 1)) { + p = buf + sizeof(model_marker2) - 1; + goto parts; + } + } + + goto next; /* Not found. */ + +parts: + n = (int) strcspn(p, "\n"); + + /* arm64: translate CPU part code to model name. */ + if (*parts) { + p = memmem(parts, sizeof(parts) - 1, p, n + 1); + if (p == NULL) + p = "unknown"; + else + p += n + 1; + n = (int) strcspn(p, "\n"); + } + + found = 0; + for (model = models; !found && model < ARRAY_END(models); model++) + found = !strncmp(p, *model, strlen(*model)); + + if (!found) + goto next; + + if (**model == '\0') + snprintf(*model, sizeof(*model), "%.*s", n, p); + + if (cpu < maxcpu) + (*cpus)[cpu].model = model - models; + +next: + while (fgets(buf, sizeof(buf), fp)) + if (*buf == '\n') + break; + } + + fclose(fp); + fp = NULL; + +nocpuinfo: + + n = 0; + for (cpu = 0; cpu < maxcpu; cpu++) { + if (!(bitmap[cpu >> 3] & (1 << (cpu & 7)))) + continue; + + n++; + snprintf(buf, sizeof(buf), + "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu); + + fp = uv__open_file(buf); + if (fp == NULL) + continue; + + if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq)) + abort(); + fclose(fp); + fp = NULL; + } + + size = n * sizeof(**ci) + sizeof(models); + *ci = uv__malloc(size); + *count = 0; + + if (*ci == NULL) { + uv__free(cpus); + return UV_ENOMEM; + } + + *count = n; + p = memcpy(*ci + n, models, sizeof(models)); + + i = 0; + for (cpu = 0; cpu < maxcpu; cpu++) { + if (!(bitmap[cpu >> 3] & (1 << (cpu & 7)))) + continue; + + c = *cpus + cpu; + + (*ci)[i++] = (uv_cpu_info_t) { + .model = p + c->model * sizeof(*model), + .speed = c->freq / 1000, + /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz, + * therefore the multiplier is always 1000/100 = 10. + */ + .cpu_times = (struct uv_cpu_times_s) { + .user = 10 * c->user, + .nice = 10 * c->nice, + .sys = 10 * c->sys, + .idle = 10 * c->idle, + .irq = 10 * c->irq, + }, + }; + } + + uv__free(cpus); + + return 0; +} + + +static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) { + if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING))) + return 1; + if (ent->ifa_addr == NULL) + return 1; + /* + * On Linux getifaddrs returns information related to the raw underlying + * devices. We're not interested in this information yet. 
+ */ + if (ent->ifa_addr->sa_family == PF_PACKET) + return exclude_type; + return !exclude_type; +} + +/* TODO(bnoordhuis) share with bsd-ifaddrs.c */ +int uv_interface_addresses(uv_interface_address_t** addresses, int* count) { + uv_interface_address_t* address; + struct sockaddr_ll* sll; + struct ifaddrs* addrs; + struct ifaddrs* ent; + size_t namelen; + char* name; + int i; + + *count = 0; + *addresses = NULL; + + if (getifaddrs(&addrs)) + return UV__ERR(errno); + + /* Count the number of interfaces */ + namelen = 0; + for (ent = addrs; ent != NULL; ent = ent->ifa_next) { + if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR)) + continue; + + namelen += strlen(ent->ifa_name) + 1; + (*count)++; + } + + if (*count == 0) { + freeifaddrs(addrs); + return 0; + } + + /* Make sure the memory is initiallized to zero using calloc() */ + *addresses = uv__calloc(1, *count * sizeof(**addresses) + namelen); + if (*addresses == NULL) { + freeifaddrs(addrs); + return UV_ENOMEM; + } + + name = (char*) &(*addresses)[*count]; + address = *addresses; + + for (ent = addrs; ent != NULL; ent = ent->ifa_next) { + if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR)) + continue; + + namelen = strlen(ent->ifa_name) + 1; + address->name = memcpy(name, ent->ifa_name, namelen); + name += namelen; + + if (ent->ifa_addr->sa_family == AF_INET6) { + address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr); + } else { + address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr); + } + + if (ent->ifa_netmask->sa_family == AF_INET6) { + address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask); + } else { + address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask); + } + + address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK); + + address++; + } + + /* Fill in physical addresses for each interface */ + for (ent = addrs; ent != NULL; ent = ent->ifa_next) { + if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS)) + continue; + + address = *addresses; + + for (i = 0; i < (*count); i++) { + size_t namelen = strlen(ent->ifa_name); + /* Alias interface share the same physical address */ + if (strncmp(address->name, ent->ifa_name, namelen) == 0 && + (address->name[namelen] == 0 || address->name[namelen] == ':')) { + sll = (struct sockaddr_ll*)ent->ifa_addr; + memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr)); + } + address++; + } + } + + freeifaddrs(addrs); + + return 0; +} + + +/* TODO(bnoordhuis) share with bsd-ifaddrs.c */ +void uv_free_interface_addresses(uv_interface_address_t* addresses, + int count) { + uv__free(addresses); +} + + +void uv__set_process_title(const char* title) { +#if defined(PR_SET_NAME) + prctl(PR_SET_NAME, title); /* Only copies first 16 characters. */ +#endif +} + + +static uint64_t uv__read_proc_meminfo(const char* what) { + uint64_t rc; + char* p; + char buf[4096]; /* Large enough to hold all of /proc/meminfo. 
*/ + + if (uv__slurp("/proc/meminfo", buf, sizeof(buf))) + return 0; + + p = strstr(buf, what); + + if (p == NULL) + return 0; + + p += strlen(what); + + rc = 0; + sscanf(p, "%" PRIu64 " kB", &rc); + + return rc * 1024; +} + + +uint64_t uv_get_free_memory(void) { + struct sysinfo info; + uint64_t rc; + + rc = uv__read_proc_meminfo("MemAvailable:"); + + if (rc != 0) + return rc; + + if (0 == sysinfo(&info)) + return (uint64_t) info.freeram * info.mem_unit; + + return 0; +} + + +uint64_t uv_get_total_memory(void) { + struct sysinfo info; + uint64_t rc; + + rc = uv__read_proc_meminfo("MemTotal:"); + + if (rc != 0) + return rc; + + if (0 == sysinfo(&info)) + return (uint64_t) info.totalram * info.mem_unit; + + return 0; +} + + +static uint64_t uv__read_uint64(const char* filename) { + char buf[32]; /* Large enough to hold an encoded uint64_t. */ + uint64_t rc; + + rc = 0; + if (0 == uv__slurp(filename, buf, sizeof(buf))) + if (1 != sscanf(buf, "%" PRIu64, &rc)) + if (0 == strcmp(buf, "max\n")) + rc = UINT64_MAX; + + return rc; +} + + +/* Given a buffer with the contents of a cgroup1 /proc/self/cgroups, + * finds the location and length of the memory controller mount path. + * This disregards the leading / for easy concatenation of paths. + * Returns NULL if the memory controller wasn't found. */ +static char* uv__cgroup1_find_memory_controller(char buf[static 1024], + int* n) { + char* p; + + /* Seek to the memory controller line. */ + p = strchr(buf, ':'); + while (p != NULL && strncmp(p, ":memory:", 8)) { + p = strchr(p, '\n'); + if (p != NULL) + p = strchr(p, ':'); + } + + if (p != NULL) { + /* Determine the length of the mount path. */ + p = p + strlen(":memory:/"); + *n = (int) strcspn(p, "\n"); + } + + return p; +} + +static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high, + uint64_t* max) { + char filename[4097]; + char* p; + int n; + uint64_t cgroup1_max; + + /* Find out where the controller is mounted. */ + p = uv__cgroup1_find_memory_controller(buf, &n); + if (p != NULL) { + snprintf(filename, sizeof(filename), + "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", n, p); + *high = uv__read_uint64(filename); + + snprintf(filename, sizeof(filename), + "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", n, p); + *max = uv__read_uint64(filename); + + /* If the controller wasn't mounted, the reads above will have failed, + * as indicated by uv__read_uint64 returning 0. + */ + if (*high != 0 && *max != 0) + goto update_limits; + } + + /* Fall back to the limits of the global memory controller. */ + *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes"); + *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes"); + + /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect + * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE). + */ +update_limits: + cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1); + if (*high == cgroup1_max) + *high = UINT64_MAX; + if (*max == cgroup1_max) + *max = UINT64_MAX; +} + +static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high, + uint64_t* max) { + char filename[4097]; + char* p; + int n; + + /* Find out where the controller is mounted. */ + p = buf + strlen("0::/"); + n = (int) strcspn(p, "\n"); + + /* Read the memory limits of the controller. 
*/ + snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max", n, p); + *max = uv__read_uint64(filename); + snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high", n, p); + *high = uv__read_uint64(filename); +} + +static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) { + uint64_t high; + uint64_t max; + + /* In the case of cgroupv2, we'll only have a single entry. */ + if (strncmp(buf, "0::/", 4)) + uv__get_cgroup1_memory_limits(buf, &high, &max); + else + uv__get_cgroup2_memory_limits(buf, &high, &max); + + if (high == 0 || max == 0) + return 0; + + return high < max ? high : max; +} + +uint64_t uv_get_constrained_memory(void) { + char buf[1024]; + + if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf))) + return 0; + + return uv__get_cgroup_constrained_memory(buf); +} + + +static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) { + char filename[4097]; + uint64_t current; + char* p; + int n; + + /* Find out where the controller is mounted. */ + p = uv__cgroup1_find_memory_controller(buf, &n); + if (p != NULL) { + snprintf(filename, sizeof(filename), + "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", n, p); + current = uv__read_uint64(filename); + + /* If the controller wasn't mounted, the reads above will have failed, + * as indicated by uv__read_uint64 returning 0. + */ + if (current != 0) + return current; + } + + /* Fall back to the usage of the global memory controller. */ + return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes"); +} + +static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) { + char filename[4097]; + char* p; + int n; + + /* Find out where the controller is mounted. */ + p = buf + strlen("0::/"); + n = (int) strcspn(p, "\n"); + + snprintf(filename, sizeof(filename), + "/sys/fs/cgroup/%.*s/memory.current", n, p); + return uv__read_uint64(filename); +} + +uint64_t uv_get_available_memory(void) { + char buf[1024]; + uint64_t constrained; + uint64_t current; + uint64_t total; + + if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf))) + return 0; + + constrained = uv__get_cgroup_constrained_memory(buf); + if (constrained == 0) + return uv_get_free_memory(); + + total = uv_get_total_memory(); + if (constrained > total) + return uv_get_free_memory(); + + /* In the case of cgroupv2, we'll only have a single entry. 
*/ + if (strncmp(buf, "0::/", 4)) + current = uv__get_cgroup1_current_memory(buf); + else + current = uv__get_cgroup2_current_memory(buf); + + /* memory usage can be higher than the limit (for short bursts of time) */ + if (constrained < current) + return 0; + + return constrained - current; +} + + +static int uv__get_cgroupv2_constrained_cpu(const char* cgroup, + long long* quota) { + static const char cgroup_mount[] = "/sys/fs/cgroup"; + const char* cgroup_trimmed; + char buf[1024]; + char full_path[256]; + char path[256]; + char quota_buf[16]; + char* last_slash; + int cgroup_size; + long long limit; + long long min_quota; + long long period; + + if (strncmp(cgroup, "0::/", 4) != 0) + return UV_EINVAL; + + /* Trim ending \n by replacing it with a 0 */ + cgroup_trimmed = cgroup + sizeof("0::/") - 1; /* Skip the prefix "0::/" */ + cgroup_size = (int)strcspn(cgroup_trimmed, "\n"); /* Find the first \n */ + min_quota = LLONG_MAX; + + /* Construct the path to the cpu.max files */ + snprintf(path, sizeof(path), "%s/%.*s/cgroup.controllers", cgroup_mount, + cgroup_size, cgroup_trimmed); + + /* Read controllers, if not exists, not really a cgroup */ + if (uv__slurp(path, buf, sizeof(buf)) < 0) + return UV_EIO; + + snprintf(path, sizeof(path), "%s/%.*s", cgroup_mount, cgroup_size, + cgroup_trimmed); + + /* + * Traverse up the cgroup v2 hierarchy, starting from the current cgroup path. + * At each level, attempt to read the "cpu.max" file, which defines the CPU + * quota and period. + * + * This reflects how Linux applies cgroup limits hierarchically. + * + * e.g: given a path like /sys/fs/cgroup/foo/bar/baz, we check: + * - /sys/fs/cgroup/foo/bar/baz/cpu.max + * - /sys/fs/cgroup/foo/bar/cpu.max + * - /sys/fs/cgroup/foo/cpu.max + * - /sys/fs/cgroup/cpu.max + */ + while (strncmp(path, cgroup_mount, strlen(cgroup_mount)) == 0) { + snprintf(full_path, sizeof(full_path), "%s/cpu.max", path); + + /* Silently ignore and continue if the file does not exist */ + if (uv__slurp(full_path, quota_buf, sizeof(quota_buf)) < 0) + goto next; + + /* No limit, move on */ + if (strncmp(quota_buf, "max", 3) == 0) + goto next; + + /* Read cpu.max */ + if (sscanf(quota_buf, "%lld %lld", &limit, &period) != 2) + goto next; + + /* Can't divide by 0 */ + if (period == 0) + goto next; + + *quota = limit / period; + if (*quota < min_quota) + min_quota = *quota; + +next: + /* Move up one level in the cgroup hierarchy by trimming the last path. + * The loop ends once we reach the cgroup root mount point. + */ + last_slash = strrchr(path, '/'); + if (last_slash == NULL || strcmp(path, cgroup_mount) == 0) + break; + *last_slash = '\0'; + } + + return 0; +} + +static char* uv__cgroup1_find_cpu_controller(const char* cgroup, + int* cgroup_size) { + /* Seek to the cpu controller line. */ + char* cgroup_cpu = strstr(cgroup, ":cpu,"); + + if (cgroup_cpu != NULL) { + /* Skip the controller prefix to the start of the cgroup path. */ + cgroup_cpu += sizeof(":cpu,") - 1; + /* Determine the length of the cgroup path, excluding the newline. 
*/
+    *cgroup_size = (int)strcspn(cgroup_cpu, "\n");
+  }
+
+  return cgroup_cpu;
+}
+
+static int uv__get_cgroupv1_constrained_cpu(const char* cgroup,
+                                            long long* quota) {
+  char path[256];
+  char buf[1024];
+  int cgroup_size;
+  char* cgroup_cpu;
+  long long period_length;
+  long long quota_per_period;
+
+  cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size);
+
+  if (cgroup_cpu == NULL)
+    return UV_EIO;
+
+  /* Construct the path to the cpu.cfs_quota_us file */
+  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us",
+           cgroup_size, cgroup_cpu);
+
+  /* Read cpu.cfs_quota_us */
+  if (uv__slurp(path, buf, sizeof(buf)) < 0)
+    return UV_EIO;
+
+  if (sscanf(buf, "%lld", &quota_per_period) != 1)
+    return UV_EINVAL;
+
+  /* Construct the path to the cpu.cfs_period_us file */
+  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us",
+           cgroup_size, cgroup_cpu);
+
+  /* Read cpu.cfs_period_us */
+  if (uv__slurp(path, buf, sizeof(buf)) < 0)
+    return UV_EIO;
+
+  if (sscanf(buf, "%lld", &period_length) != 1)
+    return UV_EINVAL;
+
+  /* Can't divide by 0 */
+  if (period_length == 0)
+    return UV_EINVAL;
+
+  *quota = quota_per_period / period_length;
+
+  return 0;
+}
+
+int uv__get_constrained_cpu(long long* quota) {
+  char cgroup[1024];
+
+  /* Read the cgroup from /proc/self/cgroup */
+  if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0)
+    return UV_EIO;
+
+  /* Check if the system is using cgroup v2 by examining /proc/self/cgroup
+   * The entry for cgroup v2 is always in the format "0::$PATH"
+   * see https://docs.kernel.org/admin-guide/cgroup-v2.html */
+  if (strncmp(cgroup, "0::/", 4) == 0)
+    return uv__get_cgroupv2_constrained_cpu(cgroup, quota);
+  else
+    return uv__get_cgroupv1_constrained_cpu(cgroup, quota);
+}
+
+
+void uv_loadavg(double avg[3]) {
+  struct sysinfo info;
+  char buf[128];  /* Large enough to hold all of /proc/loadavg. */
+
+  if (0 == uv__slurp("/proc/loadavg", buf, sizeof(buf)))
+    if (3 == sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]))
+      return;
+
+  if (sysinfo(&info) < 0)
+    return;
+
+  avg[0] = (double) info.loads[0] / 65536.0;
+  avg[1] = (double) info.loads[1] / 65536.0;
+  avg[2] = (double) info.loads[2] / 65536.0;
+}
+
+
+static int compare_watchers(const struct watcher_list* a,
+                            const struct watcher_list* b) {
+  if (a->wd < b->wd) return -1;
+  if (a->wd > b->wd) return 1;
+  return 0;
+}
+
+
+static int init_inotify(uv_loop_t* loop) {
+  int err;
+  int fd;
+
+  if (loop->inotify_fd != -1)
+    return 0;
+
+  fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
+  if (fd < 0)
+    return UV__ERR(errno);
+
+  err = uv__io_init_start(loop, &loop->inotify_read_watcher, uv__inotify_read,
+                          fd, POLLIN);
+  if (err) {
+    uv__close(fd);
+    return err;
+  }
+
+  loop->inotify_fd = fd;
+  return 0;
+}
+
+
+static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
+  /* Open the inotify_fd, and re-arm all the inotify watchers. */
+  int err;
+  struct watcher_list* tmp_watcher_list_iter;
+  struct watcher_list* watcher_list;
+  struct watcher_list tmp_watcher_list;
+  struct uv__queue queue;
+  struct uv__queue* q;
+  uv_fs_event_t* handle;
+  char* tmp_path;
+
+  if (root == NULL)
+    return 0;
+
+  /* We must restore the old watcher list to be able to close items
+   * out of it.
+   */
+  loop->inotify_watchers = root;
+
+  uv__queue_init(&tmp_watcher_list.watchers);
+  /* Note that the queue we use is shared with the start and stop()
+   * functions, making uv__queue_foreach unsafe to use. So we use the
+   * uv__queue_move trick to safely iterate.
Also don't free the watcher + * list until we're done iterating. c.f. uv__inotify_read. + */ + RB_FOREACH_SAFE(watcher_list, watcher_root, + uv__inotify_watchers(loop), tmp_watcher_list_iter) { + watcher_list->iterating = 1; + uv__queue_move(&watcher_list->watchers, &queue); + while (!uv__queue_empty(&queue)) { + q = uv__queue_head(&queue); + handle = uv__queue_data(q, uv_fs_event_t, watchers); + /* It's critical to keep a copy of path here, because it + * will be set to NULL by stop() and then deallocated by + * maybe_free_watcher_list + */ + tmp_path = uv__strdup(handle->path); + assert(tmp_path != NULL); + uv__queue_remove(q); + uv__queue_insert_tail(&watcher_list->watchers, q); + uv_fs_event_stop(handle); + + uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers); + handle->path = tmp_path; + } + watcher_list->iterating = 0; + maybe_free_watcher_list(watcher_list, loop); + } + + uv__queue_move(&tmp_watcher_list.watchers, &queue); + while (!uv__queue_empty(&queue)) { + q = uv__queue_head(&queue); + uv__queue_remove(q); + handle = uv__queue_data(q, uv_fs_event_t, watchers); + tmp_path = handle->path; + handle->path = NULL; + err = uv_fs_event_start(handle, handle->cb, tmp_path, 0); + uv__free(tmp_path); + if (err) + return err; + } + + return 0; +} + + +static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) { + struct watcher_list w; + w.wd = wd; + return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w); +} + + +static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) { + /* if the watcher_list->watchers is being iterated over, we can't free it. */ + if ((!w->iterating) && uv__queue_empty(&w->watchers)) { + /* No watchers left for this path. Clean up. */ + RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w); + inotify_rm_watch(loop->inotify_fd, w->wd); + uv__free(w); + } +} + + +static void uv__inotify_read(uv_loop_t* loop, + uv__io_t* dummy, + unsigned int events) { + const struct inotify_event* e; + struct watcher_list* w; + uv_fs_event_t* h; + struct uv__queue queue; + struct uv__queue* q; + const char* path; + ssize_t size; + const char *p; + /* needs to be large enough for sizeof(inotify_event) + strlen(path) */ + char buf[4096]; + + for (;;) { + do + size = read(loop->inotify_fd, buf, sizeof(buf)); + while (size == -1 && errno == EINTR); + + if (size == -1) { + assert(errno == EAGAIN || errno == EWOULDBLOCK); + break; + } + + assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */ + + /* Now we have one or more inotify_event structs. */ + for (p = buf; p < buf + size; p += sizeof(*e) + e->len) { + e = (const struct inotify_event*) p; + + events = 0; + if (e->mask & (IN_ATTRIB|IN_MODIFY)) + events |= UV_CHANGE; + if (e->mask & ~(IN_ATTRIB|IN_MODIFY)) + events |= UV_RENAME; + + w = find_watcher(loop, e->wd); + if (w == NULL) + continue; /* Stale event, no watchers left. */ + + /* inotify does not return the filename when monitoring a single file + * for modifications. Repurpose the filename for API compatibility. + * I'm not convinced this is a good thing, maybe it should go. + */ + path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path); + + /* We're about to iterate over the queue and call user's callbacks. + * What can go wrong? + * A callback could call uv_fs_event_stop() + * and the queue can change under our feet. + * So, we use uv__queue_move() trick to safely iterate over the queue. + * And we don't free the watcher_list until we're done iterating. 
+ * + * First, + * tell uv_fs_event_stop() (that could be called from a user's callback) + * not to free watcher_list. + */ + w->iterating = 1; + uv__queue_move(&w->watchers, &queue); + while (!uv__queue_empty(&queue)) { + q = uv__queue_head(&queue); + h = uv__queue_data(q, uv_fs_event_t, watchers); + + uv__queue_remove(q); + uv__queue_insert_tail(&w->watchers, q); + + h->cb(h, path, events, 0); + } + /* done iterating, time to (maybe) free empty watcher_list */ + w->iterating = 0; + maybe_free_watcher_list(w, loop); + } + } +} + + +int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) { + uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT); + return 0; +} + + +int uv_fs_event_start(uv_fs_event_t* handle, + uv_fs_event_cb cb, + const char* path, + unsigned int flags) { + struct watcher_list* w; + uv_loop_t* loop; + size_t len; + int events; + int err; + int wd; + + if (uv__is_active(handle)) + return UV_EINVAL; + + loop = handle->loop; + + err = init_inotify(loop); + if (err) + return err; + + events = IN_ATTRIB + | IN_CREATE + | IN_MODIFY + | IN_DELETE + | IN_DELETE_SELF + | IN_MOVE_SELF + | IN_MOVED_FROM + | IN_MOVED_TO; + + wd = inotify_add_watch(loop->inotify_fd, path, events); + if (wd == -1) + return UV__ERR(errno); + + w = find_watcher(loop, wd); + if (w) + goto no_insert; + + len = strlen(path) + 1; + w = uv__malloc(sizeof(*w) + len); + if (w == NULL) + return UV_ENOMEM; + + w->wd = wd; + w->path = memcpy(w + 1, path, len); + uv__queue_init(&w->watchers); + w->iterating = 0; + RB_INSERT(watcher_root, uv__inotify_watchers(loop), w); + +no_insert: + uv__handle_start(handle); + uv__queue_insert_tail(&w->watchers, &handle->watchers); + handle->path = w->path; + handle->cb = cb; + handle->wd = wd; + + return 0; +} + + +int uv_fs_event_stop(uv_fs_event_t* handle) { + struct watcher_list* w; + + if (!uv__is_active(handle)) + return 0; + + w = find_watcher(handle->loop, handle->wd); + assert(w != NULL); + + handle->wd = -1; + handle->path = NULL; + uv__handle_stop(handle); + uv__queue_remove(&handle->watchers); + + maybe_free_watcher_list(w, handle->loop); + + return 0; +} + + +void uv__fs_event_close(uv_fs_event_t* handle) { + uv_fs_event_stop(handle); +}
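The code in this diff supplies the Linux backends for several public libuv calls: uv_get_free_memory()/uv_get_total_memory() (from /proc/meminfo or sysinfo()), uv_get_constrained_memory()/uv_get_available_memory() (from the cgroup v1/v2 memory files), and the uv_fs_event_* watchers (from inotify). A minimal caller sketch follows; it is illustrative only and not part of the imported file, the callback name on_fs_event and the watched path "." are arbitrary choices, and error checking is omitted for brevity.

#include <stdio.h>
#include <inttypes.h>
#include <uv.h>

/* uv_fs_event_cb: called with UV_RENAME and/or UV_CHANGE, which the
 * inotify reader above derives from the raw inotify mask. */
static void on_fs_event(uv_fs_event_t* handle,
                        const char* filename,
                        int events,
                        int status) {
  (void) handle;
  if (status < 0) {
    fprintf(stderr, "fs event error: %s\n", uv_strerror(status));
    return;
  }
  printf("%s: %s%s\n",
         filename != NULL ? filename : "(watched path)",
         (events & UV_RENAME) ? "rename " : "",
         (events & UV_CHANGE) ? "change" : "");
}

int main(void) {
  uv_fs_event_t watcher;
  uv_loop_t* loop = uv_default_loop();

  /* Memory queries backed by /proc/meminfo, sysinfo() and the cgroup
   * files parsed above. */
  printf("total:       %" PRIu64 " bytes\n", uv_get_total_memory());
  printf("free:        %" PRIu64 " bytes\n", uv_get_free_memory());
  printf("constrained: %" PRIu64 " bytes\n", uv_get_constrained_memory());
  printf("available:   %" PRIu64 " bytes\n", uv_get_available_memory());

  /* Watch the current directory ("." is an arbitrary example path);
   * events arrive through the inotify machinery above. */
  uv_fs_event_init(loop, &watcher);
  uv_fs_event_start(&watcher, on_fs_event, ".", 0);

  return uv_run(loop, UV_RUN_DEFAULT);
}

As implemented above, uv_get_available_memory() reports the cgroup limit minus current usage when a limit applies and falls back to uv_get_free_memory() otherwise, so the sketch's output reflects container limits when run inside one.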