comparison third_party/libuv/src/unix/linux.c @ 160:948de3f54cea

[ThirdParty] Added libuv
author June Park <parkjune1995@gmail.com>
date Wed, 14 Jan 2026 19:39:52 -0800
parents
children
comparison
equal deleted inserted replaced
159:05cf9467a1c3 160:948de3f54cea
1 /* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
2 * Permission is hereby granted, free of charge, to any person obtaining a copy
3 * of this software and associated documentation files (the "Software"), to
4 * deal in the Software without restriction, including without limitation the
5 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
6 * sell copies of the Software, and to permit persons to whom the Software is
7 * furnished to do so, subject to the following conditions:
8 *
9 * The above copyright notice and this permission notice shall be included in
10 * all copies or substantial portions of the Software.
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
17 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
18 * IN THE SOFTWARE.
19 */
20
21 /* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their
22 * EPOLL* counterparts. We use the POLL* variants in this file because that
23 * is what libuv uses elsewhere.
24 */
25
26 #include "uv.h"
27 #include "internal.h"
28
29 #include <inttypes.h>
30 #include <stdatomic.h>
31 #include <stddef.h> /* offsetof */
32 #include <stdint.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <assert.h>
37 #include <errno.h>
38
39 #include <fcntl.h>
40 #include <ifaddrs.h>
41 #include <net/ethernet.h>
42 #include <net/if.h>
43 #include <netpacket/packet.h>
44 #include <sys/epoll.h>
45 #include <sys/inotify.h>
46 #include <sys/mman.h>
47 #include <sys/param.h>
48 #include <sys/prctl.h>
49 #include <sys/socket.h>
50 #include <sys/stat.h>
51 #include <sys/syscall.h>
52 #include <sys/sysinfo.h>
53 #include <sys/sysmacros.h>
54 #include <sys/types.h>
55 #include <sys/utsname.h>
56 #include <time.h>
57 #include <unistd.h>
58
59 #ifndef __NR_io_uring_setup
60 # define __NR_io_uring_setup 425
61 #endif
62
63 #ifndef __NR_io_uring_enter
64 # define __NR_io_uring_enter 426
65 #endif
66
67 #ifndef __NR_io_uring_register
68 # define __NR_io_uring_register 427
69 #endif
70
71 #ifndef __NR_copy_file_range
72 # if defined(__x86_64__)
73 # define __NR_copy_file_range 326
74 # elif defined(__i386__)
75 # define __NR_copy_file_range 377
76 # elif defined(__s390__)
77 # define __NR_copy_file_range 375
78 # elif defined(__arm__)
79 # define __NR_copy_file_range 391
80 # elif defined(__aarch64__)
81 # define __NR_copy_file_range 285
82 # elif defined(__powerpc__)
83 # define __NR_copy_file_range 379
84 # elif defined(__arc__)
85 # define __NR_copy_file_range 285
86 # elif defined(__riscv)
87 # define __NR_copy_file_range 285
88 # endif
89 #endif /* __NR_copy_file_range */
90
91 #ifndef __NR_statx
92 # if defined(__x86_64__)
93 # define __NR_statx 332
94 # elif defined(__i386__)
95 # define __NR_statx 383
96 # elif defined(__aarch64__)
97 # define __NR_statx 397
98 # elif defined(__arm__)
99 # define __NR_statx 397
100 # elif defined(__ppc__)
101 # define __NR_statx 383
102 # elif defined(__s390__)
103 # define __NR_statx 379
104 # elif defined(__riscv)
105 # define __NR_statx 291
106 # endif
107 #endif /* __NR_statx */
108
109 #ifndef __NR_getrandom
110 # if defined(__x86_64__)
111 # define __NR_getrandom 318
112 # elif defined(__i386__)
113 # define __NR_getrandom 355
114 # elif defined(__aarch64__)
115 # define __NR_getrandom 384
116 # elif defined(__arm__)
117 # define __NR_getrandom 384
118 # elif defined(__ppc__)
119 # define __NR_getrandom 359
120 # elif defined(__s390__)
121 # define __NR_getrandom 349
122 # elif defined(__riscv)
123 # define __NR_getrandom 278
124 # endif
125 #endif /* __NR_getrandom */
126
/* io_uring ABI constants, mirroring the kernel's <linux/io_uring.h> values
 * (UV__-prefixed so libuv builds against older kernel headers). The numeric
 * values are kernel ABI and must not change.
 */
enum {
  UV__IORING_SETUP_SQPOLL = 2u,
  UV__IORING_SETUP_NO_SQARRAY = 0x10000u,  /* linux v6.6; see uv__iou_init. */
};

/* Feature bits reported back in io_uring_params.features. */
enum {
  UV__IORING_FEAT_SINGLE_MMAP = 1u,
  UV__IORING_FEAT_NODROP = 2u,
  UV__IORING_FEAT_RSRC_TAGS = 1024u, /* linux v5.13 */
};

/* Submission queue entry opcodes used by this file. */
enum {
  UV__IORING_OP_READV = 1,
  UV__IORING_OP_WRITEV = 2,
  UV__IORING_OP_FSYNC = 3,
  UV__IORING_OP_OPENAT = 18,
  UV__IORING_OP_CLOSE = 19,
  UV__IORING_OP_STATX = 21,
  UV__IORING_OP_EPOLL_CTL = 29,
  UV__IORING_OP_RENAMEAT = 35,
  UV__IORING_OP_UNLINKAT = 36,
  UV__IORING_OP_MKDIRAT = 37,
  UV__IORING_OP_SYMLINKAT = 38,
  UV__IORING_OP_LINKAT = 39,
  UV__IORING_OP_FTRUNCATE = 55,
};

/* Flags for io_uring_enter(2). */
enum {
  UV__IORING_ENTER_GETEVENTS = 1u,
  UV__IORING_ENTER_SQ_WAKEUP = 2u,
};

/* Bits of the shared sq ring flags word (see uv__iou_submit). */
enum {
  UV__IORING_SQ_NEED_WAKEUP = 1u,
  UV__IORING_SQ_CQ_OVERFLOW = 2u,
};
163
/* Local copies of the kernel's io_uring user-space structures. Their layout
 * is kernel ABI; the STATIC_ASSERTs below pin the sizes and field offsets so
 * a drift from the kernel's layout fails at compile time.
 */
struct uv__io_cqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t overflow;
  uint32_t cqes;
  uint64_t reserved0;
  uint64_t reserved1;
};

STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));

struct uv__io_sqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t flags;
  uint32_t dropped;
  uint32_t array;
  uint32_t reserved0;
  uint64_t reserved1;
};

STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));

/* Completion queue entry: res holds the result (negative errno on failure),
 * user_data round-trips the pointer stored in the matching SQE.
 */
struct uv__io_uring_cqe {
  uint64_t user_data;
  int32_t res;
  uint32_t flags;
};

STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));

/* Submission queue entry. Only the union members this file actually sets
 * are spelled out; pad[3] keeps the struct at the kernel's 64 bytes.
 */
struct uv__io_uring_sqe {
  uint8_t opcode;
  uint8_t flags;
  uint16_t ioprio;
  int32_t fd;
  union {
    uint64_t off;
    uint64_t addr2;
  };
  union {
    uint64_t addr;
  };
  uint32_t len;
  union {
    uint32_t rw_flags;
    uint32_t fsync_flags;
    uint32_t open_flags;
    uint32_t statx_flags;
  };
  uint64_t user_data;
  union {
    uint16_t buf_index;
    uint64_t pad[3];
  };
};

STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));

/* In/out parameter block for io_uring_setup(2); the kernel fills in the
 * entry counts, feature bits and the ring mmap offsets.
 */
struct uv__io_uring_params {
  uint32_t sq_entries;
  uint32_t cq_entries;
  uint32_t flags;
  uint32_t sq_thread_cpu;
  uint32_t sq_thread_idle;
  uint32_t features;
  uint32_t reserved[4];
  struct uv__io_sqring_offsets sq_off; /* 40 bytes */
  struct uv__io_cqring_offsets cq_off; /* 40 bytes */
};

STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));

/* NOTE(review): presumably the epoll op codes are packed into two spare
 * bits somewhere by the epoll-ctl batching code (uv__epoll_ctl_prep /
 * uv__epoll_ctl_flush, not visible in this chunk) — these asserts guard
 * that packing.
 */
STATIC_ASSERT(EPOLL_CTL_ADD < 4);
STATIC_ASSERT(EPOLL_CTL_DEL < 4);
STATIC_ASSERT(EPOLL_CTL_MOD < 4);
256
/* Per-path inotify bookkeeping node. Nodes live in a red-black tree rooted
 * at loop->inotify_watchers (ordered by compare_watchers, whose definition
 * is further down the file).
 */
struct watcher_list {
  RB_ENTRY(watcher_list) entry;
  struct uv__queue watchers;  /* Handles watching this path, walked by uv__inotify_read. */
  int iterating;              /* Non-zero while the watchers queue is being walked. */
  char* path;
  int wd;                     /* inotify watch descriptor. */
};

/* Tree root; sole member so it can be punned from loop->inotify_watchers
 * (see uv__inotify_watchers below).
 */
struct watcher_root {
  struct watcher_list* rbh_root;
};

/* Forward declarations; the definitions appear later in the file. */
static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
static void uv__inotify_read(uv_loop_t* loop,
                             uv__io_t* w,
                             unsigned int revents);
static int compare_watchers(const struct watcher_list* a,
                            const struct watcher_list* b);
static void maybe_free_watcher_list(struct watcher_list* w,
                                    uv_loop_t* loop);

static void uv__epoll_ctl_flush(int epollfd,
                                struct uv__iou* ctl,
                                struct epoll_event (*events)[256]);

static void uv__epoll_ctl_prep(int epollfd,
                               struct uv__iou* ctl,
                               struct epoll_event (*events)[256],
                               int op,
                               int fd,
                               struct epoll_event* e);

/* Instantiate the red-black tree accessors for watcher_root. */
RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)
290
291
292 static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
293 /* This cast works because watcher_root is a struct with a pointer as its
294 * sole member. Such type punning is unsafe in the presence of strict
295 * pointer aliasing (and is just plain nasty) but that is why libuv
296 * is compiled with -fno-strict-aliasing.
297 */
298 return (struct watcher_root*) &loop->inotify_watchers;
299 }
300
301
unsigned uv__kernel_version(void) {
  /* Returns the kernel version packed as major*65536 + minor*256 + patch,
   * or 0 when it cannot be determined. The result is cached in a relaxed
   * atomic; racing initializers compute the same value so the race is
   * benign.
   */
  static _Atomic unsigned cached;
  struct utsname name;
  char vsig[256];
  char* deb;
  unsigned maj;
  unsigned min;
  unsigned rev;
  unsigned v;
  int parsed;

  v = atomic_load_explicit(&cached, memory_order_relaxed);
  if (v != 0)
    return v;

  /* Ubuntu publishes the mainline version in /proc/version_signature,
   * formatted as "Ubuntu <ubuntu_version> <mainline_version>", e.g.
   * "Ubuntu 5.15.0-79.86-generic 5.15.111".
   */
  parsed = 0;
  if (0 == uv__slurp("/proc/version_signature", vsig, sizeof(vsig)))
    parsed = (3 == sscanf(vsig, "Ubuntu %*s %u.%u.%u", &maj, &min, &rev));

  if (!parsed) {
    if (-1 == uname(&name))
      return 0;

    /* Debian hides the mainline version in the version string, e.g.
     * "#1 SMP Debian 5.10.46-4 (2021-08-03)".
     */
    deb = strstr(name.version, "Debian ");
    if (deb != NULL)
      parsed = (3 == sscanf(deb, "Debian %u.%u.%u", &maj, &min, &rev));
  }

  if (!parsed) {
    if (3 != sscanf(name.release, "%u.%u.%u", &maj, &min, &rev))
      return 0;

    /* Undo the UNAME26 personality's backcompat mangling:
     *
     * - kernels >= 3.x identify as 2.6.40+x
     * - kernels >= 4.x identify as 2.6.60+x
     *
     * UNAME26 cannot distinguish 4.x from 5.x/6.x kernels, so 2.6.60+x is
     * conservatively mapped to 4.x.
     *
     * Fun fact of the day: the real version is briefly observable because
     * uname() copies out the genuine release string before overwriting it
     * with the backcompat one.
     */
    if (maj == 2 && min == 6) {
      if (rev >= 60) {
        maj = 4;
        min = rev - 60;
        rev = 0;
      } else if (rev >= 40) {
        maj = 3;
        min = rev - 40;
        rev = 0;
      }
    }
  }

  v = maj * 65536 + min * 256 + rev;
  atomic_store_explicit(&cached, v, memory_order_relaxed);

  return v;
}
372
373
ssize_t
uv__fs_copy_file_range(int fd_in,
                       off_t* off_in,
                       int fd_out,
                       off_t* off_out,
                       size_t len,
                       unsigned int flags)
{
  /* Raw copy_file_range(2) wrapper; older libcs have no stub for it.
   * Returns the number of bytes copied, or -1 with errno set. On builds
   * where the syscall number is unknown, fails with ENOSYS.
   */
#ifdef __NR_copy_file_range
  long rc;

  rc = syscall(__NR_copy_file_range,
               fd_in,
               off_in,
               fd_out,
               off_out,
               len,
               flags);

  return rc;
#else
  errno = ENOSYS;
  return -1;
#endif
}
394
395
396 int uv__statx(int dirfd,
397 const char* path,
398 int flags,
399 unsigned int mask,
400 struct uv__statx* statxbuf) {
401 #if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
402 return errno = ENOSYS, -1;
403 #else
404 int rc;
405
406 rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
407 if (rc >= 0)
408 uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
409
410 return rc;
411 #endif
412 }
413
414
ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
  /* Raw getrandom(2) wrapper for libcs without a stub. Returns the number
   * of bytes written, or -1 with errno set (ENOSYS when the syscall is
   * unavailable or blocked on old Android API levels).
   */
#if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28
  errno = ENOSYS;
  return -1;
#else
  ssize_t n = syscall(__NR_getrandom, buf, buflen, flags);

  /* MSan cannot see the kernel writing the buffer; mark it initialized. */
  if (n != -1)
    uv__msan_unpoison(buf, buflen);

  return n;
#endif
}
428
429
int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
  /* Thin io_uring_setup(2) wrapper; glibc provides no stub. Returns a
   * ring file descriptor (with O_CLOEXEC set by the kernel) on success,
   * -1 with errno set on failure.
   */
  long rc;

  rc = syscall(__NR_io_uring_setup, entries, params);
  return (int) rc;
}
433
434
int uv__io_uring_enter(int fd,
                       unsigned to_submit,
                       unsigned min_complete,
                       unsigned flags) {
  /* Thin io_uring_enter(2) wrapper. The fifth argument used to be a
   * sigset_t; newer kernels ignore it unless IORING_ENTER_EXT_ARG is set,
   * in which case it is reinterpreted as a struct io_uring_getevents_arg.
   * This caller passes neither.
   */
  long rc;

  rc = syscall(__NR_io_uring_enter,
               fd,
               to_submit,
               min_complete,
               flags,
               NULL,
               0L);
  return (int) rc;
}
451
452
int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
  /* Thin io_uring_register(2) wrapper; returns 0 on success, -1 with
   * errno set on failure.
   */
  long rc = syscall(__NR_io_uring_register, fd, opcode, arg, nargs);

  return (int) rc;
}
456
457
458 static int uv__use_io_uring(uint32_t flags) {
459 #if defined(__ANDROID_API__)
460 return 0; /* Possibly available but blocked by seccomp. */
461 #elif defined(__arm__) && __SIZEOF_POINTER__ == 4
462 /* See https://github.com/libuv/libuv/issues/4158. */
463 return 0; /* All 32 bits kernels appear buggy. */
464 #elif defined(__powerpc64__) || defined(__ppc64__)
465 /* See https://github.com/libuv/libuv/issues/4283. */
466 return 0; /* Random SIGSEGV in signal handler. */
467 #else
468 /* Ternary: unknown=0, yes=1, no=-1 */
469 static _Atomic int use_io_uring;
470 char* val;
471 int use;
472
473 #if defined(__hppa__)
474 /* io_uring first supported on parisc in 6.1, functional in .51
475 * https://lore.kernel.org/all/[email protected]/
476 */
477 if (uv__kernel_version() < /*6.1.51*/0x060133)
478 return 0;
479 #endif
480
481 /* SQPOLL is all kinds of buggy but epoll batching should work fine. */
482 if (0 == (flags & UV__IORING_SETUP_SQPOLL))
483 return 1;
484
485 /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */
486 if (uv__kernel_version() < /*5.10.186*/0x050ABA)
487 return 0;
488
489 use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);
490
491 if (use == 0) {
492 val = getenv("UV_USE_IO_URING");
493 use = val != NULL && atoi(val) > 0 ? 1 : -1;
494 atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
495 }
496
497 return use > 0;
498 #endif
499 }
500
501
/* Create an io_uring ring with the given flags and wire it into `iou`.
 * Best-effort: on any failure the function simply returns, leaving
 * iou->ringfd untouched so callers fall back to the thread pool / plain
 * epoll. When SQPOLL is requested, the ring fd is also registered with
 * the loop's epoll so completions wake up uv__io_poll().
 */
static void uv__iou_init(int epollfd,
                         struct uv__iou* iou,
                         uint32_t entries,
                         uint32_t flags) {
  struct uv__io_uring_params params;
  struct epoll_event e;
  size_t cqlen;
  size_t sqlen;
  size_t maxlen;
  size_t sqelen;
  unsigned kernel_version;
  uint32_t* sqarray;
  uint32_t i;
  char* sq;
  char* sqe;
  int ringfd;
  int no_sqarray;

  /* Sentinels so the fail path knows which mappings to undo. */
  sq = MAP_FAILED;
  sqe = MAP_FAILED;

  if (!uv__use_io_uring(flags))
    return;

  kernel_version = uv__kernel_version();
  /* Skip the legacy sq indirection array on kernels that support it. */
  no_sqarray =
      UV__IORING_SETUP_NO_SQARRAY * (kernel_version >= /* 6.6 */0x060600);

  /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
   * Mostly academic because we check for a v5.13 kernel afterwards anyway.
   */
  memset(&params, 0, sizeof(params));
  params.flags = flags | no_sqarray;

  if (flags & UV__IORING_SETUP_SQPOLL)
    params.sq_thread_idle = 10; /* milliseconds */

  /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
  ringfd = uv__io_uring_setup(entries, &params);
  if (ringfd == -1)
    return;

  /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
   * actually detecting is whether IORING_OP_STATX works with SQPOLL.
   */
  if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
    goto fail;

  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
  if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
    goto fail;

  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
  if (!(params.features & UV__IORING_FEAT_NODROP))
    goto fail;

  /* With FEAT_SINGLE_MMAP the sq and cq rings share one mapping, so map
   * the larger of the two lengths; the sqe array is a separate mapping.
   */
  sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
  cqlen =
      params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
  maxlen = sqlen < cqlen ? cqlen : sqlen;
  sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);

  sq = mmap(0,
            maxlen,
            PROT_READ | PROT_WRITE,
            MAP_SHARED | MAP_POPULATE,
            ringfd,
            0); /* IORING_OFF_SQ_RING */

  sqe = mmap(0,
             sqelen,
             PROT_READ | PROT_WRITE,
             MAP_SHARED | MAP_POPULATE,
             ringfd,
             0x10000000ull); /* IORING_OFF_SQES */

  if (sq == MAP_FAILED || sqe == MAP_FAILED)
    goto fail;

  if (flags & UV__IORING_SETUP_SQPOLL) {
    /* Only interested in completion events. To get notified when
     * the kernel pulls items from the submission ring, add POLLOUT.
     */
    memset(&e, 0, sizeof(e));
    e.events = POLLIN;
    e.data.fd = ringfd;

    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
      goto fail;
  }

  /* Resolve the kernel-provided offsets into pointers once, up front. */
  iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
  iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
  iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
  iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
  iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
  iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
  iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
  iou->sq = sq;
  iou->cqe = sq + params.cq_off.cqes;
  iou->sqe = sqe;
  iou->sqlen = sqlen;
  iou->cqlen = cqlen;
  iou->maxlen = maxlen;
  iou->sqelen = sqelen;
  iou->ringfd = ringfd;
  iou->in_flight = 0;

  if (no_sqarray)
    return;

  sqarray = (uint32_t*) (sq + params.sq_off.array);
  for (i = 0; i <= iou->sqmask; i++)
    sqarray[i] = i; /* Slot -> sqe identity mapping. */

  return;

fail:
  if (sq != MAP_FAILED)
    munmap(sq, maxlen);

  if (sqe != MAP_FAILED)
    munmap(sqe, sqelen);

  uv__close(ringfd);
}
628
629
630 static void uv__iou_delete(struct uv__iou* iou) {
631 if (iou->ringfd > -1) {
632 munmap(iou->sq, iou->maxlen);
633 munmap(iou->sqe, iou->sqelen);
634 uv__close(iou->ringfd);
635 iou->ringfd = -1;
636 }
637 }
638
639
640 int uv__platform_loop_init(uv_loop_t* loop) {
641 uv__loop_internal_fields_t* lfields;
642
643 lfields = uv__get_internal_fields(loop);
644 lfields->ctl.ringfd = -1;
645 lfields->iou.ringfd = -2; /* "uninitialized" */
646
647 loop->inotify_watchers = NULL;
648 loop->inotify_fd = -1;
649 loop->backend_fd = epoll_create1(O_CLOEXEC);
650
651 if (loop->backend_fd == -1)
652 return UV__ERR(errno);
653
654 uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);
655
656 return 0;
657 }
658
659
/* Reinitialize the loop's kernel-side state in the child after fork():
 * the epoll and io_uring descriptors are torn down and recreated, then the
 * surviving inotify watcher tree is re-registered against a fresh inotify
 * fd. Returns 0 on success, a libuv error code otherwise.
 */
int uv__io_fork(uv_loop_t* loop) {
  int err;
  struct watcher_list* root;

  /* Save the watcher tree; uv__platform_loop_init() resets
   * loop->inotify_watchers to NULL below.
   */
  root = uv__inotify_watchers(loop)->rbh_root;

  uv__close(loop->backend_fd);
  loop->backend_fd = -1;

  /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
  uv__platform_loop_delete(loop);

  err = uv__platform_loop_init(loop);
  if (err)
    return err;

  return uv__inotify_fork(loop, root);
}
678
679
680 void uv__platform_loop_delete(uv_loop_t* loop) {
681 uv__loop_internal_fields_t* lfields;
682
683 lfields = uv__get_internal_fields(loop);
684 uv__iou_delete(&lfields->ctl);
685 uv__iou_delete(&lfields->iou);
686
687 if (loop->inotify_fd != -1) {
688 uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
689 uv__close(loop->inotify_fd);
690 loop->inotify_fd = -1;
691 }
692 }
693
694
/* Bookkeeping handed to uv__platform_invalidate_fd() via lfields->inv:
 * the epoll_event buffers the poll loop is currently iterating, so events
 * for a just-closed fd can be struck out (fd set to -1) in place.
 */
struct uv__invalidate {
  struct epoll_event (*prep)[256];  /* Batched epoll-ctl events; presumably
                                     * filled by uv__epoll_ctl_prep — the
                                     * definition is not in this chunk. */
  struct epoll_event* events;       /* Events returned by the current poll. */
  int nfds;                         /* Number of valid entries in events. */
};
700
701
702 void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
703 uv__loop_internal_fields_t* lfields;
704 struct uv__invalidate* inv;
705 struct epoll_event dummy;
706 int i;
707
708 lfields = uv__get_internal_fields(loop);
709 inv = lfields->inv;
710
711 /* Invalidate events with same file descriptor */
712 if (inv != NULL)
713 for (i = 0; i < inv->nfds; i++)
714 if (inv->events[i].data.fd == fd)
715 inv->events[i].data.fd = -1;
716
717 /* Remove the file descriptor from the epoll.
718 * This avoids a problem where the same file description remains open
719 * in another process, causing repeated junk epoll events.
720 *
721 * Perform EPOLL_CTL_DEL immediately instead of going through
722 * io_uring's submit queue, otherwise the file descriptor may
723 * be closed by the time the kernel starts the operation.
724 *
725 * We pass in a dummy epoll_event, to work around a bug in old kernels.
726 *
727 * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
728 * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
729 */
730 memset(&dummy, 0, sizeof(dummy));
731 epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
732 }
733
734
735 int uv__io_check_fd(uv_loop_t* loop, int fd) {
736 struct epoll_event e;
737 int rc;
738
739 memset(&e, 0, sizeof(e));
740 e.events = POLLIN;
741 e.data.fd = -1;
742
743 rc = 0;
744 if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
745 if (errno != EEXIST)
746 rc = UV__ERR(errno);
747
748 if (rc == 0)
749 if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
750 abort();
751
752 return rc;
753 }
754
755
/* Caller must initialize SQE and call uv__iou_submit().
 *
 * Reserves the next free submission queue entry, zeroed and with user_data
 * pointing at `req`, or returns NULL when io_uring is unavailable or the
 * ring is full (callers then fall back to the thread pool). Also registers
 * the request with the loop so the loop stays alive until the completion
 * is reaped.
 */
static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
                                                uv_loop_t* loop,
                                                uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  uint32_t head;
  uint32_t tail;
  uint32_t mask;
  uint32_t slot;

  /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
   * initialization failed. Anything else is a valid ring file descriptor.
   */
  if (iou->ringfd == -2) {
    /* By default, the SQPOLL is not created. Enable only if the loop is
     * configured with UV_LOOP_USE_IO_URING_SQPOLL and the UV_USE_IO_URING
     * environment variable is unset or a positive number.
     */
    if (loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL)
      if (uv__use_io_uring(UV__IORING_SETUP_SQPOLL))
        uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);

    if (iou->ringfd == -2)
      iou->ringfd = -1; /* "failed" */
  }

  if (iou->ringfd == -1)
    return NULL;

  /* Acquire-load of the head pairs with the kernel consumer's release;
   * only this thread writes the tail, so a plain read suffices there.
   */
  head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
                              memory_order_acquire);
  tail = *iou->sqtail;
  mask = iou->sqmask;

  if ((head & mask) == ((tail + 1) & mask))
    return NULL; /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */

  slot = tail & mask;
  sqe = iou->sqe;
  sqe = &sqe[slot];
  memset(sqe, 0, sizeof(*sqe));
  /* Round-trips through the matching CQE's user_data. */
  sqe->user_data = (uintptr_t) req;

  /* Pacify uv_cancel(). */
  req->work_req.loop = loop;
  req->work_req.work = NULL;
  req->work_req.done = NULL;
  uv__queue_init(&req->work_req.wq);

  uv__req_register(loop);
  iou->in_flight++;

  return sqe;
}
810
811
/* Publish the most recently prepared SQE to the kernel by advancing the
 * shared tail, and wake the SQPOLL thread when it has flagged itself as
 * asleep.
 */
static void uv__iou_submit(struct uv__iou* iou) {
  uint32_t flags;

  /* Release-store so the kernel observes the fully written SQE before it
   * sees the new tail.
   */
  atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
                        *iou->sqtail + 1,
                        memory_order_release);

  flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
                               memory_order_acquire);

  if (flags & UV__IORING_SQ_NEED_WAKEUP)
    if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
      if (errno != EOWNERDEAD) /* Kernel bug. Harmless, ignore. */
        perror("libuv: io_uring_enter(wakeup)"); /* Can't happen. */
}
827
828
829 int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
830 struct uv__io_uring_sqe* sqe;
831 struct uv__iou* iou;
832 int kv;
833
834 kv = uv__kernel_version();
835 /* Work around a poorly understood bug in older kernels where closing a file
836 * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
837 * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
838 * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
839 * but good candidates are the several data race fixes. Interestingly, it
840 * seems to manifest only when running under Docker so the possibility of
841 * a Docker bug can't be completely ruled out either. Yay, computers.
842 * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and
843 * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be
844 * solved.
845 */
846 if (kv < /* 5.15.90 */ 0x050F5A)
847 return 0;
848
849 if (kv >= /* 5.16.0 */ 0x050A00 && kv < /* 6.1.0 */ 0x060100)
850 return 0;
851
852
853 iou = &uv__get_internal_fields(loop)->iou;
854
855 sqe = uv__iou_get_sqe(iou, loop, req);
856 if (sqe == NULL)
857 return 0;
858
859 sqe->fd = req->file;
860 sqe->opcode = UV__IORING_OP_CLOSE;
861
862 uv__iou_submit(iou);
863
864 return 1;
865 }
866
867
868 int uv__iou_fs_ftruncate(uv_loop_t* loop, uv_fs_t* req) {
869 struct uv__io_uring_sqe* sqe;
870 struct uv__iou* iou;
871
872 if (uv__kernel_version() < /* 6.9 */0x060900)
873 return 0;
874
875 iou = &uv__get_internal_fields(loop)->iou;
876 sqe = uv__iou_get_sqe(iou, loop, req);
877 if (sqe == NULL)
878 return 0;
879
880 sqe->fd = req->file;
881 sqe->len = req->off;
882 sqe->opcode = UV__IORING_OP_FTRUNCATE;
883 uv__iou_submit(iou);
884
885 return 1;
886 }
887
888 int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
889 uv_fs_t* req,
890 uint32_t fsync_flags) {
891 struct uv__io_uring_sqe* sqe;
892 struct uv__iou* iou;
893
894 iou = &uv__get_internal_fields(loop)->iou;
895
896 sqe = uv__iou_get_sqe(iou, loop, req);
897 if (sqe == NULL)
898 return 0;
899
900 /* Little known fact: setting seq->off and seq->len turns
901 * it into an asynchronous sync_file_range() operation.
902 */
903 sqe->fd = req->file;
904 sqe->fsync_flags = fsync_flags;
905 sqe->opcode = UV__IORING_OP_FSYNC;
906
907 uv__iou_submit(iou);
908
909 return 1;
910 }
911
912
913 int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
914 struct uv__io_uring_sqe* sqe;
915 struct uv__iou* iou;
916
917 if (uv__kernel_version() < /* 5.15.0 */0x050F00)
918 return 0;
919
920 iou = &uv__get_internal_fields(loop)->iou;
921 sqe = uv__iou_get_sqe(iou, loop, req);
922 if (sqe == NULL)
923 return 0;
924
925 sqe->addr = (uintptr_t) req->path;
926 sqe->fd = AT_FDCWD;
927 sqe->addr2 = (uintptr_t) req->new_path;
928 sqe->len = AT_FDCWD;
929 sqe->opcode = UV__IORING_OP_LINKAT;
930
931 uv__iou_submit(iou);
932
933 return 1;
934 }
935
936
937 int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
938 struct uv__io_uring_sqe* sqe;
939 struct uv__iou* iou;
940
941 if (uv__kernel_version() < /* 5.15.0 */0x050F00)
942 return 0;
943
944 iou = &uv__get_internal_fields(loop)->iou;
945 sqe = uv__iou_get_sqe(iou, loop, req);
946 if (sqe == NULL)
947 return 0;
948
949 sqe->addr = (uintptr_t) req->path;
950 sqe->fd = AT_FDCWD;
951 sqe->len = req->mode;
952 sqe->opcode = UV__IORING_OP_MKDIRAT;
953
954 uv__iou_submit(iou);
955
956 return 1;
957 }
958
959
960 int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
961 struct uv__io_uring_sqe* sqe;
962 struct uv__iou* iou;
963
964 iou = &uv__get_internal_fields(loop)->iou;
965
966 sqe = uv__iou_get_sqe(iou, loop, req);
967 if (sqe == NULL)
968 return 0;
969
970 sqe->addr = (uintptr_t) req->path;
971 sqe->fd = AT_FDCWD;
972 sqe->len = req->mode;
973 sqe->opcode = UV__IORING_OP_OPENAT;
974 sqe->open_flags = req->flags | O_CLOEXEC;
975
976 uv__iou_submit(iou);
977
978 return 1;
979 }
980
981
982 int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
983 struct uv__io_uring_sqe* sqe;
984 struct uv__iou* iou;
985
986 iou = &uv__get_internal_fields(loop)->iou;
987
988 sqe = uv__iou_get_sqe(iou, loop, req);
989 if (sqe == NULL)
990 return 0;
991
992 sqe->addr = (uintptr_t) req->path;
993 sqe->fd = AT_FDCWD;
994 sqe->addr2 = (uintptr_t) req->new_path;
995 sqe->len = AT_FDCWD;
996 sqe->opcode = UV__IORING_OP_RENAMEAT;
997
998 uv__iou_submit(iou);
999
1000 return 1;
1001 }
1002
1003
1004 int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
1005 struct uv__io_uring_sqe* sqe;
1006 struct uv__iou* iou;
1007
1008 if (uv__kernel_version() < /* 5.15.0 */0x050F00)
1009 return 0;
1010
1011 iou = &uv__get_internal_fields(loop)->iou;
1012 sqe = uv__iou_get_sqe(iou, loop, req);
1013 if (sqe == NULL)
1014 return 0;
1015
1016 sqe->addr = (uintptr_t) req->path;
1017 sqe->fd = AT_FDCWD;
1018 sqe->addr2 = (uintptr_t) req->new_path;
1019 sqe->opcode = UV__IORING_OP_SYMLINKAT;
1020
1021 uv__iou_submit(iou);
1022
1023 return 1;
1024 }
1025
1026
1027 int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
1028 struct uv__io_uring_sqe* sqe;
1029 struct uv__iou* iou;
1030
1031 iou = &uv__get_internal_fields(loop)->iou;
1032
1033 sqe = uv__iou_get_sqe(iou, loop, req);
1034 if (sqe == NULL)
1035 return 0;
1036
1037 sqe->addr = (uintptr_t) req->path;
1038 sqe->fd = AT_FDCWD;
1039 sqe->opcode = UV__IORING_OP_UNLINKAT;
1040
1041 uv__iou_submit(iou);
1042
1043 return 1;
1044 }
1045
1046
1047 int uv__iou_fs_read_or_write(uv_loop_t* loop,
1048 uv_fs_t* req,
1049 int is_read) {
1050 struct uv__io_uring_sqe* sqe;
1051 struct uv__iou* iou;
1052
1053 /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fallback
1054 * to the threadpool on writes */
1055 if (req->nbufs > IOV_MAX) {
1056 if (is_read)
1057 req->nbufs = IOV_MAX;
1058 else
1059 return 0;
1060 }
1061
1062 iou = &uv__get_internal_fields(loop)->iou;
1063
1064 sqe = uv__iou_get_sqe(iou, loop, req);
1065 if (sqe == NULL)
1066 return 0;
1067
1068 sqe->addr = (uintptr_t) req->bufs;
1069 sqe->fd = req->file;
1070 sqe->len = req->nbufs;
1071 sqe->off = req->off < 0 ? -1 : req->off;
1072 sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;
1073
1074 uv__iou_submit(iou);
1075
1076 return 1;
1077 }
1078
1079
1080 int uv__iou_fs_statx(uv_loop_t* loop,
1081 uv_fs_t* req,
1082 int is_fstat,
1083 int is_lstat) {
1084 struct uv__io_uring_sqe* sqe;
1085 struct uv__statx* statxbuf;
1086 struct uv__iou* iou;
1087
1088 statxbuf = uv__malloc(sizeof(*statxbuf));
1089 if (statxbuf == NULL)
1090 return 0;
1091
1092 iou = &uv__get_internal_fields(loop)->iou;
1093
1094 sqe = uv__iou_get_sqe(iou, loop, req);
1095 if (sqe == NULL) {
1096 uv__free(statxbuf);
1097 return 0;
1098 }
1099
1100 req->ptr = statxbuf;
1101
1102 sqe->addr = (uintptr_t) req->path;
1103 sqe->addr2 = (uintptr_t) statxbuf;
1104 sqe->fd = AT_FDCWD;
1105 sqe->len = 0xFFF; /* STATX_BASIC_STATS + STATX_BTIME */
1106 sqe->opcode = UV__IORING_OP_STATX;
1107
1108 if (is_fstat) {
1109 sqe->addr = (uintptr_t) "";
1110 sqe->fd = req->file;
1111 sqe->statx_flags |= 0x1000; /* AT_EMPTY_PATH */
1112 }
1113
1114 if (is_lstat)
1115 sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;
1116
1117 uv__iou_submit(iou);
1118
1119 return 1;
1120 }
1121
1122
/* Copy a kernel struct statx into libuv's uv_stat_t. Fields statx has no
 * counterpart for (st_flags, st_gen) are zeroed.
 */
void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
  buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
  buf->st_mode = statxbuf->stx_mode;
  buf->st_nlink = statxbuf->stx_nlink;
  buf->st_uid = statxbuf->stx_uid;
  buf->st_gid = statxbuf->stx_gid;
  buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
  buf->st_ino = statxbuf->stx_ino;
  buf->st_size = statxbuf->stx_size;
  buf->st_blksize = statxbuf->stx_blksize;
  buf->st_blocks = statxbuf->stx_blocks;
  buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
  buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
  buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
  buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
  buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
  buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
  buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
  buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
  buf->st_flags = 0;
  buf->st_gen = 0;
}
1145
1146
1147 static void uv__iou_fs_statx_post(uv_fs_t* req) {
1148 struct uv__statx* statxbuf;
1149 uv_stat_t* buf;
1150
1151 buf = &req->statbuf;
1152 statxbuf = req->ptr;
1153 req->ptr = NULL;
1154
1155 if (req->result == 0) {
1156 uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
1157 uv__statx_to_stat(statxbuf, buf);
1158 req->ptr = buf;
1159 }
1160
1161 uv__free(statxbuf);
1162 }
1163
1164
/* Drain completed operations from the io_uring completion queue and run
 * their callbacks. Called from uv__io_poll() when the ring fd polls
 * readable.
 */
static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
  struct uv__io_uring_cqe* cqe;
  struct uv__io_uring_cqe* e;
  uv_fs_t* req;
  uint32_t head;
  uint32_t tail;
  uint32_t mask;
  uint32_t i;
  uint32_t flags;
  int nevents;
  int rc;

  /* Only the kernel advances the tail, hence the acquire load; the head is
   * only written by us so a plain read suffices. */
  head = *iou->cqhead;
  tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
                              memory_order_acquire);
  mask = iou->cqmask;
  cqe = iou->cqe;
  nevents = 0;

  for (i = head; i != tail; i++) {
    e = &cqe[i & mask];

    /* user_data carries the uv_fs_t* we stored at submission time. */
    req = (uv_fs_t*) (uintptr_t) e->user_data;
    assert(req->type == UV_FS);

    uv__req_unregister(loop);
    iou->in_flight--;

    /* If the op is not supported by the kernel retry using the thread pool */
    if (e->res == -EOPNOTSUPP) {
      uv__fs_post(loop, req);
      continue;
    }

    /* io_uring stores error codes as negative numbers, same as libuv. */
    req->result = e->res;

    switch (req->fs_type) {
      case UV_FS_FSTAT:
      case UV_FS_LSTAT:
      case UV_FS_STAT:
        uv__iou_fs_statx_post(req);
        break;
      default: /* Squelch -Wswitch warnings. */
        break;
    }

    uv__metrics_update_idle_time(loop);
    req->cb(req);
    nevents++;
  }

  /* Release store publishes the new head so the kernel may reuse slots. */
  atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
                        tail,
                        memory_order_release);

  /* Check whether CQE's overflowed, if so enter the kernel to make them
   * available. Don't grab them immediately but in the next loop iteration to
   * avoid loop starvation. */
  flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
                               memory_order_acquire);

  if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
    do
      rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
    while (rc == -1 && errno == EINTR);

    if (rc < 0)
      perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */
  }

  uv__metrics_inc_events(loop, nevents);
  if (uv__get_internal_fields(loop)->current_timeout == 0)
    uv__metrics_inc_events_waiting(loop, nevents);
}
1240
1241
1242 /* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
1243 * executed immediately, otherwise the file descriptor may have been closed
1244 * by the time the kernel starts the operation.
1245 */
/* Queue one epoll_ctl operation on the io_uring control ring so that a
 * batch of them can be submitted with a single io_uring_enter() in
 * uv__epoll_ctl_flush(). The epoll_event is copied into (*events)[slot]
 * so it stays alive until the kernel consumes the sqe.
 */
static void uv__epoll_ctl_prep(int epollfd,
                               struct uv__iou* ctl,
                               struct epoll_event (*events)[256],
                               int op,
                               int fd,
                               struct epoll_event* e) {
  struct uv__io_uring_sqe* sqe;
  struct epoll_event* pe;
  uint32_t mask;
  uint32_t slot;

  assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
  assert(ctl->ringfd != -1);

  mask = ctl->sqmask;
  slot = (*ctl->sqtail)++ & mask;

  /* Keep a stable copy of the event for the kernel to read. */
  pe = &(*events)[slot];
  *pe = *e;

  sqe = ctl->sqe;
  sqe = &sqe[slot];

  memset(sqe, 0, sizeof(*sqe));
  sqe->addr = (uintptr_t) pe;
  sqe->fd = epollfd;
  sqe->len = op;
  sqe->off = fd;
  sqe->opcode = UV__IORING_OP_EPOLL_CTL;
  /* Pack op (low 2 bits), slot (next 8 bits) and fd (high 32 bits) so that
   * uv__epoll_ctl_flush() can recover them from the cqe's user_data. */
  sqe->user_data = op | slot << 2 | (int64_t) fd << 32;

  /* Submission queue wrapped around: flush now to free up slots. */
  if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
    uv__epoll_ctl_flush(epollfd, ctl, events);
}
1280
1281
/* Submit every pending epoll_ctl sqe on the control ring in one
 * io_uring_enter() call and reap the completions. Failed EPOLL_CTL_ADD
 * operations that hit EEXIST are re-queued as EPOLL_CTL_MOD; see the
 * comment above the loop below.
 */
static void uv__epoll_ctl_flush(int epollfd,
                                struct uv__iou* ctl,
                                struct epoll_event (*events)[256]) {
  struct epoll_event oldevents[256];
  struct uv__io_uring_cqe* cqe;
  uint32_t oldslot;
  uint32_t slot;
  uint32_t n;
  int fd;
  int op;
  int rc;

  STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
  assert(ctl->ringfd != -1);
  assert(*ctl->sqhead != *ctl->sqtail);

  /* Submit all queued entries and wait for the same number of completions. */
  n = *ctl->sqtail - *ctl->sqhead;
  do
    rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
  while (rc == -1 && errno == EINTR);

  if (rc < 0)
    perror("libuv: io_uring_enter(getevents)"); /* Can't happen. */

  if (rc != (int) n)
    abort();

  assert(*ctl->sqhead == *ctl->sqtail);

  /* Retried operations below call uv__epoll_ctl_prep() which overwrites
   * slots in *events, so work from a snapshot of the submitted events. */
  memcpy(oldevents, *events, sizeof(*events));

  /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
   * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
   * that we are already watching. Ignore the former and retry the latter
   * with EPOLL_CTL_MOD.
   */
  while (*ctl->cqhead != *ctl->cqtail) {
    slot = (*ctl->cqhead)++ & ctl->cqmask;

    cqe = ctl->cqe;
    cqe = &cqe[slot];

    if (cqe->res == 0)
      continue;

    /* Decode the fields packed into user_data by uv__epoll_ctl_prep(). */
    fd = cqe->user_data >> 32;
    op = 3 & cqe->user_data;
    oldslot = 255 & (cqe->user_data >> 2);

    if (op == EPOLL_CTL_DEL)
      continue;

    if (op != EPOLL_CTL_ADD)
      abort();

    if (cqe->res != -EEXIST)
      abort();

    uv__epoll_ctl_prep(epollfd,
                       ctl,
                       events,
                       EPOLL_CTL_MOD,
                       fd,
                       &oldevents[oldslot]);
  }
}
1348
1349
/* Poll for I/O. Flushes pending watcher registrations to epoll (batched
 * through the io_uring control ring when available), blocks in
 * epoll_pwait() for at most `timeout` milliseconds (-1 blocks
 * indefinitely), then dispatches watcher callbacks, io_uring completions
 * and signal events.
 */
void uv__io_poll(uv_loop_t* loop, int timeout) {
  uv__loop_internal_fields_t* lfields;
  struct epoll_event events[1024];
  struct epoll_event prep[256];
  struct uv__invalidate inv;
  struct epoll_event* pe;
  struct epoll_event e;
  struct uv__iou* ctl;
  struct uv__iou* iou;
  int real_timeout;
  struct uv__queue* q;
  uv__io_t* w;
  sigset_t* sigmask;
  sigset_t sigset;
  uint64_t base;
  int have_iou_events;
  int have_signals;
  int nevents;
  int epollfd;
  int count;
  int nfds;
  int fd;
  int op;
  int i;
  int user_timeout;
  int reset_timeout;

  lfields = uv__get_internal_fields(loop);
  ctl = &lfields->ctl;
  iou = &lfields->iou;

  /* Optionally block SIGPROF while inside epoll_pwait(), see
   * UV_LOOP_BLOCK_SIGPROF. */
  sigmask = NULL;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGPROF);
    sigmask = &sigset;
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48; /* Benchmarks suggest this gives the best throughput. */
  real_timeout = timeout;

  /* With idle-time metrics enabled, do a first non-blocking pass so that
   * provider entry/exit times bracket the actual wait; the caller's timeout
   * is restored after the first wakeup. */
  if (lfields->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
    user_timeout = 0;
  }

  epollfd = loop->backend_fd;

  memset(&e, 0, sizeof(e));

  /* Register or update every watcher queued since the last poll. */
  while (!uv__queue_empty(&loop->watcher_queue)) {
    q = uv__queue_head(&loop->watcher_queue);
    w = uv__queue_data(q, uv__io_t, watcher_queue);
    uv__queue_remove(q);
    uv__queue_init(q);

    /* w->events == 0 means the fd is not yet registered with epoll. */
    op = EPOLL_CTL_MOD;
    if (w->events == 0)
      op = EPOLL_CTL_ADD;

    w->events = w->pevents;
    e.events = w->pevents;
    e.data.fd = w->fd;
    fd = w->fd;

    if (ctl->ringfd != -1) {
      uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e);
      continue;
    }

    if (!epoll_ctl(epollfd, op, fd, &e))
      continue;

    assert(op == EPOLL_CTL_ADD);
    assert(errno == EEXIST);

    /* File descriptor that's been watched before, update event mask. */
    if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e))
      abort();
  }

  /* Published via lfields->inv so uv__platform_invalidate_fd() can blank
   * out in-flight events for fds that get closed during a callback. */
  inv.events = events;
  inv.prep = &prep;
  inv.nfds = -1;

  for (;;) {
    /* Nothing to wait for: no watched fds and no in-flight io_uring ops. */
    if (loop->nfds == 0)
      if (iou->in_flight == 0)
        break;

    /* All event mask mutations should be visible to the kernel before
     * we enter epoll_pwait().
     */
    if (ctl->ringfd != -1)
      while (*ctl->sqhead != *ctl->sqtail)
        uv__epoll_ctl_flush(epollfd, ctl, &prep);

    /* Only need to set the provider_entry_time if timeout != 0. The function
     * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
     */
    if (timeout != 0)
      uv__metrics_set_provider_entry_time(loop);

    /* Store the current timeout in a location that's globally accessible so
     * other locations like uv__work_done() can determine whether the queue
     * of events in the callback were waiting when poll was called.
     */
    lfields->current_timeout = timeout;

    nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);

    /* Update loop->time unconditionally. It's tempting to skip the update when
     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
     * operating system didn't reschedule our process while in the syscall.
     */
    SAVE_ERRNO(uv__update_time(loop));

    if (nfds == -1)
      assert(errno == EINTR);
    else if (nfds == 0)
      /* Unlimited timeout should only return with events or signal. */
      assert(timeout != -1);

    if (nfds == 0 || nfds == -1) {
      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      } else if (nfds == 0) {
        return;
      }

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_iou_events = 0;
    have_signals = 0;
    nevents = 0;

    inv.nfds = nfds;
    lfields->inv = &inv;

    for (i = 0; i < nfds; i++) {
      pe = events + i;
      fd = pe->data.fd;

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;

      /* A readable ring fd means io_uring completions are pending. */
      if (fd == iou->ringfd) {
        uv__poll_io_uring(loop, iou);
        have_iou_events = 1;
        continue;
      }

      assert(fd >= 0);
      assert((unsigned) fd < loop->nwatchers);

      w = loop->watchers[fd];

      if (w == NULL) {
        /* File descriptor that we've stopped watching, disarm it.
         *
         * Ignore all errors because we may be racing with another thread
         * when the file descriptor is closed.
         *
         * Perform EPOLL_CTL_DEL immediately instead of going through
         * io_uring's submit queue, otherwise the file descriptor may
         * be closed by the time the kernel starts the operation.
         */
        epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
        continue;
      }

      /* Give users only events they're interested in. Prevents spurious
       * callbacks when previous callback invocation in this loop has stopped
       * the current watcher. Also, filters out events that users has not
       * requested us to watch.
       */
      pe->events &= w->pevents | POLLERR | POLLHUP;

      /* Work around an epoll quirk where it sometimes reports just the
       * EPOLLERR or EPOLLHUP event. In order to force the event loop to
       * move forward, we merge in the read/write events that the watcher
       * is interested in; uv__read() and uv__write() will then deal with
       * the error or hangup in the usual fashion.
       *
       * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
       * reads the available data, calls uv_read_stop(), then sometime later
       * calls uv_read_start() again. By then, libuv has forgotten about the
       * hangup and the kernel won't report EPOLLIN again because there's
       * nothing left to read. If anything, libuv is to blame here. The
       * current hack is just a quick bandaid; to properly fix it, libuv
       * needs to remember the error/hangup event. We should get that for
       * free when we switch over to edge-triggered I/O.
       */
      if (pe->events == POLLERR || pe->events == POLLHUP)
        pe->events |=
          w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);

      if (pe->events != 0) {
        /* Run signal watchers last. This also affects child process watchers
         * because those are implemented in terms of signal watchers.
         */
        if (w == &loop->signal_io_watcher) {
          have_signals = 1;
        } else {
          uv__metrics_update_idle_time(loop);
          w->cb(loop, w, pe->events);
        }

        nevents++;
      }
    }

    uv__metrics_inc_events(loop, nevents);
    if (reset_timeout != 0) {
      timeout = user_timeout;
      reset_timeout = 0;
      uv__metrics_inc_events_waiting(loop, nevents);
    }

    if (have_signals != 0) {
      uv__metrics_update_idle_time(loop);
      loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
    }

    lfields->inv = NULL;

    if (have_iou_events != 0)
      break; /* Event loop should cycle now so don't poll again. */

    if (have_signals != 0)
      break; /* Event loop should cycle now so don't poll again. */

    if (nevents != 0) {
      if (nfds == ARRAY_SIZE(events) && --count != 0) {
        /* Poll for more events but don't block this time. */
        timeout = 0;
        continue;
      }
      break;
    }

update_timeout:
    if (timeout == 0)
      break;

    if (timeout == -1)
      continue;

    assert(timeout > 0);

    /* Deduct the time already spent waiting from the caller's timeout. */
    real_timeout -= (loop->time - base);
    if (real_timeout <= 0)
      break;

    timeout = real_timeout;
  }

  /* Flush any epoll_ctl operations queued after the last epoll_pwait(). */
  if (ctl->ringfd != -1)
    while (*ctl->sqhead != *ctl->sqtail)
      uv__epoll_ctl_flush(epollfd, ctl, &prep);
}
1621
/* Monotonic time in nanoseconds. For UV_CLOCK_FAST the clock id to use is
 * probed once and cached in fast_clock_id.
 */
uint64_t uv__hrtime(uv_clocktype_t type) {
  static _Atomic clock_t fast_clock_id = -1;
  struct timespec t;
  clock_t clock_id;

  /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
   * millisecond granularity or better. CLOCK_MONOTONIC_COARSE is
   * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
   * decide to make a costly system call.
   */
  /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
   * when it has microsecond granularity or better (unlikely).
   */
  clock_id = CLOCK_MONOTONIC;
  if (type != UV_CLOCK_FAST)
    goto done;

  clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed);
  if (clock_id != -1)
    goto done;

  /* First call: probe the coarse clock's resolution. Concurrent callers
   * may race to this point but they all derive the same clock id from
   * clock_getres(), so relaxed ordering suffices. */
  clock_id = CLOCK_MONOTONIC;
  if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t))
    if (t.tv_nsec <= 1 * 1000 * 1000)
      clock_id = CLOCK_MONOTONIC_COARSE;

  atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed);

done:

  if (clock_gettime(clock_id, &t))
    return 0; /* Not really possible. */

  return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
}
1657
1658
1659 int uv_resident_set_memory(size_t* rss) {
1660 char buf[1024];
1661 const char* s;
1662 long val;
1663 int rc;
1664 int i;
1665
1666 /* rss: 24th element */
1667 rc = uv__slurp("/proc/self/stat", buf, sizeof(buf));
1668 if (rc < 0)
1669 return rc;
1670
1671 /* find the last ')' */
1672 s = strrchr(buf, ')');
1673 if (s == NULL)
1674 goto err;
1675
1676 for (i = 1; i <= 22; i++) {
1677 s = strchr(s + 1, ' ');
1678 if (s == NULL)
1679 goto err;
1680 }
1681
1682 errno = 0;
1683 val = strtol(s, NULL, 10);
1684 if (val < 0 || errno != 0)
1685 goto err;
1686
1687 *rss = val * getpagesize();
1688 return 0;
1689
1690 err:
1691 return UV_EINVAL;
1692 }
1693
1694 int uv_uptime(double* uptime) {
1695 struct timespec now;
1696 char buf[128];
1697
1698 /* Consult /proc/uptime when present (common case), or fall back to
1699 * clock_gettime. Why not always clock_gettime? It doesn't always return the
1700 * right result under OpenVZ and possibly other containerized environments.
1701 */
1702 if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf)))
1703 if (1 == sscanf(buf, "%lf", uptime))
1704 return 0;
1705
1706 if (clock_gettime(CLOCK_BOOTTIME, &now))
1707 return UV__ERR(errno);
1708
1709 *uptime = now.tv_sec;
1710 return 0;
1711 }
1712
1713
/* Collect per-CPU model names, clock speeds and usage times from
 * /proc/stat, /proc/cpuinfo and sysfs. On success *ci points to a single
 * allocation holding both the result array and the model-name strings.
 */
int uv_cpu_info(uv_cpu_info_t** ci, int* count) {
#if defined(__PPC__)
  static const char model_marker[] = "cpu\t\t: ";
  static const char model_marker2[] = "";
#elif defined(__arm__)
  static const char model_marker[] = "model name\t: ";
  static const char model_marker2[] = "Processor\t: ";
#elif defined(__aarch64__)
  static const char model_marker[] = "CPU part\t: ";
  static const char model_marker2[] = "";
#elif defined(__mips__)
  static const char model_marker[] = "cpu model\t\t: ";
  static const char model_marker2[] = "";
#elif defined(__loongarch__)
  static const char model_marker[] = "cpu family\t\t: ";
  static const char model_marker2[] = "";
#else
  static const char model_marker[] = "model name\t: ";
  static const char model_marker2[] = "";
#endif
  /* arm64 lookup table mapping "CPU part" hex codes to model names; the
   * "<code>\n<name>\n" layout lets it be searched with memmem() below. */
  static const char parts[] =
#ifdef __aarch64__
    "0x811\nARM810\n" "0x920\nARM920\n" "0x922\nARM922\n"
    "0x926\nARM926\n" "0x940\nARM940\n" "0x946\nARM946\n"
    "0x966\nARM966\n" "0xa20\nARM1020\n" "0xa22\nARM1022\n"
    "0xa26\nARM1026\n" "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n"
    "0xb56\nARM1156\n" "0xb76\nARM1176\n" "0xc05\nCortex-A5\n"
    "0xc07\nCortex-A7\n" "0xc08\nCortex-A8\n" "0xc09\nCortex-A9\n"
    "0xc0d\nCortex-A17\n" /* Originally A12 */
    "0xc0f\nCortex-A15\n" "0xc0e\nCortex-A17\n" "0xc14\nCortex-R4\n"
    "0xc15\nCortex-R5\n" "0xc17\nCortex-R7\n" "0xc18\nCortex-R8\n"
    "0xc20\nCortex-M0\n" "0xc21\nCortex-M1\n" "0xc23\nCortex-M3\n"
    "0xc24\nCortex-M4\n" "0xc27\nCortex-M7\n" "0xc60\nCortex-M0+\n"
    "0xd01\nCortex-A32\n" "0xd03\nCortex-A53\n" "0xd04\nCortex-A35\n"
    "0xd05\nCortex-A55\n" "0xd06\nCortex-A65\n" "0xd07\nCortex-A57\n"
    "0xd08\nCortex-A72\n" "0xd09\nCortex-A73\n" "0xd0a\nCortex-A75\n"
    "0xd0b\nCortex-A76\n" "0xd0c\nNeoverse-N1\n" "0xd0d\nCortex-A77\n"
    "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n" "0xd20\nCortex-M23\n"
    "0xd21\nCortex-M33\n" "0xd41\nCortex-A78\n" "0xd42\nCortex-A78AE\n"
    "0xd4a\nNeoverse-E1\n" "0xd4b\nCortex-A78C\n"
#endif
    "";
  struct cpu {
    unsigned long long freq, user, nice, sys, idle, irq;
    unsigned model;  /* Index into the models[] table below. */
  };
  FILE* fp;
  char* p;
  int found;
  int n;
  unsigned i;
  unsigned cpu;
  unsigned maxcpu;
  unsigned size;
  unsigned long long skip;
  struct cpu (*cpus)[8192]; /* Kernel maximum. */
  struct cpu* c;
  struct cpu t;
  char (*model)[64];
  /* One bit per cpu id; ids can be sparse when cpus are offline. */
  unsigned char bitmap[ARRAY_SIZE(*cpus) / 8];
  /* Assumption: even big.LITTLE systems will have only a handful
   * of different CPU models. Most systems will just have one.
   */
  char models[8][64];
  char buf[1024];

  memset(bitmap, 0, sizeof(bitmap));
  memset(models, 0, sizeof(models));
  snprintf(*models, sizeof(*models), "unknown");
  maxcpu = 0;

  cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus));
  if (cpus == NULL)
    return UV_ENOMEM;

  fp = uv__open_file("/proc/stat");
  if (fp == NULL) {
    uv__free(cpus);
    return UV__ERR(errno);
  }

  /* Skip the aggregate "cpu ..." line at the top of /proc/stat. */
  if (NULL == fgets(buf, sizeof(buf), fp))
    abort();

  for (;;) {
    memset(&t, 0, sizeof(t));

    n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu",
               &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq);

    if (n != 7)
      break;

    /* Consume the remainder of the line. */
    if (NULL == fgets(buf, sizeof(buf), fp))
      abort();

    if (cpu >= ARRAY_SIZE(*cpus))
      continue;

    (*cpus)[cpu] = t;

    /* Mark this cpu id as present. */
    bitmap[cpu >> 3] |= 1 << (cpu & 7);

    if (cpu >= maxcpu)
      maxcpu = cpu + 1;
  }

  fclose(fp);

  fp = uv__open_file("/proc/cpuinfo");
  if (fp == NULL)
    goto nocpuinfo;

  for (;;) {
    if (1 != fscanf(fp, "processor\t: %u\n", &cpu))
      break; /* Parse error. */

    /* Scan this processor's stanza for a model-name line. */
    while (fgets(buf, sizeof(buf), fp)) {
      if (!strncmp(buf, model_marker, sizeof(model_marker) - 1)) {
        p = buf + sizeof(model_marker) - 1;
        goto parts;
      }
      if (!*model_marker2)
        continue;
      if (!strncmp(buf, model_marker2, sizeof(model_marker2) - 1)) {
        p = buf + sizeof(model_marker2) - 1;
        goto parts;
      }
    }

    goto next; /* Not found. */

parts:
    n = (int) strcspn(p, "\n");

    /* arm64: translate CPU part code to model name. */
    if (*parts) {
      p = memmem(parts, sizeof(parts) - 1, p, n + 1);
      if (p == NULL)
        p = "unknown";
      else
        p += n + 1;
      n = (int) strcspn(p, "\n");
    }

    /* Find this model in the dedup table. An unused slot has length zero
     * and therefore always matches, claiming the entry. */
    found = 0;
    for (model = models; !found && model < ARRAY_END(models); model++)
      found = !strncmp(p, *model, strlen(*model));

    if (!found)
      goto next;

    if (**model == '\0')
      snprintf(*model, sizeof(*model), "%.*s", n, p);

    if (cpu < maxcpu)
      (*cpus)[cpu].model = model - models;

next:
    /* Skip to the blank line that separates processor stanzas. */
    while (fgets(buf, sizeof(buf), fp))
      if (*buf == '\n')
        break;
  }

  fclose(fp);
  fp = NULL;

nocpuinfo:

  /* Read the current clock speed of each present cpu from sysfs; n ends up
   * as the number of present cpus. */
  n = 0;
  for (cpu = 0; cpu < maxcpu; cpu++) {
    if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
      continue;

    n++;
    snprintf(buf, sizeof(buf),
             "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu);

    fp = uv__open_file(buf);
    if (fp == NULL)
      continue;

    if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq))
      abort();
    fclose(fp);
    fp = NULL;
  }

  /* One allocation holds the result array followed by the model strings. */
  size = n * sizeof(**ci) + sizeof(models);
  *ci = uv__malloc(size);
  *count = 0;

  if (*ci == NULL) {
    uv__free(cpus);
    return UV_ENOMEM;
  }

  *count = n;
  p = memcpy(*ci + n, models, sizeof(models));

  i = 0;
  for (cpu = 0; cpu < maxcpu; cpu++) {
    if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
      continue;

    c = *cpus + cpu;

    (*ci)[i++] = (uv_cpu_info_t) {
      .model = p + c->model * sizeof(*model),
      .speed = c->freq / 1000,
      /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz,
       * therefore the multiplier is always 1000/100 = 10.
       */
      .cpu_times = (struct uv_cpu_times_s) {
        .user = 10 * c->user,
        .nice = 10 * c->nice,
        .sys = 10 * c->sys,
        .idle = 10 * c->idle,
        .irq = 10 * c->irq,
      },
    };
  }

  uv__free(cpus);

  return 0;
}
1941
1942
1943 static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
1944 if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)))
1945 return 1;
1946 if (ent->ifa_addr == NULL)
1947 return 1;
1948 /*
1949 * On Linux getifaddrs returns information related to the raw underlying
1950 * devices. We're not interested in this information yet.
1951 */
1952 if (ent->ifa_addr->sa_family == PF_PACKET)
1953 return exclude_type;
1954 return !exclude_type;
1955 }
1956
1957 /* TODO(bnoordhuis) share with bsd-ifaddrs.c */
1958 int uv_interface_addresses(uv_interface_address_t** addresses, int* count) {
1959 uv_interface_address_t* address;
1960 struct sockaddr_ll* sll;
1961 struct ifaddrs* addrs;
1962 struct ifaddrs* ent;
1963 size_t namelen;
1964 char* name;
1965 int i;
1966
1967 *count = 0;
1968 *addresses = NULL;
1969
1970 if (getifaddrs(&addrs))
1971 return UV__ERR(errno);
1972
1973 /* Count the number of interfaces */
1974 namelen = 0;
1975 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1976 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
1977 continue;
1978
1979 namelen += strlen(ent->ifa_name) + 1;
1980 (*count)++;
1981 }
1982
1983 if (*count == 0) {
1984 freeifaddrs(addrs);
1985 return 0;
1986 }
1987
1988 /* Make sure the memory is initiallized to zero using calloc() */
1989 *addresses = uv__calloc(1, *count * sizeof(**addresses) + namelen);
1990 if (*addresses == NULL) {
1991 freeifaddrs(addrs);
1992 return UV_ENOMEM;
1993 }
1994
1995 name = (char*) &(*addresses)[*count];
1996 address = *addresses;
1997
1998 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
1999 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
2000 continue;
2001
2002 namelen = strlen(ent->ifa_name) + 1;
2003 address->name = memcpy(name, ent->ifa_name, namelen);
2004 name += namelen;
2005
2006 if (ent->ifa_addr->sa_family == AF_INET6) {
2007 address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
2008 } else {
2009 address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
2010 }
2011
2012 if (ent->ifa_netmask->sa_family == AF_INET6) {
2013 address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
2014 } else {
2015 address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
2016 }
2017
2018 address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);
2019
2020 address++;
2021 }
2022
2023 /* Fill in physical addresses for each interface */
2024 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
2025 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS))
2026 continue;
2027
2028 address = *addresses;
2029
2030 for (i = 0; i < (*count); i++) {
2031 size_t namelen = strlen(ent->ifa_name);
2032 /* Alias interface share the same physical address */
2033 if (strncmp(address->name, ent->ifa_name, namelen) == 0 &&
2034 (address->name[namelen] == 0 || address->name[namelen] == ':')) {
2035 sll = (struct sockaddr_ll*)ent->ifa_addr;
2036 memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
2037 }
2038 address++;
2039 }
2040 }
2041
2042 freeifaddrs(addrs);
2043
2044 return 0;
2045 }
2046
2047
2048 /* TODO(bnoordhuis) share with bsd-ifaddrs.c */
/* Release the array returned by uv_interface_addresses(). The interface
 * name strings live in the same allocation as the array, so a single free
 * suffices; count is unused.
 */
void uv_free_interface_addresses(uv_interface_address_t* addresses,
                                 int count) {
  uv__free(addresses);
}
2053
2054
/* Set the kernel-visible process name (e.g. /proc/self/comm). Best-effort:
 * the prctl() return value is intentionally ignored.
 */
void uv__set_process_title(const char* title) {
#if defined(PR_SET_NAME)
  prctl(PR_SET_NAME, title); /* Only copies first 16 characters. */
#endif
}
2060
2061
/* Look up a field such as "MemTotal:" in /proc/meminfo and return its
 * value in bytes, or 0 when the file or the field cannot be read.
 */
static uint64_t uv__read_proc_meminfo(const char* what) {
  char buf[4096]; /* Large enough to hold all of /proc/meminfo. */
  char* field;
  uint64_t kb;

  if (uv__slurp("/proc/meminfo", buf, sizeof(buf)) != 0)
    return 0;

  field = strstr(buf, what);
  if (field == NULL)
    return 0;

  /* /proc/meminfo reports kilobytes; convert to bytes. */
  kb = 0;
  sscanf(field + strlen(what), "%" PRIu64 " kB", &kb);

  return kb * 1024;
}
2082
2083
/* Amount of free memory in bytes. Prefers /proc/meminfo's MemAvailable
 * field and falls back to sysinfo()'s freeram; returns 0 when neither
 * source can be read.
 */
uint64_t uv_get_free_memory(void) {
  struct sysinfo info;
  uint64_t available;

  available = uv__read_proc_meminfo("MemAvailable:");
  if (available != 0)
    return available;

  if (sysinfo(&info) != 0)
    return 0;

  return (uint64_t) info.freeram * info.mem_unit;
}
2098
2099
/* Total physical memory in bytes. Prefers /proc/meminfo's MemTotal field
 * and falls back to sysinfo()'s totalram; returns 0 when neither source
 * can be read.
 */
uint64_t uv_get_total_memory(void) {
  struct sysinfo info;
  uint64_t total;

  total = uv__read_proc_meminfo("MemTotal:");
  if (total != 0)
    return total;

  if (sysinfo(&info) != 0)
    return 0;

  return (uint64_t) info.totalram * info.mem_unit;
}
2114
2115
/* Read a decimal uint64 from a file. Returns 0 when the file cannot be
 * read or parsed; the literal "max" (cgroup2's "unlimited") maps to
 * UINT64_MAX.
 */
static uint64_t uv__read_uint64(const char* filename) {
  char buf[32]; /* Large enough to hold an encoded uint64_t. */
  uint64_t value;

  if (uv__slurp(filename, buf, sizeof(buf)) != 0)
    return 0;

  if (sscanf(buf, "%" PRIu64, &value) == 1)
    return value;

  if (strcmp(buf, "max\n") == 0)
    return UINT64_MAX;

  return 0;
}
2128
2129
/* Given a buffer with the contents of a cgroup1 /proc/self/cgroup,
 * finds the location and length of the memory controller mount path.
 * This disregards the leading / for easy concatenation of paths.
 * Returns NULL if the memory controller wasn't found. */
static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
                                                int* n) {
  char* line;

  /* Walk the ":"-prefixed controller fields line by line until one names
   * the memory controller. */
  line = strchr(buf, ':');
  while (line != NULL && strncmp(line, ":memory:", 8) != 0) {
    line = strchr(line, '\n');
    if (line != NULL)
      line = strchr(line, ':');
  }

  if (line == NULL)
    return NULL;

  /* Skip ":memory:/" and measure the mount path up to the newline. */
  line += strlen(":memory:/");
  *n = (int) strcspn(line, "\n");

  return line;
}
2154
/* Read the cgroup1 memory limits for the cgroup described by buf: the soft
 * limit into *high and the hard limit into *max. Falls back to the global
 * memory controller when the cgroup-specific files cannot be read. Values
 * equal to cgroup1's "unlimited" sentinel are normalized to UINT64_MAX.
 */
static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char filename[4097];
  char* p;
  int n;
  uint64_t cgroup1_max;

  /* Find out where the controller is mounted. */
  p = uv__cgroup1_find_memory_controller(buf, &n);
  if (p != NULL) {
    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", n, p);
    *high = uv__read_uint64(filename);

    snprintf(filename, sizeof(filename),
             "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", n, p);
    *max = uv__read_uint64(filename);

    /* If the controller wasn't mounted, the reads above will have failed,
     * as indicated by uv__read_uint64 returning 0.
     */
    if (*high != 0 && *max != 0)
      goto update_limits;  /* Skip the global fallback below. */
  }

  /* Fall back to the limits of the global memory controller. */
  *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
  *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");

  /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
   * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
   */
update_limits:
  cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
  if (*high == cgroup1_max)
    *high = UINT64_MAX;
  if (*max == cgroup1_max)
    *max = UINT64_MAX;
}
2194
/* Read the cgroup2 memory limits for the cgroup described by buf:
 * memory.high into *high and memory.max into *max.
 */
static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
                                          uint64_t* max) {
  char path[4097];
  char* cg;
  int len;

  /* The cgroup path follows the "0::/" prefix, up to the newline. */
  cg = buf + strlen("0::/");
  len = (int) strcspn(cg, "\n");

  /* Read the memory limits of the controller. */
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/memory.max", len, cg);
  *max = uv__read_uint64(path);
  snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/memory.high", len, cg);
  *high = uv__read_uint64(path);
}
2211
/* Return the effective cgroup memory limit in bytes: the smaller of the
 * high and max limits, or 0 when no limit could be determined.
 */
static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
  uint64_t high;
  uint64_t max;

  /* A cgroup2 /proc/self/cgroup consists of a single "0::/..." entry. */
  if (strncmp(buf, "0::/", 4) == 0)
    uv__get_cgroup2_memory_limits(buf, &high, &max);
  else
    uv__get_cgroup1_memory_limits(buf, &high, &max);

  /* Zero means the limit could not be read. */
  if (high == 0 || max == 0)
    return 0;

  return high < max ? high : max;
}
2227
/* Memory limit imposed on this process by its cgroup, in bytes;
 * 0 when unknown or unlimited.
 */
uint64_t uv_get_constrained_memory(void) {
  char cgroup[1024];

  if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) != 0)
    return 0;

  return uv__get_cgroup_constrained_memory(cgroup);
}
2236
2237
/* Current memory usage in bytes for a cgroup1 setup, read from the
 * cgroup's memory.usage_in_bytes, falling back to the global controller.
 */
static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
  char path[4097];
  uint64_t usage;
  char* cg;
  int len;

  /* Find out where the controller is mounted. */
  cg = uv__cgroup1_find_memory_controller(buf, &len);
  if (cg != NULL) {
    snprintf(path, sizeof(path),
             "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", len, cg);
    usage = uv__read_uint64(path);

    /* A zero result means the read failed, e.g. because the controller
     * wasn't mounted at that path. */
    if (usage != 0)
      return usage;
  }

  /* Fall back to the usage of the global memory controller. */
  return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
}
2261
/* Current memory usage in bytes for a cgroup2 setup, read from the
 * cgroup's memory.current file.
 */
static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
  char path[4097];
  char* cg;
  int len;

  /* The cgroup path follows the "0::/" prefix, up to the newline. */
  cg = buf + strlen("0::/");
  len = (int) strcspn(cg, "\n");

  snprintf(path, sizeof(path),
           "/sys/fs/cgroup/%.*s/memory.current", len, cg);
  return uv__read_uint64(path);
}
2275
/* Memory still available to this process in bytes, honoring cgroup limits
 * when one applies, otherwise equivalent to uv_get_free_memory().
 */
uint64_t uv_get_available_memory(void) {
  char cgroup[1024];
  uint64_t constrained;
  uint64_t current;

  if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) != 0)
    return 0;

  /* No meaningful cgroup limit: report plain free memory. */
  constrained = uv__get_cgroup_constrained_memory(cgroup);
  if (constrained == 0 || constrained > uv_get_total_memory())
    return uv_get_free_memory();

  /* In the case of cgroupv2, we'll only have a single entry. */
  if (strncmp(cgroup, "0::/", 4) == 0)
    current = uv__get_cgroup2_current_memory(cgroup);
  else
    current = uv__get_cgroup1_current_memory(cgroup);

  /* memory usage can be higher than the limit (for short bursts of time) */
  if (current > constrained)
    return 0;

  return constrained - current;
}
2305
2306
2307 static int uv__get_cgroupv2_constrained_cpu(const char* cgroup,
2308 long long* quota) {
2309 static const char cgroup_mount[] = "/sys/fs/cgroup";
2310 const char* cgroup_trimmed;
2311 char buf[1024];
2312 char full_path[256];
2313 char path[256];
2314 char quota_buf[16];
2315 char* last_slash;
2316 int cgroup_size;
2317 long long limit;
2318 long long min_quota;
2319 long long period;
2320
2321 if (strncmp(cgroup, "0::/", 4) != 0)
2322 return UV_EINVAL;
2323
2324 /* Trim ending \n by replacing it with a 0 */
2325 cgroup_trimmed = cgroup + sizeof("0::/") - 1; /* Skip the prefix "0::/" */
2326 cgroup_size = (int)strcspn(cgroup_trimmed, "\n"); /* Find the first \n */
2327 min_quota = LLONG_MAX;
2328
2329 /* Construct the path to the cpu.max files */
2330 snprintf(path, sizeof(path), "%s/%.*s/cgroup.controllers", cgroup_mount,
2331 cgroup_size, cgroup_trimmed);
2332
2333 /* Read controllers, if not exists, not really a cgroup */
2334 if (uv__slurp(path, buf, sizeof(buf)) < 0)
2335 return UV_EIO;
2336
2337 snprintf(path, sizeof(path), "%s/%.*s", cgroup_mount, cgroup_size,
2338 cgroup_trimmed);
2339
2340 /*
2341 * Traverse up the cgroup v2 hierarchy, starting from the current cgroup path.
2342 * At each level, attempt to read the "cpu.max" file, which defines the CPU
2343 * quota and period.
2344 *
2345 * This reflects how Linux applies cgroup limits hierarchically.
2346 *
2347 * e.g: given a path like /sys/fs/cgroup/foo/bar/baz, we check:
2348 * - /sys/fs/cgroup/foo/bar/baz/cpu.max
2349 * - /sys/fs/cgroup/foo/bar/cpu.max
2350 * - /sys/fs/cgroup/foo/cpu.max
2351 * - /sys/fs/cgroup/cpu.max
2352 */
2353 while (strncmp(path, cgroup_mount, strlen(cgroup_mount)) == 0) {
2354 snprintf(full_path, sizeof(full_path), "%s/cpu.max", path);
2355
2356 /* Silently ignore and continue if the file does not exist */
2357 if (uv__slurp(full_path, quota_buf, sizeof(quota_buf)) < 0)
2358 goto next;
2359
2360 /* No limit, move on */
2361 if (strncmp(quota_buf, "max", 3) == 0)
2362 goto next;
2363
2364 /* Read cpu.max */
2365 if (sscanf(quota_buf, "%lld %lld", &limit, &period) != 2)
2366 goto next;
2367
2368 /* Can't divide by 0 */
2369 if (period == 0)
2370 goto next;
2371
2372 *quota = limit / period;
2373 if (*quota < min_quota)
2374 min_quota = *quota;
2375
2376 next:
2377 /* Move up one level in the cgroup hierarchy by trimming the last path.
2378 * The loop ends once we reach the cgroup root mount point.
2379 */
2380 last_slash = strrchr(path, '/');
2381 if (last_slash == NULL || strcmp(path, cgroup_mount) == 0)
2382 break;
2383 *last_slash = '\0';
2384 }
2385
2386 return 0;
2387 }
2388
/* Locate the cpu controller entry in a cgroup v1 /proc/self/cgroup dump.
 * Returns a pointer just past the ":cpu," marker and stores the length of
 * the remainder of that line (up to the newline) in *cgroup_size, or
 * returns NULL (leaving *cgroup_size untouched) when no entry is found.
 */
static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
                                             int* cgroup_size) {
  static const char marker[] = ":cpu,";
  char* p;

  /* Seek to the line that names the cpu controller. */
  p = strstr(cgroup, marker);
  if (p == NULL)
    return NULL;

  /* Step past the marker and measure up to, but excluding, the newline. */
  p += sizeof(marker) - 1;
  *cgroup_size = (int) strcspn(p, "\n");

  return p;
}
2403
2404 static int uv__get_cgroupv1_constrained_cpu(const char* cgroup,
2405 long long* quota) {
2406 char path[256];
2407 char buf[1024];
2408 int cgroup_size;
2409 char* cgroup_cpu;
2410 long long period_length;
2411 long long quota_per_period;
2412
2413 cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size);
2414
2415 if (cgroup_cpu == NULL)
2416 return UV_EIO;
2417
2418 /* Construct the path to the cpu.cfs_quota_us file */
2419 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us",
2420 cgroup_size, cgroup_cpu);
2421
2422 /* Read cpu.cfs_quota_us */
2423 if (uv__slurp(path, buf, sizeof(buf)) < 0)
2424 return UV_EIO;
2425
2426 if (sscanf(buf, "%lld", &quota_per_period) != 1)
2427 return UV_EINVAL;
2428
2429 /* Construct the path to the cpu.cfs_period_us file */
2430 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us",
2431 cgroup_size, cgroup_cpu);
2432
2433 /* Read cpu.cfs_period_us */
2434 if (uv__slurp(path, buf, sizeof(buf)) < 0)
2435 return UV_EIO;
2436
2437 if (sscanf(buf, "%lld", &period_length) != 1)
2438 return UV_EINVAL;
2439
2440 /* Can't divide by 0 */
2441 if (period_length == 0)
2442 return UV_EINVAL;
2443
2444 *quota = quota_per_period / period_length;
2445
2446 return 0;
2447 }
2448
2449 int uv__get_constrained_cpu(long long* quota) {
2450 char cgroup[1024];
2451
2452 /* Read the cgroup from /proc/self/cgroup */
2453 if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0)
2454 return UV_EIO;
2455
2456 /* Check if the system is using cgroup v2 by examining /proc/self/cgroup
2457 * The entry for cgroup v2 is always in the format "0::$PATH"
2458 * see https://docs.kernel.org/admin-guide/cgroup-v2.html */
2459 if (strncmp(cgroup, "0::/", 4) == 0)
2460 return uv__get_cgroupv2_constrained_cpu(cgroup, quota);
2461 else
2462 return uv__get_cgroupv1_constrained_cpu(cgroup, quota);
2463 }
2464
2465
2466 void uv_loadavg(double avg[3]) {
2467 struct sysinfo info;
2468 char buf[128]; /* Large enough to hold all of /proc/loadavg. */
2469
2470 if (0 == uv__slurp("/proc/loadavg", buf, sizeof(buf)))
2471 if (3 == sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]))
2472 return;
2473
2474 if (sysinfo(&info) < 0)
2475 return;
2476
2477 avg[0] = (double) info.loads[0] / 65536.0;
2478 avg[1] = (double) info.loads[1] / 65536.0;
2479 avg[2] = (double) info.loads[2] / 65536.0;
2480 }
2481
2482
2483 static int compare_watchers(const struct watcher_list* a,
2484 const struct watcher_list* b) {
2485 if (a->wd < b->wd) return -1;
2486 if (a->wd > b->wd) return 1;
2487 return 0;
2488 }
2489
2490
2491 static int init_inotify(uv_loop_t* loop) {
2492 int err;
2493 int fd;
2494
2495 if (loop->inotify_fd != -1)
2496 return 0;
2497
2498 fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
2499 if (fd < 0)
2500 return UV__ERR(errno);
2501
2502 err = uv__io_init_start(loop, &loop->inotify_read_watcher, uv__inotify_read,
2503 fd, POLLIN);
2504 if (err) {
2505 uv__close(fd);
2506 return err;
2507 }
2508
2509 loop->inotify_fd = fd;
2510 return 0;
2511 }
2512
2513
/* Re-arm every inotify watcher against the loop's fresh inotify fd (per the
 * name, used after fork). Works in two passes: first stop every handle while
 * saving a private copy of its path, then restart each one, which registers
 * it with the new fd. Returns 0 on success or the first uv_fs_event_start
 * error.
 */
static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
  /* Open the inotify_fd, and re-arm all the inotify watchers. */
  int err;
  struct watcher_list* tmp_watcher_list_iter;
  struct watcher_list* watcher_list;
  struct watcher_list tmp_watcher_list;
  struct uv__queue queue;
  struct uv__queue* q;
  uv_fs_event_t* handle;
  char* tmp_path;

  if (root == NULL)
    return 0;

  /* We must restore the old watcher list to be able to close items
   * out of it.
   */
  loop->inotify_watchers = root;

  uv__queue_init(&tmp_watcher_list.watchers);
  /* Note that the queue we use is shared with the start and stop()
   * functions, making uv__queue_foreach unsafe to use. So we use the
   * uv__queue_move trick to safely iterate. Also don't free the watcher
   * list until we're done iterating. c.f. uv__inotify_read.
   */
  RB_FOREACH_SAFE(watcher_list, watcher_root,
                  uv__inotify_watchers(loop), tmp_watcher_list_iter) {
    watcher_list->iterating = 1;
    uv__queue_move(&watcher_list->watchers, &queue);
    while (!uv__queue_empty(&queue)) {
      q = uv__queue_head(&queue);
      handle = uv__queue_data(q, uv_fs_event_t, watchers);
      /* It's critical to keep a copy of path here, because it
       * will be set to NULL by stop() and then deallocated by
       * maybe_free_watcher_list
       */
      tmp_path = uv__strdup(handle->path);
      assert(tmp_path != NULL);
      uv__queue_remove(q);
      uv__queue_insert_tail(&watcher_list->watchers, q);
      uv_fs_event_stop(handle);

      /* Park the stopped handle (carrying its saved path) on the temporary
       * list for the restart pass below. */
      uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
      handle->path = tmp_path;
    }
    /* Done iterating this list; now it may be freed if it became empty. */
    watcher_list->iterating = 0;
    maybe_free_watcher_list(watcher_list, loop);
  }

  /* Second pass: restart every parked handle. uv_fs_event_start copies the
   * path into the new watcher list, so the strdup'd copy is freed here. */
  uv__queue_move(&tmp_watcher_list.watchers, &queue);
  while (!uv__queue_empty(&queue)) {
    q = uv__queue_head(&queue);
    uv__queue_remove(q);
    handle = uv__queue_data(q, uv_fs_event_t, watchers);
    tmp_path = handle->path;
    handle->path = NULL;
    err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
    uv__free(tmp_path);
    if (err)
      return err;
  }

  return 0;
}
2578
2579
2580 static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) {
2581 struct watcher_list w;
2582 w.wd = wd;
2583 return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w);
2584 }
2585
2586
2587 static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) {
2588 /* if the watcher_list->watchers is being iterated over, we can't free it. */
2589 if ((!w->iterating) && uv__queue_empty(&w->watchers)) {
2590 /* No watchers left for this path. Clean up. */
2591 RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w);
2592 inotify_rm_watch(loop->inotify_fd, w->wd);
2593 uv__free(w);
2594 }
2595 }
2596
2597
/* Drain all pending events from the loop's inotify fd and dispatch them to
 * the uv_fs_event_t handles registered for each watch descriptor. Invoked
 * as the io watcher callback for loop->inotify_fd (the `dummy`/`events`
 * parameters from that interface are ignored; `events` is reused as a
 * scratch variable for the per-event flags below).
 */
static void uv__inotify_read(uv_loop_t* loop,
                             uv__io_t* dummy,
                             unsigned int events) {
  const struct inotify_event* e;
  struct watcher_list* w;
  uv_fs_event_t* h;
  struct uv__queue queue;
  struct uv__queue* q;
  const char* path;
  ssize_t size;
  const char *p;
  /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
  char buf[4096];

  for (;;) {
    /* Retry reads interrupted by signals. */
    do
      size = read(loop->inotify_fd, buf, sizeof(buf));
    while (size == -1 && errno == EINTR);

    /* The fd is non-blocking; EAGAIN/EWOULDBLOCK means the kernel queue
     * has been drained and we're done. */
    if (size == -1) {
      assert(errno == EAGAIN || errno == EWOULDBLOCK);
      break;
    }

    assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */

    /* Now we have one or more inotify_event structs. */
    for (p = buf; p < buf + size; p += sizeof(*e) + e->len) {
      e = (const struct inotify_event*) p;

      /* Map the inotify mask onto libuv's two event categories:
       * content/attribute changes vs. everything else (renames, deletes,
       * creations, ...). */
      events = 0;
      if (e->mask & (IN_ATTRIB|IN_MODIFY))
        events |= UV_CHANGE;
      if (e->mask & ~(IN_ATTRIB|IN_MODIFY))
        events |= UV_RENAME;

      w = find_watcher(loop, e->wd);
      if (w == NULL)
        continue; /* Stale event, no watchers left. */

      /* inotify does not return the filename when monitoring a single file
       * for modifications. Repurpose the filename for API compatibility.
       * I'm not convinced this is a good thing, maybe it should go.
       */
      path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path);

      /* We're about to iterate over the queue and call user's callbacks.
       * What can go wrong?
       * A callback could call uv_fs_event_stop()
       * and the queue can change under our feet.
       * So, we use uv__queue_move() trick to safely iterate over the queue.
       * And we don't free the watcher_list until we're done iterating.
       *
       * First,
       * tell uv_fs_event_stop() (that could be called from a user's callback)
       * not to free watcher_list.
       */
      w->iterating = 1;
      uv__queue_move(&w->watchers, &queue);
      while (!uv__queue_empty(&queue)) {
        q = uv__queue_head(&queue);
        h = uv__queue_data(q, uv_fs_event_t, watchers);

        /* Re-attach the handle to the live list before invoking the
         * callback, so a stop() from inside the callback finds it. */
        uv__queue_remove(q);
        uv__queue_insert_tail(&w->watchers, q);

        h->cb(h, path, events, 0);
      }
      /* done iterating, time to (maybe) free empty watcher_list */
      w->iterating = 0;
      maybe_free_watcher_list(w, loop);
    }
  }
}
2672
2673
2674 int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
2675 uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
2676 return 0;
2677 }
2678
2679
2680 int uv_fs_event_start(uv_fs_event_t* handle,
2681 uv_fs_event_cb cb,
2682 const char* path,
2683 unsigned int flags) {
2684 struct watcher_list* w;
2685 uv_loop_t* loop;
2686 size_t len;
2687 int events;
2688 int err;
2689 int wd;
2690
2691 if (uv__is_active(handle))
2692 return UV_EINVAL;
2693
2694 loop = handle->loop;
2695
2696 err = init_inotify(loop);
2697 if (err)
2698 return err;
2699
2700 events = IN_ATTRIB
2701 | IN_CREATE
2702 | IN_MODIFY
2703 | IN_DELETE
2704 | IN_DELETE_SELF
2705 | IN_MOVE_SELF
2706 | IN_MOVED_FROM
2707 | IN_MOVED_TO;
2708
2709 wd = inotify_add_watch(loop->inotify_fd, path, events);
2710 if (wd == -1)
2711 return UV__ERR(errno);
2712
2713 w = find_watcher(loop, wd);
2714 if (w)
2715 goto no_insert;
2716
2717 len = strlen(path) + 1;
2718 w = uv__malloc(sizeof(*w) + len);
2719 if (w == NULL)
2720 return UV_ENOMEM;
2721
2722 w->wd = wd;
2723 w->path = memcpy(w + 1, path, len);
2724 uv__queue_init(&w->watchers);
2725 w->iterating = 0;
2726 RB_INSERT(watcher_root, uv__inotify_watchers(loop), w);
2727
2728 no_insert:
2729 uv__handle_start(handle);
2730 uv__queue_insert_tail(&w->watchers, &handle->watchers);
2731 handle->path = w->path;
2732 handle->cb = cb;
2733 handle->wd = wd;
2734
2735 return 0;
2736 }
2737
2738
2739 int uv_fs_event_stop(uv_fs_event_t* handle) {
2740 struct watcher_list* w;
2741
2742 if (!uv__is_active(handle))
2743 return 0;
2744
2745 w = find_watcher(handle->loop, handle->wd);
2746 assert(w != NULL);
2747
2748 handle->wd = -1;
2749 handle->path = NULL;
2750 uv__handle_stop(handle);
2751 uv__queue_remove(&handle->watchers);
2752
2753 maybe_free_watcher_list(w, handle->loop);
2754
2755 return 0;
2756 }
2757
2758
2759 void uv__fs_event_close(uv_fs_event_t* handle) {
2760 uv_fs_event_stop(handle);
2761 }