/* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* We lean on the fact that POLL{IN,OUT,ERR,HUP} correspond with their
 * EPOLL* counterparts. We use the POLL* variants in this file because that
 * is what libuv uses elsewhere.
 */

#include "uv.h"
#include "internal.h"

#include <inttypes.h>
#include <stdatomic.h>
#include <stddef.h> /* offsetof */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <errno.h>

#include <fcntl.h>
#include <ifaddrs.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netpacket/packet.h>
#include <sys/epoll.h>
#include <sys/inotify.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/sysinfo.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <time.h>
#include <unistd.h>

#ifndef __NR_io_uring_setup
# define __NR_io_uring_setup 425
#endif

#ifndef __NR_io_uring_enter
# define __NR_io_uring_enter 426
#endif

#ifndef __NR_io_uring_register
# define __NR_io_uring_register 427
#endif

#ifndef __NR_copy_file_range
# if defined(__x86_64__)
#  define __NR_copy_file_range 326
# elif defined(__i386__)
#  define __NR_copy_file_range 377
# elif defined(__s390__)
#  define __NR_copy_file_range 375
# elif defined(__arm__)
#  define __NR_copy_file_range 391
# elif defined(__aarch64__)
#  define __NR_copy_file_range 285
# elif defined(__powerpc__)
#  define __NR_copy_file_range 379
# elif defined(__arc__)
#  define __NR_copy_file_range 285
# elif defined(__riscv)
#  define __NR_copy_file_range 285
# endif
#endif /* __NR_copy_file_range */

#ifndef __NR_statx
# if defined(__x86_64__)
#  define __NR_statx 332
# elif defined(__i386__)
#  define __NR_statx 383
# elif defined(__aarch64__)
#  define __NR_statx 397
# elif defined(__arm__)
#  define __NR_statx 397
# elif defined(__ppc__)
#  define __NR_statx 383
# elif defined(__s390__)
#  define __NR_statx 379
# elif defined(__riscv)
#  define __NR_statx 291
# endif
#endif /* __NR_statx */

#ifndef __NR_getrandom
# if defined(__x86_64__)
#  define __NR_getrandom 318
# elif defined(__i386__)
#  define __NR_getrandom 355
# elif defined(__aarch64__)
#  define __NR_getrandom 384
# elif defined(__arm__)
#  define __NR_getrandom 384
# elif defined(__ppc__)
#  define __NR_getrandom 359
# elif defined(__s390__)
#  define __NR_getrandom 349
# elif defined(__riscv)
#  define __NR_getrandom 278
# endif
#endif /* __NR_getrandom */

enum {
  UV__IORING_SETUP_SQPOLL = 2u,
  UV__IORING_SETUP_NO_SQARRAY = 0x10000u,
};

enum {
  UV__IORING_FEAT_SINGLE_MMAP = 1u,
  UV__IORING_FEAT_NODROP = 2u,
  UV__IORING_FEAT_RSRC_TAGS = 1024u,  /* linux v5.13 */
};

enum {
  UV__IORING_OP_READV = 1,
  UV__IORING_OP_WRITEV = 2,
  UV__IORING_OP_FSYNC = 3,
  UV__IORING_OP_OPENAT = 18,
  UV__IORING_OP_CLOSE = 19,
  UV__IORING_OP_STATX = 21,
  UV__IORING_OP_EPOLL_CTL = 29,
  UV__IORING_OP_RENAMEAT = 35,
  UV__IORING_OP_UNLINKAT = 36,
  UV__IORING_OP_MKDIRAT = 37,
  UV__IORING_OP_SYMLINKAT = 38,
  UV__IORING_OP_LINKAT = 39,
  UV__IORING_OP_FTRUNCATE = 55,
};

enum {
  UV__IORING_ENTER_GETEVENTS = 1u,
  UV__IORING_ENTER_SQ_WAKEUP = 2u,
};

enum {
  UV__IORING_SQ_NEED_WAKEUP = 1u,
  UV__IORING_SQ_CQ_OVERFLOW = 2u,
};

struct uv__io_cqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t overflow;
  uint32_t cqes;
  uint64_t reserved0;
  uint64_t reserved1;
};

STATIC_ASSERT(40 == sizeof(struct uv__io_cqring_offsets));

struct uv__io_sqring_offsets {
  uint32_t head;
  uint32_t tail;
  uint32_t ring_mask;
  uint32_t ring_entries;
  uint32_t flags;
  uint32_t dropped;
  uint32_t array;
  uint32_t reserved0;
  uint64_t reserved1;
};

STATIC_ASSERT(40 == sizeof(struct uv__io_sqring_offsets));

struct uv__io_uring_cqe {
  uint64_t user_data;
  int32_t res;
  uint32_t flags;
};

STATIC_ASSERT(16 == sizeof(struct uv__io_uring_cqe));

struct uv__io_uring_sqe {
  uint8_t opcode;
  uint8_t flags;
  uint16_t ioprio;
  int32_t fd;
  union {
    uint64_t off;
    uint64_t addr2;
  };
  union {
    uint64_t addr;
  };
  uint32_t len;
  union {
    uint32_t rw_flags;
    uint32_t fsync_flags;
    uint32_t open_flags;
    uint32_t statx_flags;
  };
  uint64_t user_data;
  union {
    uint16_t buf_index;
    uint64_t pad[3];
  };
};

STATIC_ASSERT(64 == sizeof(struct uv__io_uring_sqe));
STATIC_ASSERT(0 == offsetof(struct uv__io_uring_sqe, opcode));
STATIC_ASSERT(1 == offsetof(struct uv__io_uring_sqe, flags));
STATIC_ASSERT(2 == offsetof(struct uv__io_uring_sqe, ioprio));
STATIC_ASSERT(4 == offsetof(struct uv__io_uring_sqe, fd));
STATIC_ASSERT(8 == offsetof(struct uv__io_uring_sqe, off));
STATIC_ASSERT(16 == offsetof(struct uv__io_uring_sqe, addr));
STATIC_ASSERT(24 == offsetof(struct uv__io_uring_sqe, len));
STATIC_ASSERT(28 == offsetof(struct uv__io_uring_sqe, rw_flags));
STATIC_ASSERT(32 == offsetof(struct uv__io_uring_sqe, user_data));
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_sqe, buf_index));

struct uv__io_uring_params {
  uint32_t sq_entries;
  uint32_t cq_entries;
  uint32_t flags;
  uint32_t sq_thread_cpu;
  uint32_t sq_thread_idle;
  uint32_t features;
  uint32_t reserved[4];
  struct uv__io_sqring_offsets sq_off;  /* 40 bytes */
  struct uv__io_cqring_offsets cq_off;  /* 40 bytes */
};

STATIC_ASSERT(40 + 40 + 40 == sizeof(struct uv__io_uring_params));
STATIC_ASSERT(40 == offsetof(struct uv__io_uring_params, sq_off));
STATIC_ASSERT(80 == offsetof(struct uv__io_uring_params, cq_off));

STATIC_ASSERT(EPOLL_CTL_ADD < 4);
STATIC_ASSERT(EPOLL_CTL_DEL < 4);
STATIC_ASSERT(EPOLL_CTL_MOD < 4);

struct watcher_list {
  RB_ENTRY(watcher_list) entry;
  struct uv__queue watchers;
  int iterating;
  char* path;
  int wd;
};

struct watcher_root {
  struct watcher_list* rbh_root;
};

static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root);
static void uv__inotify_read(uv_loop_t* loop,
                             uv__io_t* w,
                             unsigned int revents);
static int compare_watchers(const struct watcher_list* a,
                            const struct watcher_list* b);
static void maybe_free_watcher_list(struct watcher_list* w,
                                    uv_loop_t* loop);

static void uv__epoll_ctl_flush(int epollfd,
                                struct uv__iou* ctl,
                                struct epoll_event (*events)[256]);

static void uv__epoll_ctl_prep(int epollfd,
                               struct uv__iou* ctl,
                               struct epoll_event (*events)[256],
                               int op,
                               int fd,
                               struct epoll_event* e);

RB_GENERATE_STATIC(watcher_root, watcher_list, entry, compare_watchers)


static struct watcher_root* uv__inotify_watchers(uv_loop_t* loop) {
  /* This cast works because watcher_root is a struct with a pointer as its
   * sole member. Such type punning is unsafe in the presence of strict
   * pointer aliasing (and is just plain nasty) but that is why libuv
   * is compiled with -fno-strict-aliasing.
   */
  return (struct watcher_root*) &loop->inotify_watchers;
}


unsigned uv__kernel_version(void) {
  static _Atomic unsigned cached_version;
  struct utsname u;
  unsigned version;
  unsigned major;
  unsigned minor;
  unsigned patch;
  char v_sig[256];
  char* needle;

  version = atomic_load_explicit(&cached_version, memory_order_relaxed);
  if (version != 0)
    return version;

  /* Check /proc/version_signature first as it's the way to get the mainline
   * kernel version in Ubuntu. The format is:
   *   Ubuntu ubuntu_kernel_version mainline_kernel_version
   * For example:
   *   Ubuntu 5.15.0-79.86-generic 5.15.111
   */
  if (0 == uv__slurp("/proc/version_signature", v_sig, sizeof(v_sig)))
    if (3 == sscanf(v_sig, "Ubuntu %*s %u.%u.%u", &major, &minor, &patch))
      goto calculate_version;

  if (-1 == uname(&u))
    return 0;

  /* In Debian we need to check `version` instead of `release` to extract the
   * mainline kernel version. This is an example of what it looks like:
   *   #1 SMP Debian 5.10.46-4 (2021-08-03)
   */
  needle = strstr(u.version, "Debian ");
  if (needle != NULL)
    if (3 == sscanf(needle, "Debian %u.%u.%u", &major, &minor, &patch))
      goto calculate_version;

  if (3 != sscanf(u.release, "%u.%u.%u", &major, &minor, &patch))
    return 0;

  /* Handle it when the process runs under the UNAME26 personality:
   *
   * - kernels >= 3.x identify as 2.6.40+x
   * - kernels >= 4.x identify as 2.6.60+x
   *
   * UNAME26 is a poorly conceived hack that doesn't let us distinguish
   * between 4.x kernels and 5.x/6.x kernels so we conservatively assume
   * that 2.6.60+x means 4.x.
   *
   * Fun fact of the day: it's technically possible to observe the actual
   * kernel version for a brief moment because uname() first copies out the
   * real release string before overwriting it with the backcompat string.
   */
  if (major == 2 && minor == 6) {
    if (patch >= 60) {
      major = 4;
      minor = patch - 60;
      patch = 0;
    } else if (patch >= 40) {
      major = 3;
      minor = patch - 40;
      patch = 0;
    }
  }

calculate_version:
  version = major * 65536 + minor * 256 + patch;
  atomic_store_explicit(&cached_version, version, memory_order_relaxed);

  return version;
}
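
/* Added note: uv__kernel_version() packs "major.minor.patch" into a single
 * unsigned value as major * 65536 + minor * 256 + patch, which is what the
 * hex constants sprinkled through this file encode. A couple of illustrative
 * sanity checks (the concrete versions are just examples):
 */
STATIC_ASSERT(0x050ABA == 5 * 65536 + 10 * 256 + 186);  /* 5.10.186 */
STATIC_ASSERT(0x060600 == 6 * 65536 + 6 * 256 + 0);     /* 6.6.0 */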


ssize_t
uv__fs_copy_file_range(int fd_in,
                       off_t* off_in,
                       int fd_out,
                       off_t* off_out,
                       size_t len,
                       unsigned int flags)
{
#ifdef __NR_copy_file_range
  return syscall(__NR_copy_file_range,
                 fd_in,
                 off_in,
                 fd_out,
                 off_out,
                 len,
                 flags);
#else
  return errno = ENOSYS, -1;
#endif
}


int uv__statx(int dirfd,
              const char* path,
              int flags,
              unsigned int mask,
              struct uv__statx* statxbuf) {
#if !defined(__NR_statx) || defined(__ANDROID_API__) && __ANDROID_API__ < 30
  return errno = ENOSYS, -1;
#else
  int rc;

  rc = syscall(__NR_statx, dirfd, path, flags, mask, statxbuf);
  if (rc >= 0)
    uv__msan_unpoison(statxbuf, sizeof(*statxbuf));

  return rc;
#endif
}


ssize_t uv__getrandom(void* buf, size_t buflen, unsigned flags) {
#if !defined(__NR_getrandom) || defined(__ANDROID_API__) && __ANDROID_API__ < 28
  return errno = ENOSYS, -1;
#else
  ssize_t rc;

  rc = syscall(__NR_getrandom, buf, buflen, flags);
  if (rc >= 0)
    uv__msan_unpoison(buf, buflen);

  return rc;
#endif
}


int uv__io_uring_setup(int entries, struct uv__io_uring_params* params) {
  return syscall(__NR_io_uring_setup, entries, params);
}


int uv__io_uring_enter(int fd,
                       unsigned to_submit,
                       unsigned min_complete,
                       unsigned flags) {
  /* io_uring_enter used to take a sigset_t but it's unused
   * in newer kernels unless IORING_ENTER_EXT_ARG is set,
   * in which case it takes a struct io_uring_getevents_arg.
   */
  return syscall(__NR_io_uring_enter,
                 fd,
                 to_submit,
                 min_complete,
                 flags,
                 NULL,
                 0L);
}


int uv__io_uring_register(int fd, unsigned opcode, void* arg, unsigned nargs) {
  return syscall(__NR_io_uring_register, fd, opcode, arg, nargs);
}


static int uv__use_io_uring(uint32_t flags) {
#if defined(__ANDROID_API__)
  return 0;  /* Possibly available but blocked by seccomp. */
#elif defined(__arm__) && __SIZEOF_POINTER__ == 4
  /* See https://github.com/libuv/libuv/issues/4158. */
  return 0;  /* All 32-bit kernels appear buggy. */
#elif defined(__powerpc64__) || defined(__ppc64__)
  /* See https://github.com/libuv/libuv/issues/4283. */
  return 0;  /* Random SIGSEGV in signal handler. */
#else
  /* Ternary: unknown=0, yes=1, no=-1 */
  static _Atomic int use_io_uring;
  char* val;
  int use;

#if defined(__hppa__)
  /* io_uring first supported on parisc in 6.1, functional in .51
   * https://lore.kernel.org/all/[email protected]/
   */
  if (uv__kernel_version() < /*6.1.51*/0x060133)
    return 0;
#endif

  /* SQPOLL is all kinds of buggy but epoll batching should work fine. */
  if (0 == (flags & UV__IORING_SETUP_SQPOLL))
    return 1;

  /* Older kernels have a bug where the sqpoll thread uses 100% CPU. */
  if (uv__kernel_version() < /*5.10.186*/0x050ABA)
    return 0;

  use = atomic_load_explicit(&use_io_uring, memory_order_relaxed);

  if (use == 0) {
    val = getenv("UV_USE_IO_URING");
    use = val != NULL && atoi(val) > 0 ? 1 : -1;
    atomic_store_explicit(&use_io_uring, use, memory_order_relaxed);
  }

  return use > 0;
#endif
}
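
/* Added usage note: with the code above, the SQPOLL ring is strictly opt-in;
 * a process that wants it must run with the environment variable set to a
 * positive number, e.g. (illustrative):
 *
 *   $ UV_USE_IO_URING=1 ./app
 *
 * The epoll-batching ring created in uv__platform_loop_init() takes the
 * "return 1" path before the variable is ever consulted, so it is unaffected
 * by UV_USE_IO_URING in this code path.
 */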


static void uv__iou_init(int epollfd,
                         struct uv__iou* iou,
                         uint32_t entries,
                         uint32_t flags) {
  struct uv__io_uring_params params;
  struct epoll_event e;
  size_t cqlen;
  size_t sqlen;
  size_t maxlen;
  size_t sqelen;
  unsigned kernel_version;
  uint32_t* sqarray;
  uint32_t i;
  char* sq;
  char* sqe;
  int ringfd;
  int no_sqarray;

  sq = MAP_FAILED;
  sqe = MAP_FAILED;

  if (!uv__use_io_uring(flags))
    return;

  kernel_version = uv__kernel_version();
  no_sqarray =
      UV__IORING_SETUP_NO_SQARRAY * (kernel_version >= /* 6.6 */0x060600);

  /* SQPOLL required CAP_SYS_NICE until linux v5.12 relaxed that requirement.
   * Mostly academic because we check for a v5.13 kernel afterwards anyway.
   */
  memset(&params, 0, sizeof(params));
  params.flags = flags | no_sqarray;

  if (flags & UV__IORING_SETUP_SQPOLL)
    params.sq_thread_idle = 10;  /* milliseconds */

  /* Kernel returns a file descriptor with O_CLOEXEC flag set. */
  ringfd = uv__io_uring_setup(entries, &params);
  if (ringfd == -1)
    return;

  /* IORING_FEAT_RSRC_TAGS is used to detect linux v5.13 but what we're
   * actually detecting is whether IORING_OP_STATX works with SQPOLL.
   */
  if (!(params.features & UV__IORING_FEAT_RSRC_TAGS))
    goto fail;

  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
  if (!(params.features & UV__IORING_FEAT_SINGLE_MMAP))
    goto fail;

  /* Implied by IORING_FEAT_RSRC_TAGS but checked explicitly anyway. */
  if (!(params.features & UV__IORING_FEAT_NODROP))
    goto fail;

  sqlen = params.sq_off.array + params.sq_entries * sizeof(uint32_t);
  cqlen =
      params.cq_off.cqes + params.cq_entries * sizeof(struct uv__io_uring_cqe);
  maxlen = sqlen < cqlen ? cqlen : sqlen;
  sqelen = params.sq_entries * sizeof(struct uv__io_uring_sqe);

  sq = mmap(0,
            maxlen,
            PROT_READ | PROT_WRITE,
            MAP_SHARED | MAP_POPULATE,
            ringfd,
            0);  /* IORING_OFF_SQ_RING */

  sqe = mmap(0,
             sqelen,
             PROT_READ | PROT_WRITE,
             MAP_SHARED | MAP_POPULATE,
             ringfd,
             0x10000000ull);  /* IORING_OFF_SQES */

  if (sq == MAP_FAILED || sqe == MAP_FAILED)
    goto fail;

  if (flags & UV__IORING_SETUP_SQPOLL) {
    /* Only interested in completion events. To get notified when
     * the kernel pulls items from the submission ring, add POLLOUT.
     */
    memset(&e, 0, sizeof(e));
    e.events = POLLIN;
    e.data.fd = ringfd;

    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ringfd, &e))
      goto fail;
  }

  iou->sqhead = (uint32_t*) (sq + params.sq_off.head);
  iou->sqtail = (uint32_t*) (sq + params.sq_off.tail);
  iou->sqmask = *(uint32_t*) (sq + params.sq_off.ring_mask);
  iou->sqflags = (uint32_t*) (sq + params.sq_off.flags);
  iou->cqhead = (uint32_t*) (sq + params.cq_off.head);
  iou->cqtail = (uint32_t*) (sq + params.cq_off.tail);
  iou->cqmask = *(uint32_t*) (sq + params.cq_off.ring_mask);
  iou->sq = sq;
  iou->cqe = sq + params.cq_off.cqes;
  iou->sqe = sqe;
  iou->sqlen = sqlen;
  iou->cqlen = cqlen;
  iou->maxlen = maxlen;
  iou->sqelen = sqelen;
  iou->ringfd = ringfd;
  iou->in_flight = 0;

  if (no_sqarray)
    return;

  sqarray = (uint32_t*) (sq + params.sq_off.array);
  for (i = 0; i <= iou->sqmask; i++)
    sqarray[i] = i;  /* Slot -> sqe identity mapping. */

  return;

fail:
  if (sq != MAP_FAILED)
    munmap(sq, maxlen);

  if (sqe != MAP_FAILED)
    munmap(sqe, sqelen);

  uv__close(ringfd);
}
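
/* Added note: because uv__iou_init() insists on IORING_FEAT_SINGLE_MMAP, the
 * submission and completion rings share the one mapping made at offset
 * IORING_OFF_SQ_RING, which is why the cq* pointers above are derived from
 * `sq` rather than from a second mapping; only the SQE array needs its own
 * mmap() at IORING_OFF_SQES.
 */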


static void uv__iou_delete(struct uv__iou* iou) {
  if (iou->ringfd > -1) {
    munmap(iou->sq, iou->maxlen);
    munmap(iou->sqe, iou->sqelen);
    uv__close(iou->ringfd);
    iou->ringfd = -1;
  }
}


int uv__platform_loop_init(uv_loop_t* loop) {
  uv__loop_internal_fields_t* lfields;

  lfields = uv__get_internal_fields(loop);
  lfields->ctl.ringfd = -1;
  lfields->iou.ringfd = -2;  /* "uninitialized" */

  loop->inotify_watchers = NULL;
  loop->inotify_fd = -1;
  loop->backend_fd = epoll_create1(O_CLOEXEC);

  if (loop->backend_fd == -1)
    return UV__ERR(errno);

  uv__iou_init(loop->backend_fd, &lfields->ctl, 256, 0);

  return 0;
}


int uv__io_fork(uv_loop_t* loop) {
  int err;
  struct watcher_list* root;

  root = uv__inotify_watchers(loop)->rbh_root;

  uv__close(loop->backend_fd);
  loop->backend_fd = -1;

  /* TODO(bnoordhuis) Loses items from the submission and completion rings. */
  uv__platform_loop_delete(loop);

  err = uv__platform_loop_init(loop);
  if (err)
    return err;

  return uv__inotify_fork(loop, root);
}


void uv__platform_loop_delete(uv_loop_t* loop) {
  uv__loop_internal_fields_t* lfields;

  lfields = uv__get_internal_fields(loop);
  uv__iou_delete(&lfields->ctl);
  uv__iou_delete(&lfields->iou);

  if (loop->inotify_fd != -1) {
    uv__io_stop(loop, &loop->inotify_read_watcher, POLLIN);
    uv__close(loop->inotify_fd);
    loop->inotify_fd = -1;
  }
}


struct uv__invalidate {
  struct epoll_event (*prep)[256];
  struct epoll_event* events;
  int nfds;
};


void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
  uv__loop_internal_fields_t* lfields;
  struct uv__invalidate* inv;
  struct epoll_event dummy;
  int i;

  lfields = uv__get_internal_fields(loop);
  inv = lfields->inv;

  /* Invalidate events with same file descriptor */
  if (inv != NULL)
    for (i = 0; i < inv->nfds; i++)
      if (inv->events[i].data.fd == fd)
        inv->events[i].data.fd = -1;

  /* Remove the file descriptor from the epoll.
   * This avoids a problem where the same file description remains open
   * in another process, causing repeated junk epoll events.
   *
   * Perform EPOLL_CTL_DEL immediately instead of going through
   * io_uring's submit queue, otherwise the file descriptor may
   * be closed by the time the kernel starts the operation.
   *
   * We pass in a dummy epoll_event, to work around a bug in old kernels.
   *
   * Work around a bug in kernels 3.10 to 3.19 where passing a struct that
   * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
   */
  memset(&dummy, 0, sizeof(dummy));
  epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
}


int uv__io_check_fd(uv_loop_t* loop, int fd) {
  struct epoll_event e;
  int rc;

  memset(&e, 0, sizeof(e));
  e.events = POLLIN;
  e.data.fd = -1;

  rc = 0;
  if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
    if (errno != EEXIST)
      rc = UV__ERR(errno);

  if (rc == 0)
    if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
      abort();

  return rc;
}


/* Caller must initialize SQE and call uv__iou_submit(). */
static struct uv__io_uring_sqe* uv__iou_get_sqe(struct uv__iou* iou,
                                                uv_loop_t* loop,
                                                uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  uint32_t head;
  uint32_t tail;
  uint32_t mask;
  uint32_t slot;

  /* Lazily create the ring. State machine: -2 means uninitialized, -1 means
   * initialization failed. Anything else is a valid ring file descriptor.
   */
  if (iou->ringfd == -2) {
    /* By default, the SQPOLL is not created. Enable only if the loop is
     * configured with UV_LOOP_USE_IO_URING_SQPOLL and the UV_USE_IO_URING
     * environment variable is unset or a positive number.
     */
    if (loop->flags & UV_LOOP_ENABLE_IO_URING_SQPOLL)
      if (uv__use_io_uring(UV__IORING_SETUP_SQPOLL))
        uv__iou_init(loop->backend_fd, iou, 64, UV__IORING_SETUP_SQPOLL);

    if (iou->ringfd == -2)
      iou->ringfd = -1;  /* "failed" */
  }

  if (iou->ringfd == -1)
    return NULL;

  head = atomic_load_explicit((_Atomic uint32_t*) iou->sqhead,
                              memory_order_acquire);
  tail = *iou->sqtail;
  mask = iou->sqmask;

  if ((head & mask) == ((tail + 1) & mask))
    return NULL;  /* No room in ring buffer. TODO(bnoordhuis) maybe flush it? */

  slot = tail & mask;
  sqe = iou->sqe;
  sqe = &sqe[slot];
  memset(sqe, 0, sizeof(*sqe));
  sqe->user_data = (uintptr_t) req;

  /* Pacify uv_cancel(). */
  req->work_req.loop = loop;
  req->work_req.work = NULL;
  req->work_req.done = NULL;
  uv__queue_init(&req->work_req.wq);

  uv__req_register(loop);
  iou->in_flight++;

  return sqe;
}


static void uv__iou_submit(struct uv__iou* iou) {
  uint32_t flags;

  atomic_store_explicit((_Atomic uint32_t*) iou->sqtail,
                        *iou->sqtail + 1,
                        memory_order_release);

  flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
                               memory_order_acquire);

  if (flags & UV__IORING_SQ_NEED_WAKEUP)
    if (uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_SQ_WAKEUP))
      if (errno != EOWNERDEAD)  /* Kernel bug. Harmless, ignore. */
        perror("libuv: io_uring_enter(wakeup)");  /* Can't happen. */
}
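
/* Added illustration of the "ring full" test in uv__iou_get_sqe(): the SQ
 * ring deliberately keeps one slot free, so with 64 entries (mask 63) and,
 * say, head == 10, a tail of 73 means 63 submissions are outstanding and the
 * next slot would collide with the head. The concrete numbers are
 * hypothetical.
 */
STATIC_ASSERT((10 & 63) == ((73 + 1) & 63));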


int uv__iou_fs_close(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;
  int kv;

  kv = uv__kernel_version();
  /* Work around a poorly understood bug in older kernels where closing a file
   * descriptor pointing to /foo/bar results in ETXTBSY errors when trying to
   * execve("/foo/bar") later on. The bug seems to have been fixed somewhere
   * between 5.15.85 and 5.15.90. I couldn't pinpoint the responsible commit
   * but good candidates are the several data race fixes. Interestingly, it
   * seems to manifest only when running under Docker so the possibility of
   * a Docker bug can't be completely ruled out either. Yay, computers.
   * Also, disable on non-longterm versions between 5.16.0 (non-longterm) and
   * 6.1.0 (longterm). Starting with longterm 6.1.x, the issue seems to be
   * solved.
   */
  if (kv < /* 5.15.90 */ 0x050F5A)
    return 0;

  if (kv >= /* 5.16.0 */ 0x051000 && kv < /* 6.1.0 */ 0x060100)
    return 0;


  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->fd = req->file;
  sqe->opcode = UV__IORING_OP_CLOSE;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_ftruncate(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  if (uv__kernel_version() < /* 6.9 */0x060900)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->fd = req->file;
  sqe->len = req->off;
  sqe->opcode = UV__IORING_OP_FTRUNCATE;
  uv__iou_submit(iou);

  return 1;
}

int uv__iou_fs_fsync_or_fdatasync(uv_loop_t* loop,
                                  uv_fs_t* req,
                                  uint32_t fsync_flags) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  /* Little known fact: setting sqe->off and sqe->len turns
   * it into an asynchronous sync_file_range() operation.
   */
  sqe->fd = req->file;
  sqe->fsync_flags = fsync_flags;
  sqe->opcode = UV__IORING_OP_FSYNC;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_link(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  if (uv__kernel_version() < /* 5.15.0 */0x050F00)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->addr2 = (uintptr_t) req->new_path;
  sqe->len = AT_FDCWD;
  sqe->opcode = UV__IORING_OP_LINKAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_mkdir(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  if (uv__kernel_version() < /* 5.15.0 */0x050F00)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->len = req->mode;
  sqe->opcode = UV__IORING_OP_MKDIRAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_open(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->len = req->mode;
  sqe->opcode = UV__IORING_OP_OPENAT;
  sqe->open_flags = req->flags | O_CLOEXEC;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_rename(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->addr2 = (uintptr_t) req->new_path;
  sqe->len = AT_FDCWD;
  sqe->opcode = UV__IORING_OP_RENAMEAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_symlink(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  if (uv__kernel_version() < /* 5.15.0 */0x050F00)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;
  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->addr2 = (uintptr_t) req->new_path;
  sqe->opcode = UV__IORING_OP_SYMLINKAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_unlink(uv_loop_t* loop, uv_fs_t* req) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->path;
  sqe->fd = AT_FDCWD;
  sqe->opcode = UV__IORING_OP_UNLINKAT;

  uv__iou_submit(iou);

  return 1;
}


int uv__iou_fs_read_or_write(uv_loop_t* loop,
                             uv_fs_t* req,
                             int is_read) {
  struct uv__io_uring_sqe* sqe;
  struct uv__iou* iou;

  /* If iovcnt is greater than IOV_MAX, cap it to IOV_MAX on reads and fall
   * back to the threadpool on writes. */
  if (req->nbufs > IOV_MAX) {
    if (is_read)
      req->nbufs = IOV_MAX;
    else
      return 0;
  }

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL)
    return 0;

  sqe->addr = (uintptr_t) req->bufs;
  sqe->fd = req->file;
  sqe->len = req->nbufs;
  sqe->off = req->off < 0 ? -1 : req->off;
  sqe->opcode = is_read ? UV__IORING_OP_READV : UV__IORING_OP_WRITEV;

  uv__iou_submit(iou);

  return 1;
}
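
/* Added note: the sqe->off assignment above mirrors libuv's uv_fs_read() and
 * uv_fs_write() convention where a negative offset means "use the current
 * file position"; the corresponding kernel-side interpretation of offset -1
 * for IORING_OP_READV/IORING_OP_WRITEV is assumed here rather than spelled
 * out in this file.
 */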


int uv__iou_fs_statx(uv_loop_t* loop,
                     uv_fs_t* req,
                     int is_fstat,
                     int is_lstat) {
  struct uv__io_uring_sqe* sqe;
  struct uv__statx* statxbuf;
  struct uv__iou* iou;

  statxbuf = uv__malloc(sizeof(*statxbuf));
  if (statxbuf == NULL)
    return 0;

  iou = &uv__get_internal_fields(loop)->iou;

  sqe = uv__iou_get_sqe(iou, loop, req);
  if (sqe == NULL) {
    uv__free(statxbuf);
    return 0;
  }

  req->ptr = statxbuf;

  sqe->addr = (uintptr_t) req->path;
  sqe->addr2 = (uintptr_t) statxbuf;
  sqe->fd = AT_FDCWD;
  sqe->len = 0xFFF;  /* STATX_BASIC_STATS + STATX_BTIME */
  sqe->opcode = UV__IORING_OP_STATX;

  if (is_fstat) {
    sqe->addr = (uintptr_t) "";
    sqe->fd = req->file;
    sqe->statx_flags |= 0x1000;  /* AT_EMPTY_PATH */
  }

  if (is_lstat)
    sqe->statx_flags |= AT_SYMLINK_NOFOLLOW;

  uv__iou_submit(iou);

  return 1;
}
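
/* Added sanity check for the 0xFFF request mask above, assuming the values
 * from linux/stat.h: STATX_BASIC_STATS is 0x7FF and STATX_BTIME is 0x800, so
 * their union is exactly the mask passed in sqe->len.
 */
STATIC_ASSERT(0xFFF == (0x7FF | 0x800));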


void uv__statx_to_stat(const struct uv__statx* statxbuf, uv_stat_t* buf) {
  buf->st_dev = makedev(statxbuf->stx_dev_major, statxbuf->stx_dev_minor);
  buf->st_mode = statxbuf->stx_mode;
  buf->st_nlink = statxbuf->stx_nlink;
  buf->st_uid = statxbuf->stx_uid;
  buf->st_gid = statxbuf->stx_gid;
  buf->st_rdev = makedev(statxbuf->stx_rdev_major, statxbuf->stx_rdev_minor);
  buf->st_ino = statxbuf->stx_ino;
  buf->st_size = statxbuf->stx_size;
  buf->st_blksize = statxbuf->stx_blksize;
  buf->st_blocks = statxbuf->stx_blocks;
  buf->st_atim.tv_sec = statxbuf->stx_atime.tv_sec;
  buf->st_atim.tv_nsec = statxbuf->stx_atime.tv_nsec;
  buf->st_mtim.tv_sec = statxbuf->stx_mtime.tv_sec;
  buf->st_mtim.tv_nsec = statxbuf->stx_mtime.tv_nsec;
  buf->st_ctim.tv_sec = statxbuf->stx_ctime.tv_sec;
  buf->st_ctim.tv_nsec = statxbuf->stx_ctime.tv_nsec;
  buf->st_birthtim.tv_sec = statxbuf->stx_btime.tv_sec;
  buf->st_birthtim.tv_nsec = statxbuf->stx_btime.tv_nsec;
  buf->st_flags = 0;
  buf->st_gen = 0;
}


static void uv__iou_fs_statx_post(uv_fs_t* req) {
  struct uv__statx* statxbuf;
  uv_stat_t* buf;

  buf = &req->statbuf;
  statxbuf = req->ptr;
  req->ptr = NULL;

  if (req->result == 0) {
    uv__msan_unpoison(statxbuf, sizeof(*statxbuf));
    uv__statx_to_stat(statxbuf, buf);
    req->ptr = buf;
  }

  uv__free(statxbuf);
}


static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
  struct uv__io_uring_cqe* cqe;
  struct uv__io_uring_cqe* e;
  uv_fs_t* req;
  uint32_t head;
  uint32_t tail;
  uint32_t mask;
  uint32_t i;
  uint32_t flags;
  int nevents;
  int rc;

  head = *iou->cqhead;
  tail = atomic_load_explicit((_Atomic uint32_t*) iou->cqtail,
                              memory_order_acquire);
  mask = iou->cqmask;
  cqe = iou->cqe;
  nevents = 0;

  for (i = head; i != tail; i++) {
    e = &cqe[i & mask];

    req = (uv_fs_t*) (uintptr_t) e->user_data;
    assert(req->type == UV_FS);

    uv__req_unregister(loop);
    iou->in_flight--;

    /* If the op is not supported by the kernel retry using the thread pool */
    if (e->res == -EOPNOTSUPP) {
      uv__fs_post(loop, req);
      continue;
    }

    /* io_uring stores error codes as negative numbers, same as libuv. */
    req->result = e->res;

    switch (req->fs_type) {
      case UV_FS_FSTAT:
      case UV_FS_LSTAT:
      case UV_FS_STAT:
        uv__iou_fs_statx_post(req);
        break;
      default:  /* Squelch -Wswitch warnings. */
        break;
    }

    uv__metrics_update_idle_time(loop);
    req->cb(req);
    nevents++;
  }

  atomic_store_explicit((_Atomic uint32_t*) iou->cqhead,
                        tail,
                        memory_order_release);

  /* Check whether CQEs overflowed; if so, enter the kernel to make them
   * available. Don't grab them immediately but in the next loop iteration to
   * avoid loop starvation. */
  flags = atomic_load_explicit((_Atomic uint32_t*) iou->sqflags,
                               memory_order_acquire);

  if (flags & UV__IORING_SQ_CQ_OVERFLOW) {
    do
      rc = uv__io_uring_enter(iou->ringfd, 0, 0, UV__IORING_ENTER_GETEVENTS);
    while (rc == -1 && errno == EINTR);

    if (rc < 0)
      perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */
  }

  uv__metrics_inc_events(loop, nevents);
  if (uv__get_internal_fields(loop)->current_timeout == 0)
    uv__metrics_inc_events_waiting(loop, nevents);
}


/* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
 * executed immediately, otherwise the file descriptor may have been closed
 * by the time the kernel starts the operation.
 */
static void uv__epoll_ctl_prep(int epollfd,
                               struct uv__iou* ctl,
                               struct epoll_event (*events)[256],
                               int op,
                               int fd,
                               struct epoll_event* e) {
  struct uv__io_uring_sqe* sqe;
  struct epoll_event* pe;
  uint32_t mask;
  uint32_t slot;

  assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
  assert(ctl->ringfd != -1);

  mask = ctl->sqmask;
  slot = (*ctl->sqtail)++ & mask;

  pe = &(*events)[slot];
  *pe = *e;

  sqe = ctl->sqe;
  sqe = &sqe[slot];

  memset(sqe, 0, sizeof(*sqe));
  sqe->addr = (uintptr_t) pe;
  sqe->fd = epollfd;
  sqe->len = op;
  sqe->off = fd;
  sqe->opcode = UV__IORING_OP_EPOLL_CTL;
  sqe->user_data = op | slot << 2 | (int64_t) fd << 32;

  if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
    uv__epoll_ctl_flush(epollfd, ctl, events);
}
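
/* Added illustration of the user_data encoding above and its decoding in
 * uv__epoll_ctl_flush(): the epoll op sits in the low two bits, the prep[]
 * slot in the next eight, and the target fd in the high 32 bits. With the
 * hypothetical values op == EPOLL_CTL_ADD (1), slot == 5 and fd == 42:
 */
STATIC_ASSERT(42 == ((1 | 5 << 2 | (int64_t) 42 << 32) >> 32));
STATIC_ASSERT(1 == (3 & (1 | 5 << 2 | (int64_t) 42 << 32)));
STATIC_ASSERT(5 == (255 & ((1 | 5 << 2 | (int64_t) 42 << 32) >> 2)));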


static void uv__epoll_ctl_flush(int epollfd,
                                struct uv__iou* ctl,
                                struct epoll_event (*events)[256]) {
  struct epoll_event oldevents[256];
  struct uv__io_uring_cqe* cqe;
  uint32_t oldslot;
  uint32_t slot;
  uint32_t n;
  int fd;
  int op;
  int rc;

  STATIC_ASSERT(sizeof(oldevents) == sizeof(*events));
  assert(ctl->ringfd != -1);
  assert(*ctl->sqhead != *ctl->sqtail);

  n = *ctl->sqtail - *ctl->sqhead;
  do
    rc = uv__io_uring_enter(ctl->ringfd, n, n, UV__IORING_ENTER_GETEVENTS);
  while (rc == -1 && errno == EINTR);

  if (rc < 0)
    perror("libuv: io_uring_enter(getevents)");  /* Can't happen. */

  if (rc != (int) n)
    abort();

  assert(*ctl->sqhead == *ctl->sqtail);

  memcpy(oldevents, *events, sizeof(*events));

  /* Failed submissions are either EPOLL_CTL_DEL commands for file descriptors
   * that have been closed, or EPOLL_CTL_ADD commands for file descriptors
   * that we are already watching. Ignore the former and retry the latter
   * with EPOLL_CTL_MOD.
   */
  while (*ctl->cqhead != *ctl->cqtail) {
    slot = (*ctl->cqhead)++ & ctl->cqmask;

    cqe = ctl->cqe;
    cqe = &cqe[slot];

    if (cqe->res == 0)
      continue;

    fd = cqe->user_data >> 32;
    op = 3 & cqe->user_data;
    oldslot = 255 & (cqe->user_data >> 2);

    if (op == EPOLL_CTL_DEL)
      continue;

    if (op != EPOLL_CTL_ADD)
      abort();

    if (cqe->res != -EEXIST)
      abort();

    uv__epoll_ctl_prep(epollfd,
                       ctl,
                       events,
                       EPOLL_CTL_MOD,
                       fd,
                       &oldevents[oldslot]);
  }
}


void uv__io_poll(uv_loop_t* loop, int timeout) {
  uv__loop_internal_fields_t* lfields;
  struct epoll_event events[1024];
  struct epoll_event prep[256];
  struct uv__invalidate inv;
  struct epoll_event* pe;
  struct epoll_event e;
  struct uv__iou* ctl;
  struct uv__iou* iou;
  int real_timeout;
  struct uv__queue* q;
  uv__io_t* w;
  sigset_t* sigmask;
  sigset_t sigset;
  uint64_t base;
  int have_iou_events;
  int have_signals;
  int nevents;
  int epollfd;
  int count;
  int nfds;
  int fd;
  int op;
  int i;
  int user_timeout;
  int reset_timeout;

  lfields = uv__get_internal_fields(loop);
  ctl = &lfields->ctl;
  iou = &lfields->iou;

  sigmask = NULL;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGPROF);
    sigmask = &sigset;
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48;  /* Benchmarks suggest this gives the best throughput. */
  real_timeout = timeout;

  if (lfields->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
    user_timeout = 0;
  }

  epollfd = loop->backend_fd;

  memset(&e, 0, sizeof(e));

  while (!uv__queue_empty(&loop->watcher_queue)) {
    q = uv__queue_head(&loop->watcher_queue);
    w = uv__queue_data(q, uv__io_t, watcher_queue);
    uv__queue_remove(q);
    uv__queue_init(q);

    op = EPOLL_CTL_MOD;
    if (w->events == 0)
      op = EPOLL_CTL_ADD;

    w->events = w->pevents;
    e.events = w->pevents;
    e.data.fd = w->fd;
    fd = w->fd;

    if (ctl->ringfd != -1) {
      uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e);
      continue;
    }

    if (!epoll_ctl(epollfd, op, fd, &e))
      continue;

    assert(op == EPOLL_CTL_ADD);
    assert(errno == EEXIST);

    /* File descriptor that's been watched before, update event mask. */
    if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e))
      abort();
  }

  inv.events = events;
  inv.prep = &prep;
  inv.nfds = -1;

  for (;;) {
    if (loop->nfds == 0)
      if (iou->in_flight == 0)
        break;

    /* All event mask mutations should be visible to the kernel before
     * we enter epoll_pwait().
     */
    if (ctl->ringfd != -1)
      while (*ctl->sqhead != *ctl->sqtail)
        uv__epoll_ctl_flush(epollfd, ctl, &prep);

    /* Only need to set the provider_entry_time if timeout != 0. The function
     * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
     */
    if (timeout != 0)
      uv__metrics_set_provider_entry_time(loop);

    /* Store the current timeout in a location that's globally accessible so
     * other locations like uv__work_done() can determine whether the queue
     * of events in the callback were waiting when poll was called.
     */
    lfields->current_timeout = timeout;

    nfds = epoll_pwait(epollfd, events, ARRAY_SIZE(events), timeout, sigmask);

    /* Update loop->time unconditionally. It's tempting to skip the update when
     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
     * operating system didn't reschedule our process while in the syscall.
     */
    SAVE_ERRNO(uv__update_time(loop));

    if (nfds == -1)
      assert(errno == EINTR);
    else if (nfds == 0)
      /* Unlimited timeout should only return with events or signal. */
      assert(timeout != -1);

    if (nfds == 0 || nfds == -1) {
      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      } else if (nfds == 0) {
        return;
      }

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_iou_events = 0;
    have_signals = 0;
    nevents = 0;

    inv.nfds = nfds;
    lfields->inv = &inv;

    for (i = 0; i < nfds; i++) {
      pe = events + i;
      fd = pe->data.fd;

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;

      if (fd == iou->ringfd) {
        uv__poll_io_uring(loop, iou);
        have_iou_events = 1;
        continue;
      }

      assert(fd >= 0);
      assert((unsigned) fd < loop->nwatchers);

      w = loop->watchers[fd];

      if (w == NULL) {
        /* File descriptor that we've stopped watching, disarm it.
         *
         * Ignore all errors because we may be racing with another thread
         * when the file descriptor is closed.
         *
         * Perform EPOLL_CTL_DEL immediately instead of going through
         * io_uring's submit queue, otherwise the file descriptor may
         * be closed by the time the kernel starts the operation.
         */
        epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
        continue;
      }

      /* Give users only events they're interested in. Prevents spurious
       * callbacks when previous callback invocation in this loop has stopped
       * the current watcher. Also, filters out events that the user has not
       * requested us to watch.
       */
      pe->events &= w->pevents | POLLERR | POLLHUP;

      /* Work around an epoll quirk where it sometimes reports just the
       * EPOLLERR or EPOLLHUP event. In order to force the event loop to
       * move forward, we merge in the read/write events that the watcher
       * is interested in; uv__read() and uv__write() will then deal with
       * the error or hangup in the usual fashion.
       *
       * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
       * reads the available data, calls uv_read_stop(), then sometime later
       * calls uv_read_start() again. By then, libuv has forgotten about the
       * hangup and the kernel won't report EPOLLIN again because there's
       * nothing left to read. If anything, libuv is to blame here. The
       * current hack is just a quick bandaid; to properly fix it, libuv
       * needs to remember the error/hangup event. We should get that for
       * free when we switch over to edge-triggered I/O.
       */
      if (pe->events == POLLERR || pe->events == POLLHUP)
        pe->events |=
          w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);

      if (pe->events != 0) {
        /* Run signal watchers last. This also affects child process watchers
         * because those are implemented in terms of signal watchers.
         */
        if (w == &loop->signal_io_watcher) {
          have_signals = 1;
        } else {
          uv__metrics_update_idle_time(loop);
          w->cb(loop, w, pe->events);
        }

        nevents++;
      }
    }

    uv__metrics_inc_events(loop, nevents);
    if (reset_timeout != 0) {
      timeout = user_timeout;
      reset_timeout = 0;
      uv__metrics_inc_events_waiting(loop, nevents);
    }

    if (have_signals != 0) {
      uv__metrics_update_idle_time(loop);
      loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
    }

    lfields->inv = NULL;

    if (have_iou_events != 0)
      break;  /* Event loop should cycle now so don't poll again. */

    if (have_signals != 0)
      break;  /* Event loop should cycle now so don't poll again. */

    if (nevents != 0) {
      if (nfds == ARRAY_SIZE(events) && --count != 0) {
        /* Poll for more events but don't block this time. */
        timeout = 0;
        continue;
      }
      break;
    }

update_timeout:
    if (timeout == 0)
      break;

    if (timeout == -1)
      continue;

    assert(timeout > 0);

    real_timeout -= (loop->time - base);
    if (real_timeout <= 0)
      break;

    timeout = real_timeout;
  }

  if (ctl->ringfd != -1)
    while (*ctl->sqhead != *ctl->sqtail)
      uv__epoll_ctl_flush(epollfd, ctl, &prep);
}

uint64_t uv__hrtime(uv_clocktype_t type) {
  static _Atomic clock_t fast_clock_id = -1;
  struct timespec t;
  clock_t clock_id;

  /* Prefer CLOCK_MONOTONIC_COARSE if available but only when it has
   * millisecond granularity or better. CLOCK_MONOTONIC_COARSE is
   * serviced entirely from the vDSO, whereas CLOCK_MONOTONIC may
   * decide to make a costly system call.
   */
  /* TODO(bnoordhuis) Use CLOCK_MONOTONIC_COARSE for UV_CLOCK_PRECISE
   * when it has microsecond granularity or better (unlikely).
   */
  clock_id = CLOCK_MONOTONIC;
  if (type != UV_CLOCK_FAST)
    goto done;

  clock_id = atomic_load_explicit(&fast_clock_id, memory_order_relaxed);
  if (clock_id != -1)
    goto done;

  clock_id = CLOCK_MONOTONIC;
  if (0 == clock_getres(CLOCK_MONOTONIC_COARSE, &t))
    if (t.tv_nsec <= 1 * 1000 * 1000)
      clock_id = CLOCK_MONOTONIC_COARSE;

  atomic_store_explicit(&fast_clock_id, clock_id, memory_order_relaxed);

done:

  if (clock_gettime(clock_id, &t))
    return 0;  /* Not really possible. */

  return t.tv_sec * (uint64_t) 1e9 + t.tv_nsec;
}


int uv_resident_set_memory(size_t* rss) {
  char buf[1024];
  const char* s;
  long val;
  int rc;
  int i;

  /* rss: 24th element */
  rc = uv__slurp("/proc/self/stat", buf, sizeof(buf));
  if (rc < 0)
    return rc;

  /* find the last ')' */
  s = strrchr(buf, ')');
  if (s == NULL)
    goto err;

  for (i = 1; i <= 22; i++) {
    s = strchr(s + 1, ' ');
    if (s == NULL)
      goto err;
  }

  errno = 0;
  val = strtol(s, NULL, 10);
  if (val < 0 || errno != 0)
    goto err;

  *rss = val * getpagesize();
  return 0;

err:
  return UV_EINVAL;
}
|
|
|
1693
|
|
|
1694 int uv_uptime(double* uptime) {
|
|
|
1695 struct timespec now;
|
|
|
1696 char buf[128];
|
|
|
1697
|
|
|
1698 /* Consult /proc/uptime when present (common case), or fall back to
|
|
|
1699 * clock_gettime. Why not always clock_gettime? It doesn't always return the
|
|
|
1700 * right result under OpenVZ and possibly other containerized environments.
|
|
|
1701 */
|
|
|
1702 if (0 == uv__slurp("/proc/uptime", buf, sizeof(buf)))
|
|
|
1703 if (1 == sscanf(buf, "%lf", uptime))
|
|
|
1704 return 0;
|
|
|
1705
|
|
|
1706 if (clock_gettime(CLOCK_BOOTTIME, &now))
|
|
|
1707 return UV__ERR(errno);
|
|
|
1708
|
|
|
1709 *uptime = now.tv_sec;
|
|
|
1710 return 0;
|
|
|
1711 }
|
|
|
1712
|
|
|
1713
|
|
|
1714 int uv_cpu_info(uv_cpu_info_t** ci, int* count) {
|
|
|
1715 #if defined(__PPC__)
|
|
|
1716 static const char model_marker[] = "cpu\t\t: ";
|
|
|
1717 static const char model_marker2[] = "";
|
|
|
1718 #elif defined(__arm__)
|
|
|
1719 static const char model_marker[] = "model name\t: ";
|
|
|
1720 static const char model_marker2[] = "Processor\t: ";
|
|
|
1721 #elif defined(__aarch64__)
|
|
|
1722 static const char model_marker[] = "CPU part\t: ";
|
|
|
1723 static const char model_marker2[] = "";
|
|
|
1724 #elif defined(__mips__)
|
|
|
1725 static const char model_marker[] = "cpu model\t\t: ";
|
|
|
1726 static const char model_marker2[] = "";
|
|
|
1727 #elif defined(__loongarch__)
|
|
|
1728 static const char model_marker[] = "cpu family\t\t: ";
|
|
|
1729 static const char model_marker2[] = "";
|
|
|
1730 #else
|
|
|
1731 static const char model_marker[] = "model name\t: ";
|
|
|
1732 static const char model_marker2[] = "";
|
|
|
1733 #endif
|
|
|
1734 static const char parts[] =
|
|
|
1735 #ifdef __aarch64__
|
|
|
1736 "0x811\nARM810\n" "0x920\nARM920\n" "0x922\nARM922\n"
|
|
|
1737 "0x926\nARM926\n" "0x940\nARM940\n" "0x946\nARM946\n"
|
|
|
1738 "0x966\nARM966\n" "0xa20\nARM1020\n" "0xa22\nARM1022\n"
|
|
|
1739 "0xa26\nARM1026\n" "0xb02\nARM11 MPCore\n" "0xb36\nARM1136\n"
|
|
|
1740 "0xb56\nARM1156\n" "0xb76\nARM1176\n" "0xc05\nCortex-A5\n"
|
|
|
1741 "0xc07\nCortex-A7\n" "0xc08\nCortex-A8\n" "0xc09\nCortex-A9\n"
|
|
|
1742 "0xc0d\nCortex-A17\n" /* Originally A12 */
|
|
|
1743 "0xc0f\nCortex-A15\n" "0xc0e\nCortex-A17\n" "0xc14\nCortex-R4\n"
|
|
|
1744 "0xc15\nCortex-R5\n" "0xc17\nCortex-R7\n" "0xc18\nCortex-R8\n"
|
|
|
1745 "0xc20\nCortex-M0\n" "0xc21\nCortex-M1\n" "0xc23\nCortex-M3\n"
|
|
|
1746 "0xc24\nCortex-M4\n" "0xc27\nCortex-M7\n" "0xc60\nCortex-M0+\n"
|
|
|
1747 "0xd01\nCortex-A32\n" "0xd03\nCortex-A53\n" "0xd04\nCortex-A35\n"
|
|
|
1748 "0xd05\nCortex-A55\n" "0xd06\nCortex-A65\n" "0xd07\nCortex-A57\n"
|
|
|
1749 "0xd08\nCortex-A72\n" "0xd09\nCortex-A73\n" "0xd0a\nCortex-A75\n"
|
|
|
1750 "0xd0b\nCortex-A76\n" "0xd0c\nNeoverse-N1\n" "0xd0d\nCortex-A77\n"
|
|
|
1751 "0xd0e\nCortex-A76AE\n" "0xd13\nCortex-R52\n" "0xd20\nCortex-M23\n"
|
|
|
1752 "0xd21\nCortex-M33\n" "0xd41\nCortex-A78\n" "0xd42\nCortex-A78AE\n"
|
|
|
1753 "0xd4a\nNeoverse-E1\n" "0xd4b\nCortex-A78C\n"
|
|
|
1754 #endif
|
|
|
1755 "";
|
|
|
1756 struct cpu {
|
|
|
1757 unsigned long long freq, user, nice, sys, idle, irq;
|
|
|
1758 unsigned model;
|
|
|
1759 };
|
|
|
1760 FILE* fp;
|
|
|
1761 char* p;
|
|
|
1762 int found;
|
|
|
1763 int n;
|
|
|
1764 unsigned i;
|
|
|
1765 unsigned cpu;
|
|
|
1766 unsigned maxcpu;
|
|
|
1767 unsigned size;
|
|
|
1768 unsigned long long skip;
|
|
|
1769 struct cpu (*cpus)[8192]; /* Kernel maximum. */
|
|
|
1770 struct cpu* c;
|
|
|
1771 struct cpu t;
|
|
|
1772 char (*model)[64];
|
|
|
1773 unsigned char bitmap[ARRAY_SIZE(*cpus) / 8];
|
|
|
1774 /* Assumption: even big.LITTLE systems will have only a handful
|
|
|
1775 * of different CPU models. Most systems will just have one.
|
|
|
1776 */
|
|
|
1777 char models[8][64];
|
|
|
1778 char buf[1024];
|
|
|
1779
|
|
|
1780 memset(bitmap, 0, sizeof(bitmap));
|
|
|
1781 memset(models, 0, sizeof(models));
|
|
|
1782 snprintf(*models, sizeof(*models), "unknown");
|
|
|
1783 maxcpu = 0;
|
|
|
1784
|
|
|
1785 cpus = uv__calloc(ARRAY_SIZE(*cpus), sizeof(**cpus));
|
|
|
1786 if (cpus == NULL)
|
|
|
1787 return UV_ENOMEM;
|
|
|
1788
|
|
|
1789 fp = uv__open_file("/proc/stat");
|
|
|
1790 if (fp == NULL) {
|
|
|
1791 uv__free(cpus);
|
|
|
1792 return UV__ERR(errno);
|
|
|
1793 }
|
|
|
1794
|
|
|
1795 if (NULL == fgets(buf, sizeof(buf), fp))
|
|
|
1796 abort();
|
|
|
1797
|
|
|
1798 for (;;) {
|
|
|
1799 memset(&t, 0, sizeof(t));
|
|
|
1800
|
|
|
1801 n = fscanf(fp, "cpu%u %llu %llu %llu %llu %llu %llu",
|
|
|
1802 &cpu, &t.user, &t.nice, &t.sys, &t.idle, &skip, &t.irq);
|
|
|
1803
|
|
|
1804 if (n != 7)
|
|
|
1805 break;
|
|
|
1806
|
|
|
1807 if (NULL == fgets(buf, sizeof(buf), fp))
|
|
|
1808 abort();
|
|
|
1809
|
|
|
1810 if (cpu >= ARRAY_SIZE(*cpus))
|
|
|
1811 continue;
|
|
|
1812
|
|
|
1813 (*cpus)[cpu] = t;
|
|
|
1814
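/* Mark this CPU number as seen; /proc/stat can have gaps in the numbering
 * (offline or hot-unplugged CPUs), so a plain counter would not be enough.
 */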
|
|
|
1815 bitmap[cpu >> 3] |= 1 << (cpu & 7);
|
|
|
1816
|
|
|
1817 if (cpu >= maxcpu)
|
|
|
1818 maxcpu = cpu + 1;
|
|
|
1819 }
|
|
|
1820
|
|
|
1821 fclose(fp);
|
|
|
1822
|
|
|
1823 fp = uv__open_file("/proc/cpuinfo");
|
|
|
1824 if (fp == NULL)
|
|
|
1825 goto nocpuinfo;
|
|
|
1826
|
|
|
1827 for (;;) {
|
|
|
1828 if (1 != fscanf(fp, "processor\t: %u\n", &cpu))
|
|
|
1829 break; /* Parse error. */
|
|
|
1830
|
|
|
1831 while (fgets(buf, sizeof(buf), fp)) {
|
|
|
1832 if (!strncmp(buf, model_marker, sizeof(model_marker) - 1)) {
|
|
|
1833 p = buf + sizeof(model_marker) - 1;
|
|
|
1834 goto parts;
|
|
|
1835 }
|
|
|
1836 if (!*model_marker2)
|
|
|
1837 continue;
|
|
|
1838 if (!strncmp(buf, model_marker2, sizeof(model_marker2) - 1)) {
|
|
|
1839 p = buf + sizeof(model_marker2) - 1;
|
|
|
1840 goto parts;
|
|
|
1841 }
|
|
|
1842 }
|
|
|
1843
|
|
|
1844 goto next; /* Not found. */
|
|
|
1845
|
|
|
1846 parts:
|
|
|
1847 n = (int) strcspn(p, "\n");
|
|
|
1848
|
|
|
1849 /* arm64: translate CPU part code to model name. */
|
|
|
1850 if (*parts) {
|
|
|
1851 p = memmem(parts, sizeof(parts) - 1, p, n + 1);
|
|
|
1852 if (p == NULL)
|
|
|
1853 p = "unknown";
|
|
|
1854 else
|
|
|
1855 p += n + 1;
|
|
|
1856 n = (int) strcspn(p, "\n");
|
|
|
1857 }
|
|
|
1858
|
|
|
1859 found = 0;
|
|
|
1860 for (model = models; !found && model < ARRAY_END(models); model++)
|
|
|
1861 found = !strncmp(p, *model, strlen(*model));
|
|
|
1862
|
|
|
1863 if (!found)
|
|
|
1864 goto next;
|
|
|
1865
|
|
|
1866 if (**model == '\0')
|
|
|
1867 snprintf(*model, sizeof(*model), "%.*s", n, p);
|
|
|
1868
|
|
|
1869 if (cpu < maxcpu)
|
|
|
1870 (*cpus)[cpu].model = model - models;
|
|
|
1871
|
|
|
1872 next:
|
|
|
1873 while (fgets(buf, sizeof(buf), fp))
|
|
|
1874 if (*buf == '\n')
|
|
|
1875 break;
|
|
|
1876 }
|
|
|
1877
|
|
|
1878 fclose(fp);
|
|
|
1879 fp = NULL;
|
|
|
1880
|
|
|
1881 nocpuinfo:
|
|
|
1882
|
|
|
1883 n = 0;
|
|
|
1884 for (cpu = 0; cpu < maxcpu; cpu++) {
|
|
|
1885 if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
|
|
|
1886 continue;
|
|
|
1887
|
|
|
1888 n++;
|
|
|
1889 snprintf(buf, sizeof(buf),
|
|
|
1890 "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_cur_freq", cpu);
|
|
|
1891
|
|
|
1892 fp = uv__open_file(buf);
|
|
|
1893 if (fp == NULL)
|
|
|
1894 continue;
|
|
|
1895
|
|
|
1896 if (1 != fscanf(fp, "%llu", &(*cpus)[cpu].freq))
|
|
|
1897 abort();
|
|
|
1898 fclose(fp);
|
|
|
1899 fp = NULL;
|
|
|
1900 }
|
|
|
1901
|
|
|
1902 size = n * sizeof(**ci) + sizeof(models);
|
|
|
1903 *ci = uv__malloc(size);
|
|
|
1904 *count = 0;
|
|
|
1905
|
|
|
1906 if (*ci == NULL) {
|
|
|
1907 uv__free(cpus);
|
|
|
1908 return UV_ENOMEM;
|
|
|
1909 }
|
|
|
1910
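/* The model name strings live in the same allocation, right after the
 * array of uv_cpu_info_t entries; each entry's .model pointer below
 * indexes into this trailing copy instead of a per-entry allocation.
 */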
|
|
|
1911 *count = n;
|
|
|
1912 p = memcpy(*ci + n, models, sizeof(models));
|
|
|
1913
|
|
|
1914 i = 0;
|
|
|
1915 for (cpu = 0; cpu < maxcpu; cpu++) {
|
|
|
1916 if (!(bitmap[cpu >> 3] & (1 << (cpu & 7))))
|
|
|
1917 continue;
|
|
|
1918
|
|
|
1919 c = *cpus + cpu;
|
|
|
1920
|
|
|
1921 (*ci)[i++] = (uv_cpu_info_t) {
|
|
|
1922 .model = p + c->model * sizeof(*model),
|
|
|
1923 .speed = c->freq / 1000,
|
|
|
1924 /* Note: sysconf(_SC_CLK_TCK) is fixed at 100 Hz,
|
|
|
1925 * therefore the multiplier is always 1000/100 = 10.
|
|
|
1926 */
|
|
|
1927 .cpu_times = (struct uv_cpu_times_s) {
|
|
|
1928 .user = 10 * c->user,
|
|
|
1929 .nice = 10 * c->nice,
|
|
|
1930 .sys = 10 * c->sys,
|
|
|
1931 .idle = 10 * c->idle,
|
|
|
1932 .irq = 10 * c->irq,
|
|
|
1933 },
|
|
|
1934 };
|
|
|
1935 }
|
|
|
1936
|
|
|
1937 uv__free(cpus);
|
|
|
1938
|
|
|
1939 return 0;
|
|
|
1940 }
|
|
|
1941
|
|
|
1942
|
|
|
1943 static int uv__ifaddr_exclude(struct ifaddrs *ent, int exclude_type) {
|
|
|
1944 if (!((ent->ifa_flags & IFF_UP) && (ent->ifa_flags & IFF_RUNNING)))
|
|
|
1945 return 1;
|
|
|
1946 if (ent->ifa_addr == NULL)
|
|
|
1947 return 1;
|
|
|
1948 /*
|
|
|
1949 * On Linux getifaddrs returns information related to the raw underlying
|
|
|
1950 * devices. We're not interested in this information yet.
|
|
|
1951 */
|
|
|
1952 if (ent->ifa_addr->sa_family == PF_PACKET)
|
|
|
1953 return exclude_type;
|
|
|
1954 return !exclude_type;
|
|
|
1955 }
|
|
|
1956
|
|
|
1957 /* TODO(bnoordhuis) share with bsd-ifaddrs.c */
|
|
|
1958 int uv_interface_addresses(uv_interface_address_t** addresses, int* count) {
|
|
|
1959 uv_interface_address_t* address;
|
|
|
1960 struct sockaddr_ll* sll;
|
|
|
1961 struct ifaddrs* addrs;
|
|
|
1962 struct ifaddrs* ent;
|
|
|
1963 size_t namelen;
|
|
|
1964 char* name;
|
|
|
1965 int i;
|
|
|
1966
|
|
|
1967 *count = 0;
|
|
|
1968 *addresses = NULL;
|
|
|
1969
|
|
|
1970 if (getifaddrs(&addrs))
|
|
|
1971 return UV__ERR(errno);
|
|
|
1972
|
|
|
1973 /* Count the interfaces and the total length of their names */
|
|
|
1974 namelen = 0;
|
|
|
1975 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
|
|
|
1976 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
|
|
|
1977 continue;
|
|
|
1978
|
|
|
1979 namelen += strlen(ent->ifa_name) + 1;
|
|
|
1980 (*count)++;
|
|
|
1981 }
|
|
|
1982
|
|
|
1983 if (*count == 0) {
|
|
|
1984 freeifaddrs(addrs);
|
|
|
1985 return 0;
|
|
|
1986 }
|
|
|
1987
|
|
|
1988 /* Make sure the memory is initialized to zero by using calloc() */
|
|
|
1989 *addresses = uv__calloc(1, *count * sizeof(**addresses) + namelen);
|
|
|
1990 if (*addresses == NULL) {
|
|
|
1991 freeifaddrs(addrs);
|
|
|
1992 return UV_ENOMEM;
|
|
|
1993 }
|
|
|
1994
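/* The interface names are stored in the same calloc'd block, directly
 * after the array of entries; `name` walks through that trailing region
 * as each entry is filled in.
 */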
|
|
|
1995 name = (char*) &(*addresses)[*count];
|
|
|
1996 address = *addresses;
|
|
|
1997
|
|
|
1998 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
|
|
|
1999 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFADDR))
|
|
|
2000 continue;
|
|
|
2001
|
|
|
2002 namelen = strlen(ent->ifa_name) + 1;
|
|
|
2003 address->name = memcpy(name, ent->ifa_name, namelen);
|
|
|
2004 name += namelen;
|
|
|
2005
|
|
|
2006 if (ent->ifa_addr->sa_family == AF_INET6) {
|
|
|
2007 address->address.address6 = *((struct sockaddr_in6*) ent->ifa_addr);
|
|
|
2008 } else {
|
|
|
2009 address->address.address4 = *((struct sockaddr_in*) ent->ifa_addr);
|
|
|
2010 }
|
|
|
2011
|
|
|
2012 if (ent->ifa_netmask->sa_family == AF_INET6) {
|
|
|
2013 address->netmask.netmask6 = *((struct sockaddr_in6*) ent->ifa_netmask);
|
|
|
2014 } else {
|
|
|
2015 address->netmask.netmask4 = *((struct sockaddr_in*) ent->ifa_netmask);
|
|
|
2016 }
|
|
|
2017
|
|
|
2018 address->is_internal = !!(ent->ifa_flags & IFF_LOOPBACK);
|
|
|
2019
|
|
|
2020 address++;
|
|
|
2021 }
|
|
|
2022
|
|
|
2023 /* Fill in physical addresses for each interface */
|
|
|
2024 for (ent = addrs; ent != NULL; ent = ent->ifa_next) {
|
|
|
2025 if (uv__ifaddr_exclude(ent, UV__EXCLUDE_IFPHYS))
|
|
|
2026 continue;
|
|
|
2027
|
|
|
2028 address = *addresses;
|
|
|
2029
|
|
|
2030 for (i = 0; i < (*count); i++) {
|
|
|
2031 size_t namelen = strlen(ent->ifa_name);
|
|
|
2032 /* Alias interfaces share the same physical address */
|
|
|
2033 if (strncmp(address->name, ent->ifa_name, namelen) == 0 &&
|
|
|
2034 (address->name[namelen] == 0 || address->name[namelen] == ':')) {
|
|
|
2035 sll = (struct sockaddr_ll*)ent->ifa_addr;
|
|
|
2036 memcpy(address->phys_addr, sll->sll_addr, sizeof(address->phys_addr));
|
|
|
2037 }
|
|
|
2038 address++;
|
|
|
2039 }
|
|
|
2040 }
|
|
|
2041
|
|
|
2042 freeifaddrs(addrs);
|
|
|
2043
|
|
|
2044 return 0;
|
|
|
2045 }
|
|
|
2046
|
|
|
2047
|
|
|
2048 /* TODO(bnoordhuis) share with bsd-ifaddrs.c */
|
|
|
2049 void uv_free_interface_addresses(uv_interface_address_t* addresses,
|
|
|
2050 int count) {
|
|
|
2051 uv__free(addresses);
|
|
|
2052 }
|
|
|
2053
|
|
|
2054
|
|
|
2055 void uv__set_process_title(const char* title) {
|
|
|
2056 #if defined(PR_SET_NAME)
|
|
|
2057 prctl(PR_SET_NAME, title); /* Only copies first 16 characters. */
|
|
|
2058 #endif
|
|
|
2059 }
|
|
|
2060
|
|
|
2061
|
|
|
2062 static uint64_t uv__read_proc_meminfo(const char* what) {
|
|
|
2063 uint64_t rc;
|
|
|
2064 char* p;
|
|
|
2065 char buf[4096]; /* Large enough to hold all of /proc/meminfo. */
|
|
|
2066
|
|
|
2067 if (uv__slurp("/proc/meminfo", buf, sizeof(buf)))
|
|
|
2068 return 0;
|
|
|
2069
|
|
|
2070 p = strstr(buf, what);
|
|
|
2071
|
|
|
2072 if (p == NULL)
|
|
|
2073 return 0;
|
|
|
2074
|
|
|
2075 p += strlen(what);
|
|
|
2076
|
|
|
2077 rc = 0;
|
|
|
2078 sscanf(p, "%" PRIu64 " kB", &rc);
|
|
|
2079
|
|
|
2080 return rc * 1024;
|
|
|
2081 }
|
|
|
2082
|
|
|
2083
|
|
|
2084 uint64_t uv_get_free_memory(void) {
|
|
|
2085 struct sysinfo info;
|
|
|
2086 uint64_t rc;
|
|
|
2087
|
|
|
2088 rc = uv__read_proc_meminfo("MemAvailable:");
|
|
|
2089
|
|
|
2090 if (rc != 0)
|
|
|
2091 return rc;
|
|
|
2092
|
|
|
2093 if (0 == sysinfo(&info))
|
|
|
2094 return (uint64_t) info.freeram * info.mem_unit;
|
|
|
2095
|
|
|
2096 return 0;
|
|
|
2097 }
|
|
|
2098
|
|
|
2099
|
|
|
2100 uint64_t uv_get_total_memory(void) {
|
|
|
2101 struct sysinfo info;
|
|
|
2102 uint64_t rc;
|
|
|
2103
|
|
|
2104 rc = uv__read_proc_meminfo("MemTotal:");
|
|
|
2105
|
|
|
2106 if (rc != 0)
|
|
|
2107 return rc;
|
|
|
2108
|
|
|
2109 if (0 == sysinfo(&info))
|
|
|
2110 return (uint64_t) info.totalram * info.mem_unit;
|
|
|
2111
|
|
|
2112 return 0;
|
|
|
2113 }
|
|
|
2114
|
|
|
2115
|
|
|
2116 static uint64_t uv__read_uint64(const char* filename) {
|
|
|
2117 char buf[32]; /* Large enough to hold an encoded uint64_t. */
|
|
|
2118 uint64_t rc;
|
|
|
2119
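/* cgroup v2 writes the literal string "max" to mean "no limit"; map that
 * to UINT64_MAX so callers can treat it as effectively unconstrained.
 */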
|
|
|
2120 rc = 0;
|
|
|
2121 if (0 == uv__slurp(filename, buf, sizeof(buf)))
|
|
|
2122 if (1 != sscanf(buf, "%" PRIu64, &rc))
|
|
|
2123 if (0 == strcmp(buf, "max\n"))
|
|
|
2124 rc = UINT64_MAX;
|
|
|
2125
|
|
|
2126 return rc;
|
|
|
2127 }
|
|
|
2128
|
|
|
2129
|
|
|
2130 /* Given a buffer with the contents of a cgroup1 /proc/self/cgroup,
|
|
|
2131 * finds the location and length of the memory controller mount path.
|
|
|
2132 * This disregards the leading / for easy concatenation of paths.
|
|
|
2133 * Returns NULL if the memory controller wasn't found. */
|
|
|
2134 static char* uv__cgroup1_find_memory_controller(char buf[static 1024],
|
|
|
2135 int* n) {
|
|
|
2136 char* p;
|
|
|
2137
|
|
|
2138 /* Seek to the memory controller line. */
|
|
|
2139 p = strchr(buf, ':');
|
|
|
2140 while (p != NULL && strncmp(p, ":memory:", 8)) {
|
|
|
2141 p = strchr(p, '\n');
|
|
|
2142 if (p != NULL)
|
|
|
2143 p = strchr(p, ':');
|
|
|
2144 }
|
|
|
2145
|
|
|
2146 if (p != NULL) {
|
|
|
2147 /* Determine the length of the mount path. */
|
|
|
2148 p = p + strlen(":memory:/");
|
|
|
2149 *n = (int) strcspn(p, "\n");
|
|
|
2150 }
|
|
|
2151
|
|
|
2152 return p;
|
|
|
2153 }
|
|
|
2154
|
|
|
2155 static void uv__get_cgroup1_memory_limits(char buf[static 1024], uint64_t* high,
|
|
|
2156 uint64_t* max) {
|
|
|
2157 char filename[4097];
|
|
|
2158 char* p;
|
|
|
2159 int n;
|
|
|
2160 uint64_t cgroup1_max;
|
|
|
2161
|
|
|
2162 /* Find out where the controller is mounted. */
|
|
|
2163 p = uv__cgroup1_find_memory_controller(buf, &n);
|
|
|
2164 if (p != NULL) {
|
|
|
2165 snprintf(filename, sizeof(filename),
|
|
|
2166 "/sys/fs/cgroup/memory/%.*s/memory.soft_limit_in_bytes", n, p);
|
|
|
2167 *high = uv__read_uint64(filename);
|
|
|
2168
|
|
|
2169 snprintf(filename, sizeof(filename),
|
|
|
2170 "/sys/fs/cgroup/memory/%.*s/memory.limit_in_bytes", n, p);
|
|
|
2171 *max = uv__read_uint64(filename);
|
|
|
2172
|
|
|
2173 /* If the controller wasn't mounted, the reads above will have failed,
|
|
|
2174 * as indicated by uv__read_uint64 returning 0.
|
|
|
2175 */
|
|
|
2176 if (*high != 0 && *max != 0)
|
|
|
2177 goto update_limits;
|
|
|
2178 }
|
|
|
2179
|
|
|
2180 /* Fall back to the limits of the global memory controller. */
|
|
|
2181 *high = uv__read_uint64("/sys/fs/cgroup/memory/memory.soft_limit_in_bytes");
|
|
|
2182 *max = uv__read_uint64("/sys/fs/cgroup/memory/memory.limit_in_bytes");
|
|
|
2183
|
|
|
2184 /* uv__read_uint64 detects cgroup2's "max", so we need to separately detect
|
|
|
2185 * cgroup1's maximum value (which is derived from LONG_MAX and PAGE_SIZE).
|
|
|
2186 */
|
|
|
2187 update_limits:
|
|
|
2188 cgroup1_max = LONG_MAX & ~(sysconf(_SC_PAGESIZE) - 1);
|
|
|
2189 if (*high == cgroup1_max)
|
|
|
2190 *high = UINT64_MAX;
|
|
|
2191 if (*max == cgroup1_max)
|
|
|
2192 *max = UINT64_MAX;
|
|
|
2193 }
|
|
|
2194
|
|
|
2195 static void uv__get_cgroup2_memory_limits(char buf[static 1024], uint64_t* high,
|
|
|
2196 uint64_t* max) {
|
|
|
2197 char filename[4097];
|
|
|
2198 char* p;
|
|
|
2199 int n;
|
|
|
2200
|
|
|
2201 /* Find out where the controller is mounted. */
|
|
|
2202 p = buf + strlen("0::/");
|
|
|
2203 n = (int) strcspn(p, "\n");
|
|
|
2204
|
|
|
2205 /* Read the memory limits of the controller. */
|
|
|
2206 snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.max", n, p);
|
|
|
2207 *max = uv__read_uint64(filename);
|
|
|
2208 snprintf(filename, sizeof(filename), "/sys/fs/cgroup/%.*s/memory.high", n, p);
|
|
|
2209 *high = uv__read_uint64(filename);
|
|
|
2210 }
|
|
|
2211
|
|
|
2212 static uint64_t uv__get_cgroup_constrained_memory(char buf[static 1024]) {
|
|
|
2213 uint64_t high;
|
|
|
2214 uint64_t max;
|
|
|
2215
|
|
|
2216 /* In the case of cgroupv2, we'll only have a single entry. */
|
|
|
2217 if (strncmp(buf, "0::/", 4))
|
|
|
2218 uv__get_cgroup1_memory_limits(buf, &high, &max);
|
|
|
2219 else
|
|
|
2220 uv__get_cgroup2_memory_limits(buf, &high, &max);
|
|
|
2221
|
|
|
2222 if (high == 0 || max == 0)
|
|
|
2223 return 0;
|
|
|
2224
|
|
|
2225 return high < max ? high : max;
|
|
|
2226 }
|
|
|
2227
|
|
|
2228 uint64_t uv_get_constrained_memory(void) {
|
|
|
2229 char buf[1024];
|
|
|
2230
|
|
|
2231 if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
|
|
|
2232 return 0;
|
|
|
2233
|
|
|
2234 return uv__get_cgroup_constrained_memory(buf);
|
|
|
2235 }
|
|
|
2236
|
|
|
2237
|
|
|
2238 static uint64_t uv__get_cgroup1_current_memory(char buf[static 1024]) {
|
|
|
2239 char filename[4097];
|
|
|
2240 uint64_t current;
|
|
|
2241 char* p;
|
|
|
2242 int n;
|
|
|
2243
|
|
|
2244 /* Find out where the controller is mounted. */
|
|
|
2245 p = uv__cgroup1_find_memory_controller(buf, &n);
|
|
|
2246 if (p != NULL) {
|
|
|
2247 snprintf(filename, sizeof(filename),
|
|
|
2248 "/sys/fs/cgroup/memory/%.*s/memory.usage_in_bytes", n, p);
|
|
|
2249 current = uv__read_uint64(filename);
|
|
|
2250
|
|
|
2251 /* If the controller wasn't mounted, the reads above will have failed,
|
|
|
2252 * as indicated by uv__read_uint64 returning 0.
|
|
|
2253 */
|
|
|
2254 if (current != 0)
|
|
|
2255 return current;
|
|
|
2256 }
|
|
|
2257
|
|
|
2258 /* Fall back to the usage of the global memory controller. */
|
|
|
2259 return uv__read_uint64("/sys/fs/cgroup/memory/memory.usage_in_bytes");
|
|
|
2260 }
|
|
|
2261
|
|
|
2262 static uint64_t uv__get_cgroup2_current_memory(char buf[static 1024]) {
|
|
|
2263 char filename[4097];
|
|
|
2264 char* p;
|
|
|
2265 int n;
|
|
|
2266
|
|
|
2267 /* Find out where the controller is mounted. */
|
|
|
2268 p = buf + strlen("0::/");
|
|
|
2269 n = (int) strcspn(p, "\n");
|
|
|
2270
|
|
|
2271 snprintf(filename, sizeof(filename),
|
|
|
2272 "/sys/fs/cgroup/%.*s/memory.current", n, p);
|
|
|
2273 return uv__read_uint64(filename);
|
|
|
2274 }
|
|
|
2275
|
|
|
2276 uint64_t uv_get_available_memory(void) {
|
|
|
2277 char buf[1024];
|
|
|
2278 uint64_t constrained;
|
|
|
2279 uint64_t current;
|
|
|
2280 uint64_t total;
|
|
|
2281
|
|
|
2282 if (uv__slurp("/proc/self/cgroup", buf, sizeof(buf)))
|
|
|
2283 return 0;
|
|
|
2284
|
|
|
2285 constrained = uv__get_cgroup_constrained_memory(buf);
|
|
|
2286 if (constrained == 0)
|
|
|
2287 return uv_get_free_memory();
|
|
|
2288
|
|
|
2289 total = uv_get_total_memory();
|
|
|
2290 if (constrained > total)
|
|
|
2291 return uv_get_free_memory();
|
|
|
2292
|
|
|
2293 /* In the case of cgroupv2, we'll only have a single entry. */
|
|
|
2294 if (strncmp(buf, "0::/", 4))
|
|
|
2295 current = uv__get_cgroup1_current_memory(buf);
|
|
|
2296 else
|
|
|
2297 current = uv__get_cgroup2_current_memory(buf);
|
|
|
2298
|
|
|
2299 /* Memory usage can be higher than the limit (for short bursts of time). */
|
|
|
2300 if (constrained < current)
|
|
|
2301 return 0;
|
|
|
2302
|
|
|
2303 return constrained - current;
|
|
|
2304 }
|
|
|
2305
|
|
|
2306
|
|
|
2307 static int uv__get_cgroupv2_constrained_cpu(const char* cgroup,
|
|
|
2308 long long* quota) {
|
|
|
2309 static const char cgroup_mount[] = "/sys/fs/cgroup";
|
|
|
2310 const char* cgroup_trimmed;
|
|
|
2311 char buf[1024];
|
|
|
2312 char full_path[256];
|
|
|
2313 char path[256];
|
|
|
2314 char quota_buf[16];
|
|
|
2315 char* last_slash;
|
|
|
2316 int cgroup_size;
|
|
|
2317 long long limit;
|
|
|
2318 long long min_quota;
|
|
|
2319 long long period;
|
|
|
2320
|
|
|
2321 if (strncmp(cgroup, "0::/", 4) != 0)
|
|
|
2322 return UV_EINVAL;
|
|
|
2323
|
|
|
2324 /* Only use the part of the line up to the trailing newline. */
|
|
|
2325 cgroup_trimmed = cgroup + sizeof("0::/") - 1; /* Skip the prefix "0::/" */
|
|
|
2326 cgroup_size = (int)strcspn(cgroup_trimmed, "\n"); /* Find the first \n */
|
|
|
2327 min_quota = LLONG_MAX;
|
|
|
2328
|
|
|
2329 /* Construct the path to this cgroup's cgroup.controllers file */
|
|
|
2330 snprintf(path, sizeof(path), "%s/%.*s/cgroup.controllers", cgroup_mount,
|
|
|
2331 cgroup_size, cgroup_trimmed);
|
|
|
2332
|
|
|
2333 /* Read cgroup.controllers; if it doesn't exist, this isn't really a cgroup */
|
|
|
2334 if (uv__slurp(path, buf, sizeof(buf)) < 0)
|
|
|
2335 return UV_EIO;
|
|
|
2336
|
|
|
2337 snprintf(path, sizeof(path), "%s/%.*s", cgroup_mount, cgroup_size,
|
|
|
2338 cgroup_trimmed);
|
|
|
2339
|
|
|
2340 /*
|
|
|
2341 * Traverse up the cgroup v2 hierarchy, starting from the current cgroup path.
|
|
|
2342 * At each level, attempt to read the "cpu.max" file, which defines the CPU
|
|
|
2343 * quota and period.
|
|
|
2344 *
|
|
|
2345 * This reflects how Linux applies cgroup limits hierarchically.
|
|
|
2346 *
|
|
|
2347 * e.g: given a path like /sys/fs/cgroup/foo/bar/baz, we check:
|
|
|
2348 * - /sys/fs/cgroup/foo/bar/baz/cpu.max
|
|
|
2349 * - /sys/fs/cgroup/foo/bar/cpu.max
|
|
|
2350 * - /sys/fs/cgroup/foo/cpu.max
|
|
|
2351 * - /sys/fs/cgroup/cpu.max
|
|
|
2352 */
|
|
|
2353 while (strncmp(path, cgroup_mount, strlen(cgroup_mount)) == 0) {
|
|
|
2354 snprintf(full_path, sizeof(full_path), "%s/cpu.max", path);
|
|
|
2355
|
|
|
2356 /* Silently ignore and continue if the file does not exist */
|
|
|
2357 if (uv__slurp(full_path, quota_buf, sizeof(quota_buf)) < 0)
|
|
|
2358 goto next;
|
|
|
2359
|
|
|
2360 /* No limit, move on */
|
|
|
2361 if (strncmp(quota_buf, "max", 3) == 0)
|
|
|
2362 goto next;
|
|
|
2363
|
|
|
2364 /* Read cpu.max */
|
|
|
2365 if (sscanf(quota_buf, "%lld %lld", &limit, &period) != 2)
|
|
|
2366 goto next;
|
|
|
2367
|
|
|
2368 /* Can't divide by 0 */
|
|
|
2369 if (period == 0)
|
|
|
2370 goto next;
|
|
|
2371
|
|
|
2372 *quota = limit / period;
|
|
|
2373 if (*quota < min_quota)
|
|
|
2374 min_quota = *quota;
|
|
|
2375
|
|
|
2376 next:
|
|
|
2377 /* Move up one level in the cgroup hierarchy by trimming the last path component.
|
|
|
2378 * The loop ends once we reach the cgroup root mount point.
|
|
|
2379 */
|
|
|
2380 last_slash = strrchr(path, '/');
|
|
|
2381 if (last_slash == NULL || strcmp(path, cgroup_mount) == 0)
|
|
|
2382 break;
|
|
|
2383 *last_slash = '\0';
|
|
|
2384 }
|
|
|
2385
|
|
|
2386 return 0;
|
|
|
2387 }
|
|
|
2388
|
|
|
2389 static char* uv__cgroup1_find_cpu_controller(const char* cgroup,
|
|
|
2390 int* cgroup_size) {
|
|
|
2391 /* Seek to the cpu controller line. */
|
|
|
2392 char* cgroup_cpu = strstr(cgroup, ":cpu,");
|
|
|
2393
|
|
|
2394 if (cgroup_cpu != NULL) {
|
|
|
2395 /* Skip the controller prefix to the start of the cgroup path. */
|
|
|
2396 cgroup_cpu += sizeof(":cpu,") - 1;
|
|
|
2397 /* Determine the length of the cgroup path, excluding the newline. */
|
|
|
2398 *cgroup_size = (int)strcspn(cgroup_cpu, "\n");
|
|
|
2399 }
|
|
|
2400
|
|
|
2401 return cgroup_cpu;
|
|
|
2402 }
|
|
|
2403
|
|
|
2404 static int uv__get_cgroupv1_constrained_cpu(const char* cgroup,
|
|
|
2405 long long* quota) {
|
|
|
2406 char path[256];
|
|
|
2407 char buf[1024];
|
|
|
2408 int cgroup_size;
|
|
|
2409 char* cgroup_cpu;
|
|
|
2410 long long period_length;
|
|
|
2411 long long quota_per_period;
|
|
|
2412
|
|
|
2413 cgroup_cpu = uv__cgroup1_find_cpu_controller(cgroup, &cgroup_size);
|
|
|
2414
|
|
|
2415 if (cgroup_cpu == NULL)
|
|
|
2416 return UV_EIO;
|
|
|
2417
|
|
|
2418 /* Construct the path to the cpu.cfs_quota_us file */
|
|
|
2419 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_quota_us",
|
|
|
2420 cgroup_size, cgroup_cpu);
|
|
|
2421
|
|
|
2422 /* Read cpu.cfs_quota_us */
|
|
|
2423 if (uv__slurp(path, buf, sizeof(buf)) < 0)
|
|
|
2424 return UV_EIO;
|
|
|
2425
|
|
|
2426 if (sscanf(buf, "%lld", &quota_per_period) != 1)
|
|
|
2427 return UV_EINVAL;
|
|
|
2428
|
|
|
2429 /* Construct the path to the cpu.cfs_period_us file */
|
|
|
2430 snprintf(path, sizeof(path), "/sys/fs/cgroup/%.*s/cpu.cfs_period_us",
|
|
|
2431 cgroup_size, cgroup_cpu);
|
|
|
2432
|
|
|
2433 /* Read cpu.cfs_period_us */
|
|
|
2434 if (uv__slurp(path, buf, sizeof(buf)) < 0)
|
|
|
2435 return UV_EIO;
|
|
|
2436
|
|
|
2437 if (sscanf(buf, "%lld", &period_length) != 1)
|
|
|
2438 return UV_EINVAL;
|
|
|
2439
|
|
|
2440 /* Can't divide by 0 */
|
|
|
2441 if (period_length == 0)
|
|
|
2442 return UV_EINVAL;
|
|
|
2443
|
|
|
2444 *quota = quota_per_period / period_length;
|
|
|
2445
|
|
|
2446 return 0;
|
|
|
2447 }
|
|
|
2448
|
|
|
2449 int uv__get_constrained_cpu(long long* quota) {
|
|
|
2450 char cgroup[1024];
|
|
|
2451
|
|
|
2452 /* Read the cgroup from /proc/self/cgroup */
|
|
|
2453 if (uv__slurp("/proc/self/cgroup", cgroup, sizeof(cgroup)) < 0)
|
|
|
2454 return UV_EIO;
|
|
|
2455
|
|
|
2456 /* Check if the system is using cgroup v2 by examining /proc/self/cgroup
|
|
|
2457 * The entry for cgroup v2 is always in the format "0::$PATH"
|
|
|
2458 * see https://docs.kernel.org/admin-guide/cgroup-v2.html */
|
|
|
2459 if (strncmp(cgroup, "0::/", 4) == 0)
|
|
|
2460 return uv__get_cgroupv2_constrained_cpu(cgroup, quota);
|
|
|
2461 else
|
|
|
2462 return uv__get_cgroupv1_constrained_cpu(cgroup, quota);
|
|
|
2463 }
|
|
|
2464
|
|
|
2465
|
|
|
2466 void uv_loadavg(double avg[3]) {
|
|
|
2467 struct sysinfo info;
|
|
|
2468 char buf[128]; /* Large enough to hold all of /proc/loadavg. */
|
|
|
2469
|
|
|
2470 if (0 == uv__slurp("/proc/loadavg", buf, sizeof(buf)))
|
|
|
2471 if (3 == sscanf(buf, "%lf %lf %lf", &avg[0], &avg[1], &avg[2]))
|
|
|
2472 return;
|
|
|
2473
|
|
|
2474 if (sysinfo(&info) < 0)
|
|
|
2475 return;
|
|
|
2476
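/* sysinfo() reports load averages as fixed-point values scaled by
 * 1 << SI_LOAD_SHIFT (i.e. 65536), hence the division below.
 */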
|
|
|
2477 avg[0] = (double) info.loads[0] / 65536.0;
|
|
|
2478 avg[1] = (double) info.loads[1] / 65536.0;
|
|
|
2479 avg[2] = (double) info.loads[2] / 65536.0;
|
|
|
2480 }
|
|
|
2481
|
|
|
2482
|
|
|
2483 static int compare_watchers(const struct watcher_list* a,
|
|
|
2484 const struct watcher_list* b) {
|
|
|
2485 if (a->wd < b->wd) return -1;
|
|
|
2486 if (a->wd > b->wd) return 1;
|
|
|
2487 return 0;
|
|
|
2488 }
|
|
|
2489
|
|
|
2490
|
|
|
2491 static int init_inotify(uv_loop_t* loop) {
|
|
|
2492 int err;
|
|
|
2493 int fd;
|
|
|
2494
|
|
|
2495 if (loop->inotify_fd != -1)
|
|
|
2496 return 0;
|
|
|
2497
|
|
|
2498 fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
|
|
|
2499 if (fd < 0)
|
|
|
2500 return UV__ERR(errno);
|
|
|
2501
|
|
|
2502 err = uv__io_init_start(loop, &loop->inotify_read_watcher, uv__inotify_read,
|
|
|
2503 fd, POLLIN);
|
|
|
2504 if (err) {
|
|
|
2505 uv__close(fd);
|
|
|
2506 return err;
|
|
|
2507 }
|
|
|
2508
|
|
|
2509 loop->inotify_fd = fd;
|
|
|
2510 return 0;
|
|
|
2511 }
|
|
|
2512
|
|
|
2513
|
|
|
2514 static int uv__inotify_fork(uv_loop_t* loop, struct watcher_list* root) {
|
|
|
2515 /* Open the inotify_fd, and re-arm all the inotify watchers. */
|
|
|
2516 int err;
|
|
|
2517 struct watcher_list* tmp_watcher_list_iter;
|
|
|
2518 struct watcher_list* watcher_list;
|
|
|
2519 struct watcher_list tmp_watcher_list;
|
|
|
2520 struct uv__queue queue;
|
|
|
2521 struct uv__queue* q;
|
|
|
2522 uv_fs_event_t* handle;
|
|
|
2523 char* tmp_path;
|
|
|
2524
|
|
|
2525 if (root == NULL)
|
|
|
2526 return 0;
|
|
|
2527
|
|
|
2528 /* We must restore the old watcher list to be able to close items
|
|
|
2529 * out of it.
|
|
|
2530 */
|
|
|
2531 loop->inotify_watchers = root;
|
|
|
2532
|
|
|
2533 uv__queue_init(&tmp_watcher_list.watchers);
|
|
|
2534 /* Note that the queue we use is shared with the start and stop()
|
|
|
2535 * functions, making uv__queue_foreach unsafe to use. So we use the
|
|
|
2536 * uv__queue_move trick to safely iterate. Also don't free the watcher
|
|
|
2537 * list until we're done iterating. c.f. uv__inotify_read.
|
|
|
2538 */
|
|
|
2539 RB_FOREACH_SAFE(watcher_list, watcher_root,
|
|
|
2540 uv__inotify_watchers(loop), tmp_watcher_list_iter) {
|
|
|
2541 watcher_list->iterating = 1;
|
|
|
2542 uv__queue_move(&watcher_list->watchers, &queue);
|
|
|
2543 while (!uv__queue_empty(&queue)) {
|
|
|
2544 q = uv__queue_head(&queue);
|
|
|
2545 handle = uv__queue_data(q, uv_fs_event_t, watchers);
|
|
|
2546 /* It's critical to keep a copy of path here, because it
|
|
|
2547 * will be set to NULL by stop() and then deallocated by
|
|
|
2548 * maybe_free_watcher_list
|
|
|
2549 */
|
|
|
2550 tmp_path = uv__strdup(handle->path);
|
|
|
2551 assert(tmp_path != NULL);
|
|
|
2552 uv__queue_remove(q);
|
|
|
2553 uv__queue_insert_tail(&watcher_list->watchers, q);
|
|
|
2554 uv_fs_event_stop(handle);
|
|
|
2555
|
|
|
2556 uv__queue_insert_tail(&tmp_watcher_list.watchers, &handle->watchers);
|
|
|
2557 handle->path = tmp_path;
|
|
|
2558 }
|
|
|
2559 watcher_list->iterating = 0;
|
|
|
2560 maybe_free_watcher_list(watcher_list, loop);
|
|
|
2561 }
|
|
|
2562
|
|
|
2563 uv__queue_move(&tmp_watcher_list.watchers, &queue);
|
|
|
2564 while (!uv__queue_empty(&queue)) {
|
|
|
2565 q = uv__queue_head(&queue);
|
|
|
2566 uv__queue_remove(q);
|
|
|
2567 handle = uv__queue_data(q, uv_fs_event_t, watchers);
|
|
|
2568 tmp_path = handle->path;
|
|
|
2569 handle->path = NULL;
|
|
|
2570 err = uv_fs_event_start(handle, handle->cb, tmp_path, 0);
|
|
|
2571 uv__free(tmp_path);
|
|
|
2572 if (err)
|
|
|
2573 return err;
|
|
|
2574 }
|
|
|
2575
|
|
|
2576 return 0;
|
|
|
2577 }
|
|
|
2578
|
|
|
2579
|
|
|
2580 static struct watcher_list* find_watcher(uv_loop_t* loop, int wd) {
|
|
|
2581 struct watcher_list w;
|
|
|
2582 w.wd = wd;
|
|
|
2583 return RB_FIND(watcher_root, uv__inotify_watchers(loop), &w);
|
|
|
2584 }
|
|
|
2585
|
|
|
2586
|
|
|
2587 static void maybe_free_watcher_list(struct watcher_list* w, uv_loop_t* loop) {
|
|
|
2588 /* If watcher_list->watchers is being iterated over, we can't free it. */
|
|
|
2589 if ((!w->iterating) && uv__queue_empty(&w->watchers)) {
|
|
|
2590 /* No watchers left for this path. Clean up. */
|
|
|
2591 RB_REMOVE(watcher_root, uv__inotify_watchers(loop), w);
|
|
|
2592 inotify_rm_watch(loop->inotify_fd, w->wd);
|
|
|
2593 uv__free(w);
|
|
|
2594 }
|
|
|
2595 }
|
|
|
2596
|
|
|
2597
|
|
|
2598 static void uv__inotify_read(uv_loop_t* loop,
|
|
|
2599 uv__io_t* dummy,
|
|
|
2600 unsigned int events) {
|
|
|
2601 const struct inotify_event* e;
|
|
|
2602 struct watcher_list* w;
|
|
|
2603 uv_fs_event_t* h;
|
|
|
2604 struct uv__queue queue;
|
|
|
2605 struct uv__queue* q;
|
|
|
2606 const char* path;
|
|
|
2607 ssize_t size;
|
|
|
2608 const char *p;
|
|
|
2609 /* needs to be large enough for sizeof(inotify_event) + strlen(path) */
|
|
|
2610 char buf[4096];
|
|
|
2611
|
|
|
2612 for (;;) {
|
|
|
2613 do
|
|
|
2614 size = read(loop->inotify_fd, buf, sizeof(buf));
|
|
|
2615 while (size == -1 && errno == EINTR);
|
|
|
2616
|
|
|
2617 if (size == -1) {
|
|
|
2618 assert(errno == EAGAIN || errno == EWOULDBLOCK);
|
|
|
2619 break;
|
|
|
2620 }
|
|
|
2621
|
|
|
2622 assert(size > 0); /* pre-2.6.21 thing, size=0 == read buffer too small */
|
|
|
2623
|
|
|
2624 /* Now we have one or more inotify_event structs. */
|
|
|
2625 for (p = buf; p < buf + size; p += sizeof(*e) + e->len) {
|
|
|
2626 e = (const struct inotify_event*) p;
|
|
|
2627
|
|
|
2628 events = 0;
|
|
|
2629 if (e->mask & (IN_ATTRIB|IN_MODIFY))
|
|
|
2630 events |= UV_CHANGE;
|
|
|
2631 if (e->mask & ~(IN_ATTRIB|IN_MODIFY))
|
|
|
2632 events |= UV_RENAME;
|
|
|
2633
|
|
|
2634 w = find_watcher(loop, e->wd);
|
|
|
2635 if (w == NULL)
|
|
|
2636 continue; /* Stale event, no watchers left. */
|
|
|
2637
|
|
|
2638 /* inotify does not return the filename when monitoring a single file
|
|
|
2639 * for modifications. Repurpose the filename for API compatibility.
|
|
|
2640 * I'm not convinced this is a good thing, maybe it should go.
|
|
|
2641 */
|
|
|
2642 path = e->len ? (const char*) (e + 1) : uv__basename_r(w->path);
|
|
|
2643
|
|
|
2644 /* We're about to iterate over the queue and call user's callbacks.
|
|
|
2645 * What can go wrong?
|
|
|
2646 * A callback could call uv_fs_event_stop()
|
|
|
2647 * and the queue can change under our feet.
|
|
|
2648 * So, we use uv__queue_move() trick to safely iterate over the queue.
|
|
|
2649 * And we don't free the watcher_list until we're done iterating.
|
|
|
2650 *
|
|
|
2651 * First,
|
|
|
2652 * tell uv_fs_event_stop() (that could be called from a user's callback)
|
|
|
2653 * not to free watcher_list.
|
|
|
2654 */
|
|
|
2655 w->iterating = 1;
|
|
|
2656 uv__queue_move(&w->watchers, &queue);
|
|
|
2657 while (!uv__queue_empty(&queue)) {
|
|
|
2658 q = uv__queue_head(&queue);
|
|
|
2659 h = uv__queue_data(q, uv_fs_event_t, watchers);
|
|
|
2660
|
|
|
2661 uv__queue_remove(q);
|
|
|
2662 uv__queue_insert_tail(&w->watchers, q);
|
|
|
2663
|
|
|
2664 h->cb(h, path, events, 0);
|
|
|
2665 }
|
|
|
2666 /* done iterating, time to (maybe) free empty watcher_list */
|
|
|
2667 w->iterating = 0;
|
|
|
2668 maybe_free_watcher_list(w, loop);
|
|
|
2669 }
|
|
|
2670 }
|
|
|
2671 }
|
|
|
2672
|
|
|
2673
|
|
|
2674 int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
|
|
|
2675 uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
|
|
|
2676 return 0;
|
|
|
2677 }
|
|
|
2678
|
|
|
2679
|
|
|
2680 int uv_fs_event_start(uv_fs_event_t* handle,
|
|
|
2681 uv_fs_event_cb cb,
|
|
|
2682 const char* path,
|
|
|
2683 unsigned int flags) {
|
|
|
2684 struct watcher_list* w;
|
|
|
2685 uv_loop_t* loop;
|
|
|
2686 size_t len;
|
|
|
2687 int events;
|
|
|
2688 int err;
|
|
|
2689 int wd;
|
|
|
2690
|
|
|
2691 if (uv__is_active(handle))
|
|
|
2692 return UV_EINVAL;
|
|
|
2693
|
|
|
2694 loop = handle->loop;
|
|
|
2695
|
|
|
2696 err = init_inotify(loop);
|
|
|
2697 if (err)
|
|
|
2698 return err;
|
|
|
2699
|
|
|
2700 events = IN_ATTRIB
|
|
|
2701 | IN_CREATE
|
|
|
2702 | IN_MODIFY
|
|
|
2703 | IN_DELETE
|
|
|
2704 | IN_DELETE_SELF
|
|
|
2705 | IN_MOVE_SELF
|
|
|
2706 | IN_MOVED_FROM
|
|
|
2707 | IN_MOVED_TO;
|
|
|
2708
|
|
|
2709 wd = inotify_add_watch(loop->inotify_fd, path, events);
|
|
|
2710 if (wd == -1)
|
|
|
2711 return UV__ERR(errno);
|
|
|
2712
|
|
|
2713 w = find_watcher(loop, wd);
|
|
|
2714 if (w)
|
|
|
2715 goto no_insert;
|
|
|
2716
|
|
|
2717 len = strlen(path) + 1;
|
|
|
2718 w = uv__malloc(sizeof(*w) + len);
|
|
|
2719 if (w == NULL)
|
|
|
2720 return UV_ENOMEM;
|
|
|
2721
|
|
|
2722 w->wd = wd;
|
|
|
2723 w->path = memcpy(w + 1, path, len);
|
|
|
2724 uv__queue_init(&w->watchers);
|
|
|
2725 w->iterating = 0;
|
|
|
2726 RB_INSERT(watcher_root, uv__inotify_watchers(loop), w);
|
|
|
2727
|
|
|
2728 no_insert:
|
|
|
2729 uv__handle_start(handle);
|
|
|
2730 uv__queue_insert_tail(&w->watchers, &handle->watchers);
|
|
|
2731 handle->path = w->path;
|
|
|
2732 handle->cb = cb;
|
|
|
2733 handle->wd = wd;
|
|
|
2734
|
|
|
2735 return 0;
|
|
|
2736 }
|
|
|
2737
|
|
|
2738
|
|
|
2739 int uv_fs_event_stop(uv_fs_event_t* handle) {
|
|
|
2740 struct watcher_list* w;
|
|
|
2741
|
|
|
2742 if (!uv__is_active(handle))
|
|
|
2743 return 0;
|
|
|
2744
|
|
|
2745 w = find_watcher(handle->loop, handle->wd);
|
|
|
2746 assert(w != NULL);
|
|
|
2747
|
|
|
2748 handle->wd = -1;
|
|
|
2749 handle->path = NULL;
|
|
|
2750 uv__handle_stop(handle);
|
|
|
2751 uv__queue_remove(&handle->watchers);
|
|
|
2752
|
|
|
2753 maybe_free_watcher_list(w, handle->loop);
|
|
|
2754
|
|
|
2755 return 0;
|
|
|
2756 }
|
|
|
2757
|
|
|
2758
|
|
|
2759 void uv__fs_event_close(uv_fs_event_t* handle) {
|
|
|
2760 uv_fs_event_stop(handle);
|
|
|
2761 }
|