From efd3be9de1dc07ec743912f3c166bbf17dbb20f5 Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Mon, 8 Mar 2021 15:39:53 +0900 Subject: [PATCH] sd-event: re-check new epoll events when a child event is queued Previously, when a process outputs something and exit just after epoll_wait() but before process_child(), then the IO event is ignored even if the IO event has higher priority. See #18190. This can be solved by checking epoll event again after process_child(). However, there exists a possibility that another process outputs and exits just after process_child() but before the second epoll_wait(). When the IO event has lower priority than the child event, still IO event is processed. So, this makes new epoll events and child events are checked in a loop until no new event is detected. To prevent an infinite loop, the number of maximum trial is set to 10. Fixes #18190. --- src/libsystemd/sd-event/sd-event.c | 150 +++++++++++++++++++++++++++---------- 1 file changed, 110 insertions(+), 40 deletions(-) diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index 5d0e057..65db799 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -850,7 +850,7 @@ static int source_set_pending(sd_event_source *s, bool b) { } } - return 0; + return 1; } static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) { @@ -2502,12 +2502,20 @@ static int process_timer( return 0; } -static int process_child(sd_event *e) { +static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) { + int64_t min_priority = threshold; + bool something_new = false; sd_event_source *s; Iterator i; int r; assert(e); + assert(ret_min_priority); + + if (!e->need_process_child) { + *ret_min_priority = min_priority; + return 0; + } e->need_process_child = false; @@ -2532,6 +2540,9 @@ static int process_child(sd_event *e) { HASHMAP_FOREACH(s, e->child_sources, i) { assert(s->type == SOURCE_CHILD); + if (s->priority > threshold) + continue; + if (s->pending) continue; @@ -2560,19 +2571,24 @@ static int process_child(sd_event *e) { r = source_set_pending(s, true); if (r < 0) return r; + if (r > 0) { + something_new = true; + min_priority = MIN(min_priority, s->priority); + } } } - return 0; + *ret_min_priority = min_priority; + return something_new; } -static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) { - bool read_one = false; +static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) { int r; assert(e); assert(d); assert_return(events == EPOLLIN, -EIO); + assert(min_priority); /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make sure to recheck the @@ -2598,7 +2614,7 @@ static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) { n = read(d->fd, &si, sizeof(si)); if (n < 0) { if (IN_SET(errno, EAGAIN, EINTR)) - return read_one; + return 0; return -errno; } @@ -2608,8 +2624,6 @@ static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) { assert(SIGNAL_VALID(si.ssi_signo)); - read_one = true; - if (e->signal_sources) s = e->signal_sources[si.ssi_signo]; if (!s) @@ -2623,12 +2637,16 @@ static int process_signal(sd_event *e, struct signal_data *d, uint32_t events) { r = source_set_pending(s, true); if (r < 0) return r; + if (r > 0 && *min_priority >= s->priority) { + *min_priority = s->priority; + return 1; /* an event source with smaller priority is queued. */ + } - return 1; + return 0; } } -static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents) { +static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) { ssize_t n; assert(e); @@ -2644,6 +2662,9 @@ static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t if (d->buffer_filled > 0) return 0; + if (d->priority > threshold) + return 0; + n = read(d->fd, &d->buffer, sizeof(d->buffer)); if (n < 0) { if (IN_SET(errno, EAGAIN, EINTR)) @@ -3101,21 +3122,15 @@ pending: return r; } -_public_ int sd_event_wait(sd_event *e, uint64_t timeout) { +static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) { + int64_t min_priority = threshold; + bool something_new = false; struct epoll_event *ev_queue; unsigned ev_queue_max; int r, m, i; - assert_return(e, -EINVAL); - assert_return(e = event_resolve(e), -ENOPKG); - assert_return(!event_pid_changed(e), -ECHILD); - assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); - assert_return(e->state == SD_EVENT_ARMED, -EBUSY); - - if (e->exit_requested) { - e->state = SD_EVENT_PENDING; - return 1; - } + assert(e); + assert(ret_min_priority); ev_queue_max = MAX(e->n_sources, 1u); ev_queue = newa(struct epoll_event, ev_queue_max); @@ -3127,16 +3142,12 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) { m = epoll_wait(e->epoll_fd, ev_queue, ev_queue_max, timeout == (uint64_t) -1 ? -1 : (int) DIV_ROUND_UP(timeout, USEC_PER_MSEC)); if (m < 0) { - if (errno == EINTR) { - e->state = SD_EVENT_PENDING; - return 1; - } - - r = -errno; - goto finish; + return -errno; } - triple_timestamp_get(&e->timestamp); + /* Set timestamp only when this is called first time. */ + if (threshold == INT64_MAX) + triple_timestamp_get(&e->timestamp); for (i = 0; i < m; i++) { @@ -3147,9 +3158,18 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) { switch (*t) { - case WAKEUP_EVENT_SOURCE: - r = process_io(e, ev_queue[i].data.ptr, ev_queue[i].events); + case WAKEUP_EVENT_SOURCE: { + sd_event_source *s = ev_queue[i].data.ptr; + + if (s->priority > threshold) + continue; + + min_priority = MIN(min_priority, s->priority); + + r = process_io(e, s, ev_queue[i].events); + break; + } case WAKEUP_CLOCK_DATA: { struct clock_data *d = ev_queue[i].data.ptr; @@ -3158,11 +3178,12 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) { } case WAKEUP_SIGNAL_DATA: - r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events); + r = process_signal(e, ev_queue[i].data.ptr, ev_queue[i].events, &min_priority); break; case WAKEUP_INOTIFY_DATA: - r = event_inotify_data_read(e, ev_queue[i].data.ptr, ev_queue[i].events); + r = event_inotify_data_read(e, ev_queue[i].data.ptr, ev_queue[i].events, threshold); + break; default: @@ -3170,7 +3191,63 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) { } } if (r < 0) + return r; + if (r > 0) + something_new = true; + } + + *ret_min_priority = min_priority; + return something_new; +} + +_public_ int sd_event_wait(sd_event *e, uint64_t timeout) { + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(!event_pid_changed(e), -ECHILD); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(e->state == SD_EVENT_ARMED, -EBUSY); + + if (e->exit_requested) { + e->state = SD_EVENT_PENDING; + return 1; + } + + for (int64_t threshold = INT64_MAX; ; threshold--) { + int64_t epoll_min_priority, child_min_priority; + + /* There may be a possibility that new epoll (especially IO) and child events are + * triggered just after process_epoll() call but before process_child(), and the new IO + * events may have higher priority than the child events. To salvage these events, + * let's call epoll_wait() again, but accepts only events with higher priority than the + * previous. See issue https://github.com/systemd/systemd/issues/18190 and comments + * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085 + * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */ + + r = process_epoll(e, timeout, threshold, &epoll_min_priority); + if (r == -EINTR) { + e->state = SD_EVENT_PENDING; + return 1; + } + if (r < 0) goto finish; + if (r == 0 && threshold < INT64_MAX) + /* No new epoll event. */ + break; + + r = process_child(e, threshold, &child_min_priority); + if (r < 0) + goto finish; + if (r == 0) + /* No new child event. */ + break; + + threshold = MIN(epoll_min_priority, child_min_priority); + if (threshold == INT64_MIN) + break; + + timeout = 0; } r = process_watchdog(e); @@ -3197,19 +3274,12 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) { if (r < 0) goto finish; - if (e->need_process_child) { - r = process_child(e); - if (r < 0) - goto finish; - } - r = process_inotify(e); if (r < 0) goto finish; if (event_next_pending(e)) { e->state = SD_EVENT_PENDING; - return 1; } -- 1.8.3.1