Tuesday, December 24, 2013

libxenstore bug

Opensource is perfect till you hit the critical bug. Then you are doomed. Ok, just joking. There is a community, in other words there are a lot of doomed people like you and bugs like this.
An example. There was (and probably is) a bug in Xen qemu-dm. It can just hang forever due to some circumstances. Some is a key word here. First time I've seen this in Xen Opensource 3.4.2. Taking into account that qemu-dm is responsible for all IO of some VM, this VM hangs too.

The problem was in libxenstore that called pthread_cond_wait():
#define condvar_wait(c,m,hnd)   pthread_cond_wait(c,m)

mutex_lock(&h->watch_mutex);

/* Wait on the condition variable for a watch to fire. */
while (list_empty(&h->watch_list))
        condvar_wait(&h->watch_condvar, &h->watch_mutex, h);
msg = list_top(&h->watch_list, struct xs_stored_msg, list);
list_del(&msg->list);

/* Clear the pipe token if there are no more pending watches. */
if (list_empty(&h->watch_list) && (h->watch_pipe[0] != -1))
        while (read(h->watch_pipe[0], &c, 1) != 1)
                continue;
watch_pipe is used for notifications (for select) about events in watch_list. Separate thread is writing to this pipe during element addition to this list in method read_message():
#define condvar_signal(c)       pthread_cond_signal(c)

mutex_lock(&h->watch_mutex);

/* Kick users out of their select() loop. */
if (list_empty(&h->watch_list) &&
    (h->watch_pipe[1] != -1))
        while (write(h->watch_pipe[1], body, 1) != 1)
                continue;

list_add_tail(&msg->list, &h->watch_list);

condvar_signal(&h->watch_condvar);

mutex_unlock(&h->watch_mutex);
It looks ok but it is not. Sometimes select() reports a read readiness for watch_pipe[0] when watch_list is empty and pthread_cond_wait() hangs indefinitely. This is a big mistake. First of all, kernel can mistakenly give you false positives, second, even if your pipe contains something to read before you actually read from list some another thread can modify this list.
Having said that the patch is obvious
diff --git a/xen-3.4.2/tools/ioemu-qemu-xen/xenstore.c b/xen-3.4.2/tools/ioemu-qemu-xen/xenstore.c
index 11b305d..894e9a5 100644
--- a/xen-3.4.2/tools/ioemu-qemu-xen/xenstore.c
+++ b/xen-3.4.2/tools/ioemu-qemu-xen/xenstore.c
@@ -953,7 +953,7 @@ void xenstore_process_event(void *opaque)
     char **vec, *offset, *bpath = NULL, *buf = NULL, *drv = NULL, *image = NULL;
     unsigned int len, num, hd_index;
 
-    vec = xs_read_watch(xsh, &num);
+    vec = xs_read_watch_noblock(xsh, &num);
     if (!vec)
         return;
 
diff --git a/xen-3.4.2/tools/xenstore/xs.c b/xen-3.4.2/tools/xenstore/xs.c
index 9707d19..9ce5be6 100644
--- a/xen-3.4.2/tools/xenstore/xs.c
+++ b/xen-3.4.2/tools/xenstore/xs.c
@@ -611,11 +611,11 @@ bool xs_watch(struct xs_handle *h, const char *path, const char *token)
                 ARRAY_SIZE(iov), NULL));
 }
 
-/* Find out what node change was on (will block if nothing pending).
+/* Find out what node change was on.
  * Returns array of two pointers: path and token, or NULL.
  * Call free() after use.
  */
-char **xs_read_watch(struct xs_handle *h, unsigned int *num)
+static char **xs_read_watch_(struct xs_handle *h, unsigned int *num, bool block)
 {
     struct xs_stored_msg *msg;
     char **ret, *strings, c = 0;
@@ -623,9 +623,19 @@ char **xs_read_watch(struct xs_handle *h, unsigned int *num)
 
     mutex_lock(&h->watch_mutex);
 
-    /* Wait on the condition variable for a watch to fire. */
-    while (list_empty(&h->watch_list))
-        condvar_wait(&h->watch_condvar, &h->watch_mutex, h);
+    if (block) {
+        /* Wait on the condition variable for a watch to fire. */
+        while (list_empty(&h->watch_list))
+            condvar_wait(&h->watch_condvar, &h->watch_mutex, h);
+    }
+    else if (list_empty(&h->watch_list)) {
+        /* Clear the pipe token if there are no more pending watches. */
+        if (list_empty(&h->watch_list) && (h->watch_pipe[0] != -1))
+            while (read(h->watch_pipe[0], &c, 1) != 1)
+                continue;
+        mutex_unlock(&h->watch_mutex);
+        return NULL;
+    }
     msg = list_top(&h->watch_list, struct xs_stored_msg, list);
     list_del(&msg->list);
 
@@ -662,6 +672,15 @@ char **xs_read_watch(struct xs_handle *h, unsigned int *num)
     return ret;
 }
 
+char **xs_read_watch(struct xs_handle *h, unsigned int *num)
+{
+    return xs_read_watch_(h, num, true);
+}
+
+char **xs_read_watch_noblock(struct xs_handle *h, unsigned int *num)
+{
+    return xs_read_watch_(h, num, false);
+}
 /* Remove a watch on a node.
  * Returns false on failure (no watch on that node).
  */
diff --git a/xen-3.4.2/tools/xenstore/xs.h b/xen-3.4.2/tools/xenstore/xs.h
index bd36a0b..a921a40 100644
--- a/xen-3.4.2/tools/xenstore/xs.h
+++ b/xen-3.4.2/tools/xenstore/xs.h
@@ -109,6 +109,7 @@ int xs_fileno(struct xs_handle *h);
  * elements. Call free() after use.
  */
 char **xs_read_watch(struct xs_handle *h, unsigned int *num);
+char **xs_read_watch_noblock(struct xs_handle *h, unsigned int *num);
 
 /* Remove a watch on a node: implicitly acks any outstanding watch.
  * Returns false on failure (no watch on that node).

No comments:

Post a Comment