Move emscripten_futex_wait_non_blocking to native code. NFC (#16145)

sbc100 · web-flow · commit 6754be32129e · 2022-01-31T12:00:12.000-08:00
This should be a pure transliteration of the JS function.

Good to see that the native code cost here is only ~100 bytes, while the JS savings are ~800 bytes.
diff --git a/emcc.py b/emcc.py
@@ -2028,7 +2028,6 @@ def default_setting(name, new_default):
     # Functions needs to be exported from the module since they are used in worker.js
     settings.REQUIRED_EXPORTS += [
       'emscripten_dispatch_to_thread_',
-      '_emscripten_main_thread_futex',
       '_emscripten_thread_free_data',
       '_emscripten_allow_main_runtime_queued_calls',
       'emscripten_main_browser_thread_id',
diff --git a/src/library_pthread.js b/src/library_pthread.js
@@ -567,8 +567,6 @@ var LibraryPThread = {
     );
 #if ASSERTIONS
     PThread.mainRuntimeThread = true;
-    // Verify that this native symbol used by futex_wait/wake is exported correctly.
-    assert(__emscripten_main_thread_futex > 0);
 #endif
     PThread.threadInit();
   },
@@ -783,115 +781,6 @@ var LibraryPThread = {
     return 0;
   },
 
-  // Returns 0 on success, or one of the values -ETIMEDOUT, -EWOULDBLOCK or -EINVAL on error.
-  _emscripten_futex_wait_non_blocking__deps: ['emscripten_main_thread_process_queued_calls'],
-  _emscripten_futex_wait_non_blocking: function(addr, val, timeout) {
-#if ASSERTIONS
-    // Should only be called from the main web thread where atomics.wait is not allowed.
-    assert(ENVIRONMENT_IS_WEB);
-#endif
-
-    // Atomics.wait is not available in the main browser thread, so simulate it via busy spinning.
-    var tNow = performance.now();
-    var tEnd = tNow + timeout;
-
-    // Register globally which address the main thread is simulating to be
-    // waiting on. When zero, the main thread is not waiting on anything, and on
-    // nonzero, the contents of the address pointed by __emscripten_main_thread_futex
-    // tell which address the main thread is simulating its wait on.
-    // We need to be careful of recursion here: If we wait on a futex, and
-    // then call _emscripten_main_thread_process_queued_calls() below, that
-    // will call code that takes the proxying mutex - which can once more
-    // reach this code in a nested call. To avoid interference between the
-    // two (there is just a single __emscripten_main_thread_futex at a time), unmark
-    // ourselves before calling the potentially-recursive call. See below for
-    // how we handle the case of our futex being notified during the time in
-    // between when we are not set as the value of __emscripten_main_thread_futex.
-#if ASSERTIONS
-    assert(__emscripten_main_thread_futex > 0);
-#endif
-    var lastAddr = Atomics.exchange(HEAP32, __emscripten_main_thread_futex >> 2, addr);
-#if ASSERTIONS
-    // We must not have already been waiting.
-    assert(lastAddr == 0);
-#endif
-
-    while (1) {
-      // Check for a timeout.
-      tNow = performance.now();
-      if (tNow > tEnd) {
-        // We timed out, so stop marking ourselves as waiting.
-        lastAddr = Atomics.exchange(HEAP32, __emscripten_main_thread_futex >> 2, 0);
-#if ASSERTIONS
-        // The current value must have been our address which we set, or
-        // in a race it was set to 0 which means another thread just allowed
-        // us to run, but (tragically) that happened just a bit too late.
-        assert(lastAddr == addr || lastAddr == 0);
-#endif
-        return -{{{ cDefine('ETIMEDOUT') }}};
-      }
-      // We are performing a blocking loop here, so we must handle proxied
-      // events from pthreads, to avoid deadlocks.
-      // Note that we have to do so carefully, as we may take a lock while
-      // doing so, which can recurse into this function; stop marking
-      // ourselves as waiting while we do so.
-      lastAddr = Atomics.exchange(HEAP32, __emscripten_main_thread_futex >> 2, 0);
-#if ASSERTIONS
-      assert(lastAddr == addr || lastAddr == 0);
-#endif
-      if (lastAddr == 0) {
-        // We were told to stop waiting, so stop.
-        break;
-      }
-      _emscripten_main_thread_process_queued_calls();
-
-      // Check the value, as if we were starting the futex all over again.
-      // This handles the following case:
-      //
-      //  * wait on futex A
-      //  * recurse into emscripten_main_thread_process_queued_calls(),
-      //    which waits on futex B. that sets the __emscripten_main_thread_futex address to
-      //    futex B, and there is no longer any mention of futex A.
-      //  * a worker is done with futex A. it checks __emscripten_main_thread_futex but does
-      //    not see A, so it does nothing special for the main thread.
-      //  * a worker is done with futex B. it flips mainThreadMutex from B
-      //    to 0, ending the wait on futex B.
-      //  * we return to the wait on futex A. __emscripten_main_thread_futex is 0, but that
-      //    is because of futex B being done - we can't tell from
-      //    __emscripten_main_thread_futex whether A is done or not. therefore, check the
-      //    memory value of the futex.
-      //
-      // That case motivates the design here. Given that, checking the memory
-      // address is also necessary for other reasons: we unset and re-set our
-      // address in __emscripten_main_thread_futex around calls to
-      // emscripten_main_thread_process_queued_calls(), and a worker could
-      // attempt to wake us up right before/after such times.
-      //
-      // Note that checking the memory value of the futex is valid to do: we
-      // could easily have been delayed (relative to the worker holding on
-      // to futex A), which means we could be starting all of our work at the
-      // later time when there is no need to block. The only "odd" thing is
-      // that we may have caused side effects in that "delay" time. But the
-      // only side effects we can have are to call
-      // emscripten_main_thread_process_queued_calls(). That is always ok to
-      // do on the main thread (it's why it is ok for us to call it in the
-      // middle of this function, and elsewhere). So if we check the value
-      // here and return, it's the same is if what happened on the main thread
-      // was the same as calling emscripten_main_thread_process_queued_calls()
-      // a few times times before calling emscripten_futex_wait().
-      if (Atomics.load(HEAP32, addr >> 2) != val) {
-        return -{{{ cDefine('EWOULDBLOCK') }}};
-      }
-
-      // Mark us as waiting once more, and continue the loop.
-      lastAddr = Atomics.exchange(HEAP32, __emscripten_main_thread_futex >> 2, addr);
-#if ASSERTIONS
-      assert(lastAddr == 0);
-#endif
-    }
-    return 0;
-  },
-
   __call_main__deps: ['exit', '$exitOnMainThread'],
   __call_main: function(argc, argv) {
     var returnCode = {{{ exportedAsmFunc('_main') }}}(argc, argv);
diff --git a/system/lib/pthread/emscripten_futex_wait.c b/system/lib/pthread/emscripten_futex_wait.c
@@ -8,10 +8,103 @@
 #include <errno.h>
 #include <math.h>
 #include <emscripten/threading.h>
+#include "atomic.h"
 #include "threading_internal.h"
 
+extern void* _emscripten_main_thread_futex;
+
 int _emscripten_thread_supports_atomics_wait(void);
-int _emscripten_futex_wait_non_blocking(volatile void *addr, uint32_t val, double max_wait_ms);
+
+// Busy-wait emulation of a futex wait, used by emscripten_futex_wait() when
+// the calling thread cannot use the atomic wait instruction. Spins until
+// _emscripten_main_thread_futex is found cleared (the wake signal), the
+// 32-bit value at `addr` no longer equals `val`, or `timeout` milliseconds
+// have elapsed, processing queued proxied calls on every iteration.
+// Returns 0 when woken, -ETIMEDOUT on timeout, or -EWOULDBLOCK when the value
+// at `addr` differs from `val`.
+static int futex_wait_busy(volatile void *addr, uint32_t val, double timeout) {
+  // Atomics.wait is not available in the main browser thread, so simulate it via busy spinning.
+  double now = emscripten_get_now();
+  double end = now + timeout;
+
+  // Register globally which address the main thread is simulating to be
+  // waiting on. When _emscripten_main_thread_futex is zero, the main thread is
+  // not waiting on anything; when nonzero, it holds the address the main
+  // thread is simulating its wait on.
+  // We need to be careful of recursion here: If we wait on a futex, and
+  // then call emscripten_main_thread_process_queued_calls() below, that
+  // will call code that takes the proxying mutex - which can once more
+  // reach this code in a nested call. To avoid interference between the
+  // two (there is just a single _emscripten_main_thread_futex at a time), unmark
+  // ourselves before calling the potentially-recursive call. See below for
+  // how we handle the case of our futex being notified during the time in
+  // between when we are not set as the value of _emscripten_main_thread_futex.
+  void* last_addr = a_cas_p(&_emscripten_main_thread_futex, 0, (void*)addr);
+  // We must not have already been waiting.
+  assert(last_addr == 0);
+
+  while (1) {
+    // Check for a timeout.
+    now = emscripten_get_now();
+    if (now > end) {
+      // We timed out, so stop marking ourselves as waiting.
+      last_addr = a_cas_p(&_emscripten_main_thread_futex, (void*)addr, 0);
+      // The current value must have been our address which we set, or
+      // in a race it was set to 0 which means another thread just allowed
+      // us to run, but (tragically) that happened just a bit too late.
+      assert(last_addr == addr || last_addr == 0);
+      return -ETIMEDOUT;
+    }
+    // We are performing a blocking loop here, so we must handle proxied
+    // events from pthreads, to avoid deadlocks.
+    // Note that we have to do so carefully, as we may take a lock while
+    // doing so, which can recurse into this function; stop marking
+    // ourselves as waiting while we do so.
+    last_addr = a_cas_p(&_emscripten_main_thread_futex, (void*)addr, 0);
+    assert(last_addr == addr || last_addr == 0);
+    if (last_addr == 0) {
+      // We were told to stop waiting, so stop.
+      break;
+    }
+    emscripten_main_thread_process_queued_calls();
+
+    // Check the value, as if we were starting the futex all over again.
+    // This handles the following case:
+    //  * wait on futex A
+    //  * recurse into emscripten_main_thread_process_queued_calls(), which
+    //    waits on futex B. that sets _emscripten_main_thread_futex to B, and
+    //    there is no longer any mention of futex A.
+    //  * a worker is done with futex A. it does not see A in
+    //    _emscripten_main_thread_futex, so it does nothing special for us.
+    //  * a worker is done with futex B. it flips
+    //    _emscripten_main_thread_futex from B to 0, ending the wait on B.
+    //  * we return to the wait on futex A. _emscripten_main_thread_futex is
+    //    0, but only because futex B is done - it cannot tell us whether A
+    //    is done or not. therefore, check the memory value of the futex.
+    //
+    // The check is also needed because we unset and re-set our address in
+    // _emscripten_main_thread_futex around calls to
+    // emscripten_main_thread_process_queued_calls(), and a worker could
+    // attempt to wake us up right before/after such times.
+    //
+    // Checking the memory value is valid: we could simply have been delayed
+    // and be starting our wait at a later time when there is no need to
+    // block. The only side effect of that "delay" is calling
+    // emscripten_main_thread_process_queued_calls(), which is always ok to
+    // do on the main thread, so returning here is the same as if we had
+    // called it a few times before calling emscripten_futex_wait().
+    // The futex word is 32 bits wide (`val` is a uint32_t), so load exactly
+    // 32 bits regardless of the target's pointer width.
+    if (__c11_atomic_load((_Atomic uint32_t*)addr, __ATOMIC_SEQ_CST) != val) {
+      return -EWOULDBLOCK;
+    }
+
+    // Mark us as waiting once more, and continue the loop.
+    last_addr = a_cas_p(&_emscripten_main_thread_futex, 0, (void*)addr);
+    assert(last_addr == 0);
+  }
+  return 0;
+}
 
 int emscripten_futex_wait(volatile void *addr, uint32_t val, double max_wait_ms) {
   if ((((intptr_t)addr)&3) != 0) {
@@ -25,7 +118,7 @@ int emscripten_futex_wait(volatile void *addr, uint32_t val, double max_wait_ms)
   // __builtin_wasm_memory_atomic_wait32 so we call out the JS function that
   // will busy wait.
   if (!_emscripten_thread_supports_atomics_wait()) {
-    ret = _emscripten_futex_wait_non_blocking(addr, val, max_wait_ms);
+    ret = futex_wait_busy(addr, val, max_wait_ms);
     emscripten_conditional_set_current_thread_status(EM_THREAD_STATUS_WAITFUTEX, EM_THREAD_STATUS_RUNNING);
     return ret;
   }
diff --git a/tests/other/metadce/minimal_main_Oz_USE_PTHREADS_PROXY_TO_PTHREAD.exports b/tests/other/metadce/minimal_main_Oz_USE_PTHREADS_PROXY_TO_PTHREAD.exports
@@ -7,7 +7,7 @@ F
 G
 H
 I
-J
+o
 p
 q
 r
diff --git a/tests/other/metadce/minimal_main_Oz_USE_PTHREADS_PROXY_TO_PTHREAD.funcs b/tests/other/metadce/minimal_main_Oz_USE_PTHREADS_PROXY_TO_PTHREAD.funcs
@@ -21,6 +21,7 @@ $_emscripten_thread_free_data
 $_emscripten_thread_init
 $_main_thread
 $a_cas
+$a_cas_p.1
 $a_dec
 $a_fetch_add.1
 $a_inc
diff --git a/tests/other/metadce/minimal_main_Oz_USE_PTHREADS_PROXY_TO_PTHREAD.imports b/tests/other/metadce/minimal_main_Oz_USE_PTHREADS_PROXY_TO_PTHREAD.imports
@@ -12,4 +12,3 @@ a.k
 a.l
 a.m
 a.n
-a.o
diff --git a/tests/other/metadce/minimal_main_Oz_USE_PTHREADS_PROXY_TO_PTHREAD.jssize b/tests/other/metadce/minimal_main_Oz_USE_PTHREADS_PROXY_TO_PTHREAD.jssize
@@ -1 +1 @@
-48044
+47209
diff --git a/tests/other/metadce/minimal_main_Oz_USE_PTHREADS_PROXY_TO_PTHREAD.sent b/tests/other/metadce/minimal_main_Oz_USE_PTHREADS_PROXY_TO_PTHREAD.sent
@@ -12,4 +12,3 @@ k
 l
 m
 n
-o
diff --git a/tests/other/metadce/minimal_main_Oz_USE_PTHREADS_PROXY_TO_PTHREAD.size b/tests/other/metadce/minimal_main_Oz_USE_PTHREADS_PROXY_TO_PTHREAD.size
@@ -1 +1 @@
-17404
+17498

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ F`
`7`	`7`	`G`
`8`	`8`	`H`
`9`	`9`	`I`
`10`		`-J`
	`10`	`+o`
`11`	`11`	`p`
`12`	`12`	`q`
`13`	`13`	`r`
-Original file line number
+Diff line change
 a.l
 a.m
 a.n
 -a.o
Original file line number	Diff line number	Diff line change
`@@ -12,4 +12,3 @@ k`
`12`	`12`	`l`
`13`	`13`	`m`
`14`	`14`	`n`
`15`		`-o`