Speed up foldl over bytes

Prior to this PR, the compiler would call stdlib_types.__subscript_bytes__ for every element. However, that function performs some checks we do not need here. After this PR, folding iterates directly over the bytes memory, saving the memory access checks and the function calls. This reduces CPU time by about 43% on Firefox. Also, by default, the CRC32 page now runs a shorter timing test.
2022-08-21 15:37:20 +02:00 · 2022-08-21 15:37:20 +02:00 · 75a63e490f
commit 75a63e490f
parent 2970093c8f
2 changed files with 60 additions and 5 deletions
--- a/examples/crc32.html
+++ b/examples/crc32.html
@ -13,6 +13,45 @@ Note: This tests performs some timing comparison, please wait a few seconds for

 <h2>Measurement log</h2>
 <h3>AMD Ryzen 7 3700X 8-Core, Ubuntu 20.04, Linux 5.4.0-124-generic</h3>
+<h4>After optimizing fold over bytes by inlining __subscript_bytes__</h4>
+<table>
+  <tr>
+    <td>Test</td>
+    <td>Interpreter</td>
+    <td>Setup</td>
+    <td>WebAssembly</td>
+    <td>Javascript</td>
+  </tr>
+  <tr>
+    <td>Lynx * 65536</td>
+    <td>Chromium 104.0.5112.101</td>
+    <td>DevTools closed</td>
+    <td>5.70</td>
+    <td>12.45</td>
+  </tr>
+  <tr>
+    <td>Lynx * 65536</td>
+    <td>Firefox 103</td>
+    <td>DevTools closed</td>
+    <td>5.16</td>
+    <td>5.72</td>
+  </tr>
+  <tr>
+    <td>Lynx * 1048576</td>
+    <td>Chromium 104.0.5112.101</td>
+    <td>DevTools closed</td>
+    <td>95.65</td>
+    <td>203.60</td>
+  </tr>
+  <tr>
+    <td>Lynx * 1048576</td>
+    <td>Firefox 103</td>
+    <td>DevTools closed</td>
+    <td>83.34</td>
+    <td>92.38</td>
+  </tr>
+</table>
+<h4>Before optimizing fold over bytes by inlining __subscript_bytes__</h4>
 <table>
  <tr>
    <td>Test</td>
@ -80,7 +119,7 @@ Note: This tests performs some timing comparison, please wait a few seconds for
  </tr>
 </table>

-Notes:<br />
+<h4>Notes</h4>
 - Firefox seems faster than Chromium in my setup for Javascript, WebAssembly seems about the same.<br />
 - Having DevTools open in Chromium seems to slow down the WebAssembly by about 30%, but not when doing a recording of the page load.<br />
 - WebAssembly in Firefox seems to slow down when doing a recording of the page load, which makes sense, but the Javascript does not.<br />
@ -168,7 +207,9 @@ function run_test(app, str, str_repeat)
  });

  // Don't test speedup for small strings, it varies a lot
-  let speedup = str.length < 16 ? 1 : (js_time == wasm_time ? 1 : js_time / wasm_time);
+  let speedup = (wasm_timing.min == 0 || js_timing.min == 0)
+    ? 1
+    : js_time / wasm_time;

  test_result(check && 0.999 < speedup, { // At least as fast as Javascript
    'summary': 'crc32(' + (str
@ -197,7 +238,7 @@ WebAssembly.instantiateStreaming(fetch('crc32.wasm'), importObject)
    run_test(app, "abcdefghijklmnopqrstuvwxyz");
    run_test(app, "The quick brown fox jumps over the lazy dog");
    run_test(app, "The quick brown fox jumps over the lazy dog", 1024);
-    run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 1048576);
+    run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 65536);
  });
 </script>

--- a/phasm/compiler.py
+++ b/phasm/compiler.py
@ -368,13 +368,27 @@ def expression_fold(wgn: WasmGenerator, inp: ourlang.Fold) -> None:
    wgn.local.get(len_var)
    wgn.i32.lt_u()
    with wgn.if_():
+        # From here on, adr_var is the address of the byte we're referencing
+        # This is akin to calling stdlib_types.__subscript_bytes__
+        # But since we already know we are within bounds,
+        # we can just bypass it and load the memory directly.
+        wgn.local.get(adr_var)
+        wgn.i32.const(3) # Bytes header -1, since we do a +1 every loop
+        wgn.i32.add()
+        wgn.local.set(adr_var)
+
        wgn.add_statement('nop', comment='while True')
        with wgn.loop():
            wgn.add_statement('nop', comment='acu = func(acu, iter[i])')
            wgn.local.get(acu_var)
+
+            # Get the next byte, write back the address
            wgn.local.get(adr_var)
-            wgn.local.get(idx_var)
-            wgn.call(stdlib_types.__subscript_bytes__)
+            wgn.i32.const(1)
+            wgn.i32.add()
+            wgn.local.tee(adr_var)
+            wgn.i32.load8_u()
+
            wgn.add_statement('call', f'${inp.func.name}')
            wgn.local.set(acu_var)