Merge pull request 'Speedup foldl over bytes' (#2) from optimize_fold_bytes_inline_subscript_bytes_call into master

Reviewed-on: #2
2022-08-21 13:39:44 +00:00 · 2022-08-21 13:39:44 +00:00 · c02afb05f4
commit c02afb05f4
parent 2970093c8f 5ad5a9c064
2 changed files with 60 additions and 5 deletions
--- a/examples/crc32.html
+++ b/examples/crc32.html
@ -13,6 +13,45 @@ Note: This tests performs some timing comparison, please wait a few seconds for

 <h2>Measurement log</h2>
 <h3>AMD Ryzen 7 3700X 8-Core, Ubuntu 20.04, Linux 5.4.0-124-generic</h3>
+<h4>After optimizing fold over bytes by inlining __subscript_bytes__</h4>
+<table>
+  <tr>
+    <td>Test</td>
+    <td>Interpreter</td>
+    <td>Setup</td>
+    <td>WebAssembly</td>
+    <td>Javascript</td>
+  </tr>
+  <tr>
+    <td>Lynx * 65536</td>
+    <td>Chromium 104.0.5112.101</td>
+    <td>DevTools closed</td>
+    <td>5.70</td>
+    <td>12.45</td>
+  </tr>
+  <tr>
+    <td>Lynx * 65536</td>
+    <td>Firefox 103</td>
+    <td>DevTools closed</td>
+    <td>5.16</td>
+    <td>5.72</td>
+  </tr>
+  <tr>
+    <td>Lynx * 1048576</td>
+    <td>Chromium 104.0.5112.101</td>
+    <td>DevTools closed</td>
+    <td>95.65</td>
+    <td>203.60</td>
+  </tr>
+  <tr>
+    <td>Lynx * 1048576</td>
+    <td>Firefox 103</td>
+    <td>DevTools closed</td>
+    <td>83.34</td>
+    <td>92.38</td>
+  </tr>
+</table>
+<h4>Before optimizing fold over bytes by inlining __subscript_bytes__</h4>
 <table>
  <tr>
    <td>Test</td>
@ -80,7 +119,7 @@ Note: This tests performs some timing comparison, please wait a few seconds for
  </tr>
 </table>

-Notes:<br />
+<h4>Notes</h4>
 - Firefox seems faster than Chromium in my setup for Javascript, WebAssembly seems about the same.<br />
 - Having DevTools open in Chromium seems to slow down the WebAssembly by about 30%, but not when doing a recording of the page load.<br />
 - WebAssembly in Firefox seems to slow down when doing a recording of the page load, which makes sense, but the Javascript does not.<br />
@ -168,7 +207,9 @@ function run_test(app, str, str_repeat)
  });

  // Don't test speedup for small strings, it varies a lot
-  let speedup = str.length < 16 ? 1 : (js_time == wasm_time ? 1 : js_time / wasm_time);
+  let speedup = (wasm_timing.min == 0 || js_timing.min == 0)
+    ? 1
+    : js_time / wasm_time;

  test_result(check && 0.999 < speedup, { // At least as fast as Javascript
    'summary': 'crc32(' + (str
@ -197,7 +238,7 @@ WebAssembly.instantiateStreaming(fetch('crc32.wasm'), importObject)
    run_test(app, "abcdefghijklmnopqrstuvwxyz");
    run_test(app, "The quick brown fox jumps over the lazy dog");
    run_test(app, "The quick brown fox jumps over the lazy dog", 1024);
-    run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 1048576);
+    run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 65536);
  });
 </script>

--- a/phasm/compiler.py
+++ b/phasm/compiler.py
@ -368,13 +368,27 @@ def expression_fold(wgn: WasmGenerator, inp: ourlang.Fold) -> None:
    wgn.local.get(len_var)
    wgn.i32.lt_u()
    with wgn.if_():
+        # From here on, adr_var is the address of the byte we're referencing.
+        # This is akin to calling stdlib_types.__subscript_bytes__,
+        # but since we already know we are within bounds, we
+        # can bypass it and load the memory directly.
+        wgn.local.get(adr_var)
+        wgn.i32.const(3) # Bytes header -1, since we do a +1 every loop
+        wgn.i32.add()
+        wgn.local.set(adr_var)
+
        wgn.add_statement('nop', comment='while True')
        with wgn.loop():
            wgn.add_statement('nop', comment='acu = func(acu, iter[i])')
            wgn.local.get(acu_var)
+
+            # Get the next byte, write back the address
            wgn.local.get(adr_var)
-            wgn.local.get(idx_var)
-            wgn.call(stdlib_types.__subscript_bytes__)
+            wgn.i32.const(1)
+            wgn.i32.add()
+            wgn.local.tee(adr_var)
+            wgn.i32.load8_u()
+
            wgn.add_statement('call', f'${inp.func.name}')
            wgn.local.set(acu_var)