From 5ad5a9c064687e7c8690d49783873c9948629184 Mon Sep 17 00:00:00 2001
From: "Johan B.W. de Vries" <info@jbwdevries.nl>
Date: Sun, 21 Aug 2022 15:37:20 +0200
Subject: [PATCH] Speedup foldl over bytes

Prior to this PR, the compiler would call
stdlib_types.__subscript_bytes__.

However, that function performs some checks we do not need.

After this PR, folding iterates directly over the bytes
memory, saving the memory access checks and the function
calls. This reduces CPU time by about 43% on Firefox.

Also, by default, the CRC32 page runs a shorter timing test.
---
 examples/crc32.html | 47 ++++++++++++++++++++++++++++++++++++++++++---
 phasm/compiler.py   | 18 +++++++++++++++--
 2 files changed, 60 insertions(+), 5 deletions(-)
diff --git a/examples/crc32.html b/examples/crc32.html
index 949dadf..e78505c 100644
--- a/examples/crc32.html
+++ b/examples/crc32.html
@@ -13,6 +13,45 @@ Note: This tests performs some timing comparison, please wait a few seconds for
 
 <h2>Measurement log</h2>
 <h3>AMD Ryzen 7 3700X 8-Core, Ubuntu 20.04, Linux 5.4.0-124-generic</h3>
+<h4>After optimizing fold over bytes by inlining __subscript_bytes__</h4>
+<table>
+  <tr>
+    <td>Test</td>
+    <td>Interpreter</td>
+    <td>Setup</td>
+    <td>WebAssembly</td>
+    <td>Javascript</td>
+  </tr>
+  <tr>
+    <td>Lynx * 65536</td>
+    <td>Chromium 104.0.5112.101</td>
+    <td>DevTools closed</td>
+    <td>5.70</td>
+    <td>12.45</td>
+  </tr>
+  <tr>
+    <td>Lynx * 65536</td>
+    <td>Firefox 103</td>
+    <td>DevTools closed</td>
+    <td>5.16</td>
+    <td>5.72</td>
+  </tr>
+  <tr>
+    <td>Lynx * 1048576</td>
+    <td>Chromium 104.0.5112.101</td>
+    <td>DevTools closed</td>
+    <td>95.65</td>
+    <td>203.60</td>
+  </tr>
+  <tr>
+    <td>Lynx * 1048576</td>
+    <td>Firefox 103</td>
+    <td>DevTools closed</td>
+    <td>83.34</td>
+    <td>92.38</td>
+  </tr>
+</table>
+<h4>Before optimizing fold over bytes by inlining __subscript_bytes__</h4>
 <table>
   <tr>
     <td>Test</td>
@@ -80,7 +119,7 @@ Note: This tests performs some timing comparison, please wait a few seconds for
   </tr>
 </table>
 
-Notes:<br />
+<h4>Notes</h4>
 - Firefox seems faster than Chromium in my setup for Javascript, WebAssembly seems about the same.<br />
 - Having DevTools open in Chromium seems to slow down the WebAssembly by about 30%, but not when doing a recording of the page load.<br />
 - WebAssembly in Firefox seems to slow down when doing a recording of the page load, which makes sense, but the Javascript does not.<br />
@@ -168,7 +207,9 @@ function run_test(app, str, str_repeat)
   });
 
   // Don't test speedup for small strings, it varies a lot
-  let speedup = str.length < 16 ? 1 : (js_time == wasm_time ? 1 : js_time / wasm_time);
+  let speedup = (wasm_timing.min == 0 || js_timing.min == 0)
+    ? 1
+    : js_time / wasm_time;
 
   test_result(check && 0.999 < speedup, { // At least as fast as Javascript
     'summary': 'crc32(' + (str
@@ -197,7 +238,7 @@ WebAssembly.instantiateStreaming(fetch('crc32.wasm'), importObject)
     run_test(app, "abcdefghijklmnopqrstuvwxyz");
     run_test(app, "The quick brown fox jumps over the lazy dog");
     run_test(app, "The quick brown fox jumps over the lazy dog", 1024);
-    run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 1048576);
+    run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 65536);
   });
 </script>
 
diff --git a/phasm/compiler.py b/phasm/compiler.py
index 3b62065..4ba5d5a 100644
--- a/phasm/compiler.py
+++ b/phasm/compiler.py
@@ -368,13 +368,27 @@ def expression_fold(wgn: WasmGenerator, inp: ourlang.Fold) -> None:
     wgn.local.get(len_var)
     wgn.i32.lt_u()
     with wgn.if_():
+        # From here on, adr_var is the address of the byte we're referencing.
+        # This is akin to calling stdlib_types.__subscript_bytes__,
+        # but since we already know we are within bounds, we
+        # can just bypass it and load the memory directly.
+        wgn.local.get(adr_var)
+        wgn.i32.const(3) # Bytes header -1, since we do a +1 every loop
+        wgn.i32.add()
+        wgn.local.set(adr_var)
+
         wgn.add_statement('nop', comment='while True')
         with wgn.loop():
             wgn.add_statement('nop', comment='acu = func(acu, iter[i])')
             wgn.local.get(acu_var)
+
+            # Get the next byte, write back the address
             wgn.local.get(adr_var)
-            wgn.local.get(idx_var)
-            wgn.call(stdlib_types.__subscript_bytes__)
+            wgn.i32.const(1)
+            wgn.i32.add()
+            wgn.local.tee(adr_var)
+            wgn.i32.load8_u()
+
             wgn.add_statement('call', f'${inp.func.name}')
             wgn.local.set(acu_var)
 

Test	Interpreter	Setup	WebAssembly	Javascript
Lynx * 65536	Chromium 104.0.5112.101	DevTools closed	5.70	12.45
Lynx * 65536	Firefox 103	DevTools closed	5.16	5.72
Lynx * 1048576	Chromium 104.0.5112.101	DevTools closed	95.65	203.60
Lynx * 1048576	Firefox 103	DevTools closed	83.34	92.38