diff --git a/examples/crc32.html b/examples/crc32.html
index 949dadf..e78505c 100644
--- a/examples/crc32.html
+++ b/examples/crc32.html
@@ -13,6 +13,45 @@ Note: This tests performs some timing comparison, please wait a few seconds for
Measurement log
AMD Ryzen 7 3700X 8-Core, Ubuntu 20.04, Linux 5.4.0-124-generic
+After optimizing fold over bytes by inlining __subscript_bytes__
+
+
+ | Test |
+ Interpreter |
+ Setup |
+ WebAssembly |
+ Javascript |
+
+
+ | Lynx * 65536 |
+ Chromium 104.0.5112.101 |
+ DevTools closed |
+ 5.70 |
+ 12.45 |
+
+
+ | Lynx * 65536 |
+ Firefox 103 |
+ DevTools closed |
+ 5.16 |
+ 5.72 |
+
+
+ | Lynx * 1048576 |
+ Chromium 104.0.5112.101 |
+ DevTools closed |
+ 95.65 |
+ 203.60 |
+
+
+ | Lynx * 1048576 |
+ Firefox 103 |
+ DevTools closed |
+ 83.34 |
+ 92.38 |
+
+
+Before optimizing fold over bytes by inlining __subscript_bytes__
| Test |
@@ -80,7 +119,7 @@ Note: This tests performs some timing comparison, please wait a few seconds for
-Notes:
+Notes
- Firefox seems faster than Chromium in my setup for Javascript, WebAssembly seems about the same.
- Having DevTools open in Chromium seems to slow down the WebAssembly by about 30%, but not when doing a recording of the page load.
- WebAssembly in Firefox seems to slow down when doing a recording of the page load, which makes sense, but the Javascript does not.
@@ -168,7 +207,9 @@ function run_test(app, str, str_repeat)
});
// Don't test speedup for small strings, it varies a lot
- let speedup = str.length < 16 ? 1 : (js_time == wasm_time ? 1 : js_time / wasm_time);
+ let speedup = (wasm_timing.min == 0 || js_timing.min == 0)
+ ? 1
+ : js_time / wasm_time;
test_result(check && 0.999 < speedup, { // At least as fast as Javascript
'summary': 'crc32(' + (str
@@ -197,7 +238,7 @@ WebAssembly.instantiateStreaming(fetch('crc32.wasm'), importObject)
run_test(app, "abcdefghijklmnopqrstuvwxyz");
run_test(app, "The quick brown fox jumps over the lazy dog");
run_test(app, "The quick brown fox jumps over the lazy dog", 1024);
- run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 1048576);
+ run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 65536);
});
diff --git a/phasm/compiler.py b/phasm/compiler.py
index 3b62065..4ba5d5a 100644
--- a/phasm/compiler.py
+++ b/phasm/compiler.py
@@ -368,13 +368,27 @@ def expression_fold(wgn: WasmGenerator, inp: ourlang.Fold) -> None:
wgn.local.get(len_var)
wgn.i32.lt_u()
with wgn.if_():
+ # From here on, adr_var is the address of byte we're referencing
+ # This is akin to calling stdlib_types.__subscript_bytes__
+ # But since we already know we are inside of bounds,
+ # can just bypass it and load the memory directly.
+ wgn.local.get(adr_var)
+ wgn.i32.const(3) # Bytes header -1, since we do a +1 every loop
+ wgn.i32.add()
+ wgn.local.set(adr_var)
+
wgn.add_statement('nop', comment='while True')
with wgn.loop():
wgn.add_statement('nop', comment='acu = func(acu, iter[i])')
wgn.local.get(acu_var)
+
+ # Get the next byte, write back the address
wgn.local.get(adr_var)
- wgn.local.get(idx_var)
- wgn.call(stdlib_types.__subscript_bytes__)
+ wgn.i32.const(1)
+ wgn.i32.add()
+ wgn.local.tee(adr_var)
+ wgn.i32.load8_u()
+
wgn.add_statement('call', f'${inp.func.name}')
wgn.local.set(acu_var)