From 5ad5a9c064687e7c8690d49783873c9948629184 Mon Sep 17 00:00:00 2001 From: "Johan B.W. de Vries" Date: Sun, 21 Aug 2022 15:37:20 +0200 Subject: [PATCH] Speedup foldl over bytes Prior to this PR, the compiler would call stdlib.types's __subscript_bytes__. However, that function performs some checks we do not need. After this PR, folding iterates directly over the bytes memory, saving the memory access checks and the function calls. This reduces the CPU time used on Firefox by about 43%. Also, by default, the CRC32 page runs a shorter timing test. --- examples/crc32.html | 47 ++++++++++++++++++++++++++++++++++++++++++--- phasm/compiler.py | 18 +++++++++++++++-- 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/examples/crc32.html b/examples/crc32.html index 949dadf..e78505c 100644 --- a/examples/crc32.html +++ b/examples/crc32.html @@ -13,6 +13,45 @@ Note: This tests performs some timing comparison, please wait a few seconds for

Measurement log

AMD Ryzen 7 3700X 8-Core, Ubuntu 20.04, Linux 5.4.0-124-generic

+

After optimizing fold over bytes by inlining __subscript_bytes__

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Test | Interpreter | Setup | WebAssembly | Javascript
Lynx * 65536 | Chromium 104.0.5112.101 | DevTools closed | 5.70 | 12.45
Lynx * 65536 | Firefox 103 | DevTools closed | 5.16 | 5.72
Lynx * 1048576 | Chromium 104.0.5112.101 | DevTools closed | 95.65 | 203.60
Lynx * 1048576 | Firefox 103 | DevTools closed | 83.34 | 92.38
+

Before optimizing fold over bytes by inlining __subscript_bytes__

@@ -80,7 +119,7 @@ Note: This tests performs some timing comparison, please wait a few seconds for
Test
-Notes:
+

Notes

- Firefox seems faster than Chromium in my setup for Javascript; WebAssembly seems about the same in both.
- Having DevTools open in Chromium seems to slow down the WebAssembly by about 30%, but not when doing a recording of the page load.
- WebAssembly in Firefox seems to slow down when doing a recording of the page load, which makes sense, but the Javascript does not.
@@ -168,7 +207,9 @@ function run_test(app, str, str_repeat) }); // Don't test speedup for small strings, it varies a lot - let speedup = str.length < 16 ? 1 : (js_time == wasm_time ? 1 : js_time / wasm_time); + let speedup = (wasm_timing.min == 0 || js_timing.min == 0) + ? 1 + : js_time / wasm_time; test_result(check && 0.999 < speedup, { // At least as fast as Javascript 'summary': 'crc32(' + (str @@ -197,7 +238,7 @@ WebAssembly.instantiateStreaming(fetch('crc32.wasm'), importObject) run_test(app, "abcdefghijklmnopqrstuvwxyz"); run_test(app, "The quick brown fox jumps over the lazy dog"); run_test(app, "The quick brown fox jumps over the lazy dog", 1024); - run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 1048576); + run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 65536); }); diff --git a/phasm/compiler.py b/phasm/compiler.py index 3b62065..4ba5d5a 100644 --- a/phasm/compiler.py +++ b/phasm/compiler.py @@ -368,13 +368,27 @@ def expression_fold(wgn: WasmGenerator, inp: ourlang.Fold) -> None: wgn.local.get(len_var) wgn.i32.lt_u() with wgn.if_(): + # From here on, adr_var is the address of byte we're referencing + # This is akin to calling stdlib_types.__subscript_bytes__ + # But since we already know we are inside of bounds, + # can just bypass it and load the memory directly. + wgn.local.get(adr_var) + wgn.i32.const(3) # Bytes header -1, since we do a +1 every loop + wgn.i32.add() + wgn.local.set(adr_var) + wgn.add_statement('nop', comment='while True') with wgn.loop(): wgn.add_statement('nop', comment='acu = func(acu, iter[i])') wgn.local.get(acu_var) + + # Get the next byte, write back the address wgn.local.get(adr_var) - wgn.local.get(idx_var) - wgn.call(stdlib_types.__subscript_bytes__) + wgn.i32.const(1) + wgn.i32.add() + wgn.local.tee(adr_var) + wgn.i32.load8_u() + wgn.add_statement('call', f'${inp.func.name}') wgn.local.set(acu_var)