Speed up foldl over bytes

Prior to this PR, the compiler would call stdlib.types's
__subscript_bytes__.

However, that function performs some checks we do not need.

After this PR, folding iterates directly over the bytes
memory, saving the memory access checks and the function
calls. This reduces the CPU time used on Firefox by
about 43%.

Also, by default, the CRC32 page runs a shorter timing test.
This commit is contained in:
Johan B.W. de Vries 2022-08-21 15:37:20 +02:00
parent 2970093c8f
commit 75a63e490f
2 changed files with 60 additions and 5 deletions

View File

@ -13,6 +13,45 @@ Note: This tests performs some timing comparison, please wait a few seconds for
<h2>Measurement log</h2>
<h3>AMD Ryzen 7 3700X 8-Core, Ubuntu 20.04, Linux 5.4.0-124-generic</h3>
<h4>After optimizing fold over bytes by inlining __subscript_bytes__</h4>
<table>
<tr>
<td>Test</td>
<td>Interpreter</td>
<td>Setup</td>
<td>WebAssembly</td>
<td>Javascript</td>
</tr>
<tr>
<td>Lynx * 65536</td>
<td>Chromium 104.0.5112.101</td>
<td>DevTools closed</td>
<td>5.70</td>
<td>12.45</td>
</tr>
<tr>
<td>Lynx * 65536</td>
<td>Firefox 103</td>
<td>DevTools closed</td>
<td>5.16</td>
<td>5.72</td>
</tr>
<tr>
<td>Lynx * 1048576</td>
<td>Chromium 104.0.5112.101</td>
<td>DevTools closed</td>
<td>95.65</td>
<td>203.60</td>
</tr>
<tr>
<td>Lynx * 1048576</td>
<td>Firefox 103</td>
<td>DevTools closed</td>
<td>83.34</td>
<td>92.38</td>
</tr>
</table>
<h4>Before optimizing fold over bytes by inlining __subscript_bytes__</h4>
<table>
<tr>
<td>Test</td>
@ -80,7 +119,7 @@ Note: This tests performs some timing comparison, please wait a few seconds for
</tr>
</table>
Notes:<br />
<h4>Notes</h4>
- Firefox seems faster than Chromium in my setup for Javascript, WebAssembly seems about the same.<br />
- Having DevTools open in Chromium seems to slow down the WebAssembly by about 30%, but not when doing a recording of the page load.<br />
- WebAssembly in Firefox seems to slow down when doing a recording of the page load, which makes sense, but the Javascript does not.<br />
@ -168,7 +207,9 @@ function run_test(app, str, str_repeat)
});
// Don't test speedup for small strings, it varies a lot
let speedup = str.length < 16 ? 1 : (js_time == wasm_time ? 1 : js_time / wasm_time);
let speedup = (wasm_timing.min == 0 || js_timing.min == 0)
? 1
: js_time / wasm_time;
test_result(check && 0.999 < speedup, { // At least as fast as Javascript
'summary': 'crc32(' + (str
@ -197,7 +238,7 @@ WebAssembly.instantiateStreaming(fetch('crc32.wasm'), importObject)
run_test(app, "abcdefghijklmnopqrstuvwxyz");
run_test(app, "The quick brown fox jumps over the lazy dog");
run_test(app, "The quick brown fox jumps over the lazy dog", 1024);
run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 1048576);
run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 65536);
});
</script>

View File

@ -368,13 +368,27 @@ def expression_fold(wgn: WasmGenerator, inp: ourlang.Fold) -> None:
wgn.local.get(len_var)
wgn.i32.lt_u()
with wgn.if_():
# From here on, adr_var is the address of the byte we're referencing.
# This is akin to calling stdlib_types.__subscript_bytes__,
# but since we already know we are within bounds, we
# can just bypass it and load the memory directly.
wgn.local.get(adr_var)
wgn.i32.const(3) # Bytes header -1, since we do a +1 every loop
wgn.i32.add()
wgn.local.set(adr_var)
wgn.add_statement('nop', comment='while True')
with wgn.loop():
wgn.add_statement('nop', comment='acu = func(acu, iter[i])')
wgn.local.get(acu_var)
# Get the next byte, write back the address
wgn.local.get(adr_var)
wgn.local.get(idx_var)
wgn.call(stdlib_types.__subscript_bytes__)
wgn.i32.const(1)
wgn.i32.add()
wgn.local.tee(adr_var)
wgn.i32.load8_u()
wgn.add_statement('call', f'${inp.func.name}')
wgn.local.set(acu_var)