Merge pull request 'Speedup foldl over bytes' (#2) from optimize_fold_bytes_inline_subscript_bytes_call into master

Reviewed-on: #2
This commit is contained in:
jbwdevries 2022-08-21 13:39:44 +00:00
commit c02afb05f4
2 changed files with 60 additions and 5 deletions

View File

@ -13,6 +13,45 @@ Note: This tests performs some timing comparison, please wait a few seconds for
<h2>Measurement log</h2>
<h3>AMD Ryzen 7 3700X 8-Core, Ubuntu 20.04, Linux 5.4.0-124-generic</h3>
<h4>After optimizing fold over bytes by inlining __subscript_bytes__</h4>
<table>
<tr>
<td>Test</td>
<td>Interpreter</td>
<td>Setup</td>
<td>WebAssembly</td>
<td>Javascript</td>
</tr>
<tr>
<td>Lynx * 65536</td>
<td>Chromium 104.0.5112.101</td>
<td>DevTools closed</td>
<td>5.70</td>
<td>12.45</td>
</tr>
<tr>
<td>Lynx * 65536</td>
<td>Firefox 103</td>
<td>DevTools closed</td>
<td>5.16</td>
<td>5.72</td>
</tr>
<tr>
<td>Lynx * 1048576</td>
<td>Chromium 104.0.5112.101</td>
<td>DevTools closed</td>
<td>95.65</td>
<td>203.60</td>
</tr>
<tr>
<td>Lynx * 1048576</td>
<td>Firefox 103</td>
<td>DevTools closed</td>
<td>83.34</td>
<td>92.38</td>
</tr>
</table>
<h4>Before optimizing fold over bytes by inlining __subscript_bytes__</h4>
<table>
<tr>
<td>Test</td>
@ -80,7 +119,7 @@ Note: This tests performs some timing comparison, please wait a few seconds for
</tr>
</table>
Notes:<br />
<h4>Notes</h4>
- Firefox seems faster than Chromium in my setup for Javascript; WebAssembly seems about the same.<br />
- Having DevTools open in Chromium seems to slow down the WebAssembly by about 30%, but not when doing a recording of the page load.<br />
- WebAssembly in Firefox seems to slow down when doing a recording of the page load, which makes sense, but the Javascript does not.<br />
@ -168,7 +207,9 @@ function run_test(app, str, str_repeat)
});
// Don't test speedup for small strings, it varies a lot
let speedup = str.length < 16 ? 1 : (js_time == wasm_time ? 1 : js_time / wasm_time);
let speedup = (wasm_timing.min == 0 || js_timing.min == 0)
? 1
: js_time / wasm_time;
test_result(check && 0.999 < speedup, { // At least as fast as Javascript
'summary': 'crc32(' + (str
@ -197,7 +238,7 @@ WebAssembly.instantiateStreaming(fetch('crc32.wasm'), importObject)
run_test(app, "abcdefghijklmnopqrstuvwxyz");
run_test(app, "The quick brown fox jumps over the lazy dog");
run_test(app, "The quick brown fox jumps over the lazy dog", 1024);
run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 1048576);
run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 65536);
});
</script>

View File

@ -368,13 +368,27 @@ def expression_fold(wgn: WasmGenerator, inp: ourlang.Fold) -> None:
wgn.local.get(len_var)
wgn.i32.lt_u()
with wgn.if_():
# From here on, adr_var is the address of byte we're referencing
# This is akin to calling stdlib_types.__subscript_bytes__
# But since we already know we are inside of bounds,
# can just bypass it and load the memory directly.
wgn.local.get(adr_var)
wgn.i32.const(3) # Bytes header -1, since we do a +1 every loop
wgn.i32.add()
wgn.local.set(adr_var)
wgn.add_statement('nop', comment='while True')
with wgn.loop():
wgn.add_statement('nop', comment='acu = func(acu, iter[i])')
wgn.local.get(acu_var)
# Get the next byte, write back the address
wgn.local.get(adr_var)
wgn.local.get(idx_var)
wgn.call(stdlib_types.__subscript_bytes__)
wgn.i32.const(1)
wgn.i32.add()
wgn.local.tee(adr_var)
wgn.i32.load8_u()
wgn.add_statement('call', f'${inp.func.name}')
wgn.local.set(acu_var)