Speedup foldl over bytes
Prior to this PR, the compiler would call __subscript_bytes__ from stdlib.types for every element. However, that function performs some checks we do not need here. After this PR, folding iterates directly over the bytes memory, saving the memory access checks and the function calls. This reduces CPU time by about 43% on Firefox. Also, the CRC32 page now runs a shorter timing test by default.
This commit is contained in:
parent
2970093c8f
commit
5ad5a9c064
@ -13,6 +13,45 @@ Note: This tests performs some timing comparison, please wait a few seconds for
|
|||||||
|
|
||||||
<h2>Measurement log</h2>
|
<h2>Measurement log</h2>
|
||||||
<h3>AMD Ryzen 7 3700X 8-Core, Ubuntu 20.04, Linux 5.4.0-124-generic</h3>
|
<h3>AMD Ryzen 7 3700X 8-Core, Ubuntu 20.04, Linux 5.4.0-124-generic</h3>
|
||||||
|
<h4>After optimizing fold over bytes by inlining __subscript_bytes__</h4>
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<td>Test</td>
|
||||||
|
<td>Interpreter</td>
|
||||||
|
<td>Setup</td>
|
||||||
|
<td>WebAssembly</td>
|
||||||
|
<td>Javascript</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Lynx * 65536</td>
|
||||||
|
<td>Chromium 104.0.5112.101</td>
|
||||||
|
<td>DevTools closed</td>
|
||||||
|
<td>5.70</td>
|
||||||
|
<td>12.45</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Lynx * 65536</td>
|
||||||
|
<td>Firefox 103</td>
|
||||||
|
<td>DevTools closed</td>
|
||||||
|
<td>5.16</td>
|
||||||
|
<td>5.72</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Lynx * 1048576</td>
|
||||||
|
<td>Chromium 104.0.5112.101</td>
|
||||||
|
<td>DevTools closed</td>
|
||||||
|
<td>95.65</td>
|
||||||
|
<td>203.60</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Lynx * 1048576</td>
|
||||||
|
<td>Firefox 103</td>
|
||||||
|
<td>DevTools closed</td>
|
||||||
|
<td>83.34</td>
|
||||||
|
<td>92.38</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
<h4>Before optimizing fold over bytes by inlining __subscript_bytes__</h4>
|
||||||
<table>
|
<table>
|
||||||
<tr>
|
<tr>
|
||||||
<td>Test</td>
|
<td>Test</td>
|
||||||
@ -80,7 +119,7 @@ Note: This tests performs some timing comparison, please wait a few seconds for
|
|||||||
</tr>
|
</tr>
|
||||||
</table>
|
</table>
|
||||||
|
|
||||||
Notes:<br />
|
<h4>Notes</h4>
|
||||||
- Firefox seems faster than Chromium in my setup for Javascript, WebAssembly seems about the same.<br />
|
- Firefox seems faster than Chromium in my setup for Javascript, WebAssembly seems about the same.<br />
|
||||||
- Having DevTools open in Chromium seems to slow down the WebAssembly by about 30%, but not when doing a recording of the page load.<br />
|
- Having DevTools open in Chromium seems to slow down the WebAssembly by about 30%, but not when doing a recording of the page load.<br />
|
||||||
- WebAssembly in Firefox seems to slow down when doing a recording of the page load, which makes sense, but the Javascript does not.<br />
|
- WebAssembly in Firefox seems to slow down when doing a recording of the page load, which makes sense, but the Javascript does not.<br />
|
||||||
@ -168,7 +207,9 @@ function run_test(app, str, str_repeat)
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Don't test speedup for small strings, it varies a lot
|
// Don't test speedup for small strings, it varies a lot
|
||||||
let speedup = str.length < 16 ? 1 : (js_time == wasm_time ? 1 : js_time / wasm_time);
|
let speedup = (wasm_timing.min == 0 || js_timing.min == 0)
|
||||||
|
? 1
|
||||||
|
: js_time / wasm_time;
|
||||||
|
|
||||||
test_result(check && 0.999 < speedup, { // At least as fast as Javascript
|
test_result(check && 0.999 < speedup, { // At least as fast as Javascript
|
||||||
'summary': 'crc32(' + (str
|
'summary': 'crc32(' + (str
|
||||||
@ -197,7 +238,7 @@ WebAssembly.instantiateStreaming(fetch('crc32.wasm'), importObject)
|
|||||||
run_test(app, "abcdefghijklmnopqrstuvwxyz");
|
run_test(app, "abcdefghijklmnopqrstuvwxyz");
|
||||||
run_test(app, "The quick brown fox jumps over the lazy dog");
|
run_test(app, "The quick brown fox jumps over the lazy dog");
|
||||||
run_test(app, "The quick brown fox jumps over the lazy dog", 1024);
|
run_test(app, "The quick brown fox jumps over the lazy dog", 1024);
|
||||||
run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 1048576);
|
run_test(app, "Lynx c.q. vos prikt bh: dag zwemjuf!", 65536);
|
||||||
});
|
});
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
|
|||||||
@ -368,13 +368,27 @@ def expression_fold(wgn: WasmGenerator, inp: ourlang.Fold) -> None:
|
|||||||
wgn.local.get(len_var)
|
wgn.local.get(len_var)
|
||||||
wgn.i32.lt_u()
|
wgn.i32.lt_u()
|
||||||
with wgn.if_():
|
with wgn.if_():
|
||||||
|
# From here on, adr_var is the address of byte we're referencing
|
||||||
|
# This is akin to calling stdlib_types.__subscript_bytes__
|
||||||
|
# But since we already know we are inside of bounds,
|
||||||
|
# can just bypass it and load the memory directly.
|
||||||
|
wgn.local.get(adr_var)
|
||||||
|
wgn.i32.const(3) # Bytes header -1, since we do a +1 every loop
|
||||||
|
wgn.i32.add()
|
||||||
|
wgn.local.set(adr_var)
|
||||||
|
|
||||||
wgn.add_statement('nop', comment='while True')
|
wgn.add_statement('nop', comment='while True')
|
||||||
with wgn.loop():
|
with wgn.loop():
|
||||||
wgn.add_statement('nop', comment='acu = func(acu, iter[i])')
|
wgn.add_statement('nop', comment='acu = func(acu, iter[i])')
|
||||||
wgn.local.get(acu_var)
|
wgn.local.get(acu_var)
|
||||||
|
|
||||||
|
# Get the next byte, write back the address
|
||||||
wgn.local.get(adr_var)
|
wgn.local.get(adr_var)
|
||||||
wgn.local.get(idx_var)
|
wgn.i32.const(1)
|
||||||
wgn.call(stdlib_types.__subscript_bytes__)
|
wgn.i32.add()
|
||||||
|
wgn.local.tee(adr_var)
|
||||||
|
wgn.i32.load8_u()
|
||||||
|
|
||||||
wgn.add_statement('call', f'${inp.func.name}')
|
wgn.add_statement('call', f'${inp.func.name}')
|
||||||
wgn.local.set(acu_var)
|
wgn.local.set(acu_var)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user