phasm/examples/crc32.html
Johan B.W. de Vries 5ad5a9c064 Speedup foldl over bytes
Prior to this PR, the compiler would call stdlib.types's
__subscript_bytes__.

However, that function performs some checks we do not need.

After this MR, folding iterates directly over the bytes
memory, saving the memory access checks and the function
calls. This gets us a speedup of about 43% less CPU time
used on Firefox.

Also, by default, the CRC32 page runs a shorter timing test.
2022-08-21 15:38:11 +02:00

247 lines
6.0 KiB
HTML

<!DOCTYPE html>
<html>
<head>
<title>Examples - CRC32</title>
</head>
<body>
<h1>CRC32</h1>
<a href="index.html">List</a> - <a href="crc32.py.html">Source</a> - <a href="crc32.wat.html">WebAssembly</a><br />
<br />
Note: This test performs some timing comparisons; please wait a few seconds for the results.<br />
<div style="white-space: pre;" id="results"></div>
<h2>Measurement log</h2>
<h3>AMD Ryzen 7 3700X 8-Core, Ubuntu 20.04, Linux 5.4.0-124-generic</h3>
<h4>After optimizing fold over bytes by inlining __subscript_bytes__</h4>
<table>
<tr>
<td>Test</td>
<td>Interpreter</td>
<td>Setup</td>
<td>WebAssembly</td>
<td>Javascript</td>
</tr>
<tr>
<td>Lynx * 65536</td>
<td>Chromium 104.0.5112.101</td>
<td>DevTools closed</td>
<td>5.70</td>
<td>12.45</td>
</tr>
<tr>
<td>Lynx * 65536</td>
<td>Firefox 103</td>
<td>DevTools closed</td>
<td>5.16</td>
<td>5.72</td>
</tr>
<tr>
<td>Lynx * 1048576</td>
<td>Chromium 104.0.5112.101</td>
<td>DevTools closed</td>
<td>95.65</td>
<td>203.60</td>
</tr>
<tr>
<td>Lynx * 1048576</td>
<td>Firefox 103</td>
<td>DevTools closed</td>
<td>83.34</td>
<td>92.38</td>
</tr>
</table>
<h4>Before optimizing fold over bytes by inlining __subscript_bytes__</h4>
<table>
<tr>
<td>Test</td>
<td>Interpreter</td>
<td>Setup</td>
<td>WebAssembly</td>
<td>Javascript</td>
</tr>
<tr>
<td>Lynx * 65536</td>
<td>Chromium 104.0.5112.101</td>
<td>DevTools closed</td>
<td>9.35</td>
<td>12.56</td>
</tr>
<tr>
<td>Lynx * 65536</td>
<td>Chromium 104.0.5112.101</td>
<td>DevTools open</td>
<td>14.71</td>
<td>12.72</td>
</tr>
<tr>
<td>Lynx * 65536</td>
<td>Chromium 104.0.5112.101</td>
<td>Record page load</td>
<td>9.44</td>
<td>12.69</td>
</tr>
<tr>
<td>Lynx * 65536</td>
<td>Firefox 103</td>
<td>DevTools closed</td>
<td>9.02</td>
<td>5.86</td>
</tr>
<tr>
<td>Lynx * 65536</td>
<td>Firefox 103</td>
<td>DevTools open</td>
<td>9.01</td>
<td>5.83</td>
</tr>
<tr>
<td>Lynx * 65536</td>
<td>Firefox 103</td>
<td>Record page load</td>
<td>72.41</td>
<td>5.85</td>
</tr>
<tr>
<td>Lynx * 1048576</td>
<td>Chromium 104.0.5112.101</td>
<td>DevTools closed</td>
<td>149.24</td>
<td>202.36</td>
</tr>
<tr>
<td>Lynx * 1048576</td>
<td>Firefox 103</td>
<td>DevTools closed</td>
<td>145.01</td>
<td>91.44</td>
</tr>
</table>
<h4>Notes</h4>
- In my setup, Firefox seems faster than Chromium for JavaScript; WebAssembly performance seems about the same.<br />
- Having DevTools open in Chromium seems to slow down the WebAssembly by about 30%, but not when doing a recording of the page load.<br />
- WebAssembly in Firefox seems to slow down when doing a recording of the page load, which makes sense, but the Javascript does not.<br />
<script type="text/javascript" src="./include.js"></script>
<script type="text/javascript">
// Import object handed to WebAssembly.instantiateStreaming further down.
// It is empty, so no host functions or values are supplied to the module.
let importObject = {};
// Build up a JS version
/**
 * Build the 256-entry lookup table used by the byte-wise JavaScript CRC32.
 *
 * Entry n is obtained by running n through eight shift/XOR rounds with the
 * constant 0xEDB88320. Entries are the raw results of the bitwise operators,
 * so an entry whose last round was an XOR is a signed 32-bit value and may
 * be negative.
 *
 * @returns {number[]} Array of 256 32-bit integers.
 */
var makeCRCTable = function() {
    const POLYNOMIAL = 0xEDB88320;

    return Array.from({ length: 256 }, (_, byte) => {
        let remainder = byte;
        for (let round = 0; round < 8; round += 1) {
            const shifted = remainder >>> 1;
            // The low bit decides whether the polynomial is folded in this round.
            remainder = (remainder & 1) ? (POLYNOMIAL ^ shifted) : shifted;
        }
        return remainder;
    });
};
// Build the table once at load time and keep it on window;
// crc32_js reads window.crcTable on every call instead of rebuilding it.
window.crcTable = makeCRCTable();
/**
 * Pure-JavaScript CRC32 used as the reference for the WebAssembly version.
 *
 * Reads the lookup table from window.crcTable, so that table must have been
 * set before this function is called.
 *
 * @param {Uint8Array|number[]} i8arr - Indexable sequence of byte values.
 * @returns {number} The checksum as an unsigned 32-bit integer.
 */
var crc32_js = function(i8arr) {
    const table = window.crcTable;

    // Start with all bits set (same value as 0 ^ -1).
    let acc = -1;
    let pos = 0;
    while (pos < i8arr.length) {
        const slot = (acc ^ i8arr[pos]) & 0xFF;
        acc = (acc >>> 8) ^ table[slot];
        pos += 1;
    }

    // Invert all bits, then coerce to unsigned.
    return (~acc) >>> 0;
};
/**
 * Run a single test: compute the CRC32 of one input with both the WebAssembly
 * export and crc32_js, time both, and report the outcome via test_result().
 *
 * The test passes when every timed run produced matching results AND the
 * WebAssembly version is at least as fast as the JavaScript one
 * (speedup > 0.999). When either minimum timing is 0 the speedup is pinned
 * to 1, because timings for tiny inputs vary too much to compare.
 *
 * Relies on helpers defined outside this block: alloc_bytes(app, data),
 * run_times(count, fn) (used here as returning {avg, min, values: [{result}]}),
 * test_result(ok, details) and crc32_js(bytes).
 *
 * @param {object} app - Instantiated module; app.instance.exports.crc32 is
 *     called with the value returned by alloc_bytes().
 * @param {string} str - Input text; each character's char code becomes one
 *     element of a Uint8Array.
 * @param {number} [str_repeat] - When truthy, str is repeated this many
 *     times before hashing; otherwise the input is used once and the
 *     reported repeat count is 1.
 */
function run_test(app, str, str_repeat)
{
    // Work on locals instead of reassigning the parameters.
    const repeatCount = str_repeat ? str_repeat : 1;
    const text = str_repeat ? str.repeat(str_repeat) : str;

    const data = Uint8Array.from(text.split('').map(ch => ch.charCodeAt(0)));

    // Declared locally on purpose: a bare `offset = ...` assignment creates an
    // implicit global and throws a ReferenceError in strict mode / modules.
    const offset = alloc_bytes(app, data);

    // Cast to Uint32 in Javascript
    const crc32_wasm = (ptr) => app.instance.exports.crc32(ptr) >>> 0;

    // No warm-up run is done; note that the JS version has been observed to
    // take about 2ms on its first run.
    const wasm_timing = run_times(100, () => crc32_wasm(offset));
    const js_timing = run_times(100, () => crc32_js(data));

    const wasm_time = wasm_timing.avg;
    const js_time = js_timing.avg;

    // Every WebAssembly run must agree with the JS run at the same index.
    const check = wasm_timing.values.every(
        (value, index) => value.result === js_timing.values[index].result
    );

    // Don't test speedup for small strings, it varies a lot
    const speedup = (wasm_timing.min === 0 || js_timing.min === 0)
        ? 1
        : js_time / wasm_time;

    // Human-readable rendering of the input; long inputs are cut to 64 chars
    // and followed by the full length.
    let shown;
    if (!text) {
        shown = '""';
    } else if (text.length < 64) {
        shown = `"${text}"`;
    } else {
        shown = `"${text.substring(0, 64)}..." (${text.length})`;
    }

    test_result(check && 0.999 < speedup, { // At least as fast as Javascript
        'summary': `crc32(${shown})`,
        'attributes': {
            'str': str,
            'str_repeat': repeatCount,
            'wasm_timing': wasm_timing,
            'js_timing': js_timing,
            'check': check,
            'speedup': speedup,
        },
    });
}
// Load WebAssembly, and run all tests
WebAssembly.instantiateStreaming(fetch('crc32.wasm'), importObject)
    .then(app => {
        // Grow the module's memory by 640 pages before any test allocates
        // its input (presumably so the largest input fits; confirm against
        // alloc_bytes).
        app.instance.exports.memory.grow(640);

        // Each entry is the argument list passed to run_test after `app`:
        // the input string and, optionally, a repeat count.
        const cases = [
            [""],
            ["a"],
            ["Z"],
            ["ab"],
            ["abcdefghijklmnopqrstuvwxyz"],
            ["The quick brown fox jumps over the lazy dog"],
            ["The quick brown fox jumps over the lazy dog", 1024],
            ["Lynx c.q. vos prikt bh: dag zwemjuf!", 65536],
        ];

        for (const args of cases) {
            run_test(app, ...args);
        }
    });
</script>
</body>
</html>