Conversation
Read several characters at a time.
…engines Importing this function via Function.prototype.call.bind(DataView.prototype.getInt32) is optimized in V8, but is much slower than calling it through a JavaScript function in other browsers.
|
Hi and thanks a lot for your work. Since my chromium is borked on my laptop, I did my tests on a raspberry pi 4.
This PR results in a very large speedup, as you can see. I've profiled the worker in both Firefox and Chromium. The functions called change a lot, and I'm not seeing references to functions under […]. On my laptop, the runtime gets pretty close to the runtime of the JS version (+20% or so, compared to +400% before). Thanks again! |
|
@vouillon, what's the status of https://github.com/vouillon/wax? Can it be used to review this change? |
|
I think it is usable if you want to give it a try. It does not understand conditional directives:
--- a/runtime/wasm/bigstring.wat
+++ b/runtime/wasm/bigstring.wat
@@ -154,7 +154,8 @@
(local.set $i (i32.add (local.get $i) (i32.const 1)))
(br_if $loop (i32.eq (local.get $c1) (local.get $c2)))
(return
- (select (ref.i31 (i32.const -1)) (ref.i31 (i32.const 1))
+ (select (result (ref eq))
+ (ref.i31 (i32.const -1)) (ref.i31 (i32.const 1))
(i32.lt_u (local.get $c1) (local.get $c2)))))))
(ref.i31 (i32.const 0)))
@@ -183,7 +184,8 @@
(local.set $i (i32.add (local.get $i) (i32.const 1)))
(br_if $loop (i32.eq (local.get $c1) (local.get $c2)))
(return
- (select (ref.i31 (i32.const -1)) (ref.i31 (i32.const 1))
+ (select (result (ref eq))
+ (ref.i31 (i32.const -1)) (ref.i31 (i32.const 1))
(i32.lt_u (local.get $c1) (local.get $c2)))))))
(ref.i31 (i32.const 0))) |
|
Here is the diff after wax:
$ patdiff before.rst after.rst
------ before.rst
++++++ after.rst
@|-86,132 +86,278 ============================================================
|}
|#[export = "caml_bigstring_memset"]
|fn caml_bigstring_memset(s: &eq, pos: &eq, len: &eq, v: &eq) -> &eq {
| become caml_ba_fill(caml_ba_sub(s, pos, len), v);
|}
|#[export = "caml_bigstring_memcmp"]
|fn caml_bigstring_memcmp(s1: &eq, vpos1: &eq, s2: &eq, vpos2: &eq, vlen: &eq)
|-> &eq {
| let i: i32;
| let pos1: i32;
| let pos2: i32;
| let len: i32;
| let c1: i32;
| let c2: i32;
| let v1: &extern;
| let v2: &extern;
+| let w1: i32;
+| let w2: i32;
+| let xored: i32;
| v1 = caml_ba_get_view(s1);
| pos1 = vpos1 as &i31 as i32_s;
| v2 = caml_ba_get_view(s2);
| pos2 = vpos2 as &i31 as i32_s;
| len = vlen as &i31 as i32_s;
+| 'done: do {
+| 'loop: loop {
+| br_if 'done i + 4 >u len;
+| w1 = dv_get_i32_unaligned(v1, pos1 + i, 1);
+| w2 = dv_get_i32_unaligned(v2, pos2 + i, 1);
+| if w1 == w2 { i = i + 4; br 'loop; }
+| xored = w1 ^ w2;
+| if xored & 0xFF {
+| c1 = w1 & 0xFF;
+| c2 = w2 & 0xFF;
+| return c1 <u c2?-1 as &i31:1 as &i31;
+| }
+| if xored & 0xFF00 {
+| c1 = w1 >>u 8 & 0xFF;
+| c2 = w2 >>u 8 & 0xFF;
+| return c1 <u c2?-1 as &i31:1 as &i31;
+| }
+| if xored & 0xFF0000 {
+| c1 = w1 >>u 16 & 0xFF;
+| c2 = w2 >>u 16 & 0xFF;
+| return c1 <u c2?-1 as &i31:1 as &i31;
+| }
+| c1 = w1 >>u 24;
+| c2 = w2 >>u 24;
+| return c1 <u c2?-1 as &i31:1 as &i31;
+| } /* 'loop */
+| } /* 'done */
| 'loop: loop {
| if i <u len {
| c1 = dv_get_ui8(v1, pos1 + i);
| c2 = dv_get_ui8(v2, pos2 + i);
| i = i + 1;
| br_if 'loop c1 == c2;
| return c1 <u c2?-1 as &i31:1 as &i31;
| }
| } /* 'loop */
| 0 as &i31;
|}
|#[export = "caml_bigstring_memcmp_string"]
|fn caml_bigstring_memcmp_string
|(s1: &eq, vpos1: &eq, vs2: &eq, vpos2: &eq, vlen: &eq) -> &eq {
| let i: i32;
| let pos1: i32;
| let pos2: i32;
| let len: i32;
| let c1: i32;
| let c2: i32;
| let v1: &extern;
| let s2: &bytes;
+| let w1: i32;
+| let w2: i32;
+| let xored: i32;
+| let j: i32;
| v1 = caml_ba_get_view(s1);
| pos1 = vpos1 as &i31 as i32_s;
| s2 = vs2 as &bytes;
| pos2 = vpos2 as &i31 as i32_s;
| len = vlen as &i31 as i32_s;
+| 'done: do {
+| 'loop: loop {
+| br_if 'done i + 4 >u len;
+| w1 = dv_get_i32_unaligned(v1, pos1 + i, 1);
+| j = pos2 + i;
+| w2 =
+| s2[j] as i32_u | s2[j + 1] as i32_u << 8 |
+| (s2[j + 2] as i32_u << 16 | s2[j + 3] as i32_u << 24);
+| if w1 == w2 { i = i + 4; br 'loop; }
+| xored = w1 ^ w2;
+| if xored & 0xFF {
+| c1 = w1 & 0xFF;
+| c2 = w2 & 0xFF;
+| return c1 <u c2?-1 as &i31:1 as &i31;
+| }
+| if xored & 0xFF00 {
+| c1 = w1 >>u 8 & 0xFF;
+| c2 = w2 >>u 8 & 0xFF;
+| return c1 <u c2?-1 as &i31:1 as &i31;
+| }
+| if xored & 0xFF0000 {
+| c1 = w1 >>u 16 & 0xFF;
+| c2 = w2 >>u 16 & 0xFF;
+| return c1 <u c2?-1 as &i31:1 as &i31;
+| }
+| c1 = w1 >>u 24;
+| c2 = w2 >>u 24;
+| return c1 <u c2?-1 as &i31:1 as &i31;
+| } /* 'loop */
+| } /* 'done */
| 'loop: loop {
| if i <u len {
| c1 = dv_get_ui8(v1, pos1 + i);
| c2 = s2[pos2 + i] as i32_u;
| i = i + 1;
| br_if 'loop c1 == c2;
| return c1 <u c2?-1 as &i31:1 as &i31;
| }
| } /* 'loop */
| 0 as &i31;
|}
|#[export = "caml_bigstring_memchr"]
|fn caml_bigstring_memchr(s: &eq, vc: &eq, vpos: &eq, vlen: &eq) -> &eq {
| let pos: i32;
| let len: i32;
| let c: i32;
| let v: &extern;
+| let mask: i32;
+| let word: i32;
+| let xored: i32;
| c = vc as &i31 as i32_s;
| pos = vpos as &i31 as i32_s;
| len = vlen as &i31 as i32_s;
| v = caml_ba_get_view(s);
+| mask = c * 0x01010101;
+| 'done: do {
+| 'loop: loop {
+| br_if 'done len <s 4;
+| word = dv_get_i32_unaligned(v, pos, 1);
+| xored = word ^ mask;
+| if xored - 0x01010101 & (xored ^ -1) & 0x80808080 {
+| if !(xored & 0xFF) { return pos as &i31; }
+| if !(xored & 0xFF00) { return (pos + 1) as &i31; }
+| if !(xored & 0xFF0000) { return (pos + 2) as &i31; }
+| return (pos + 3) as &i31;
+| }
+| pos = pos + 4;
+| len = len - 4;
+| br 'loop;
+| } /* 'loop */
+| } /* 'done */
| 'loop: loop {
| if len >s 0 {
| if c == dv_get_ui8(v, pos) { return pos as &i31; }
| len = len - 1;
| pos = pos + 1;
| br 'loop;
| }
| } /* 'loop */
| -1 as &i31;
|}
|#[export = "caml_bigstring_memrchr"]
|fn caml_bigstring_memrchr(s: &eq, vc: &eq, vpos: &eq, vlen: &eq) -> &eq {
| let pos: i32;
| let len: i32;
| let c: i32;
| let cur: i32;
| let v: &extern;
+| let mask: i32;
+| let word: i32;
+| let xored: i32;
| c = vc as &i31 as i32_s;
| pos = vpos as &i31 as i32_s;
| len = vlen as &i31 as i32_s;
| v = caml_ba_get_view(s);
| cur = pos + len - 1;
+| mask = c * 0x01010101;
+| 'loop: loop {
+| if cur - pos >=s 3 {
+| word = dv_get_i32_unaligned(v, cur - 3, 1);
+| xored = word ^ mask;
+| if xored - 0x01010101 & (xored ^ -1) & 0x80808080 {
+| if !(xored & 0xFF000000) { return cur as &i31; }
+| if !(xored & 0xFF0000) { return (cur - 1) as &i31; }
+| if !(xored & 0xFF00) { return (cur - 2) as &i31; }
+| return (cur - 3) as &i31;
+| }
+| cur = cur - 4;
+| br 'loop;
+| }
+| } /* 'loop */
| 'loop: loop {
| if cur >=s pos {
| if c == dv_get_ui8(v, cur) { return cur as &i31; }
| cur = cur - 1;
| br 'loop;
| }
| } /* 'loop */
| -1 as &i31;
|}
|#[export = "caml_bigstring_strncmp"]
|fn caml_bigstring_strncmp
|(vs1: &eq, vpos1: &eq, vs2: &eq, vpos2: &eq, vlen: &eq) -> &eq {
| let v1: &extern;
| let v2: &extern;
| let pos1: i32;
| let pos2: i32;
| let len: i32;
| let i: i32;
| let c1: i32;
| let c2: i32;
+| let w1: i32;
+| let w2: i32;
+| let xored: i32;
| v1 = caml_ba_get_view(vs1);
| v2 = caml_ba_get_view(vs2);
| pos1 = vpos1 as &i31 as i32_s;
| pos2 = vpos2 as &i31 as i32_s;
| len = vlen as &i31 as i32_s;
+| 'done: do {
+| 'loop: loop {
+| br_if 'done i + 4 >u len;
+| w1 = dv_get_i32_unaligned(v1, pos1 + i, 1);
+| w2 = dv_get_i32_unaligned(v2, pos2 + i, 1);
+| if w1 == w2 {
+| if w1 - 0x01010101 & (w1 ^ -1) & 0x80808080 {
+| return 0 as &i31;
+| }
+| i = i + 4;
+| br 'loop;
+| }
+| xored = w1 ^ w2;
+| c1 = w1 & 0xFF;
+| c2 = w2 & 0xFF;
+| if xored & 0xFF | !c1 {
+| if c1 <u c2 { return -1 as &i31; }
+| if c1 >u c2 { return 1 as &i31; }
+| return 0 as &i31;
+| }
+| c1 = w1 >>u 8 & 0xFF;
+| c2 = w2 >>u 8 & 0xFF;
+| if xored & 0xFF00 | !c1 {
+| if c1 <u c2 { return -1 as &i31; }
+| if c1 >u c2 { return 1 as &i31; }
+| return 0 as &i31;
+| }
+| c1 = w1 >>u 16 & 0xFF;
+| c2 = w2 >>u 16 & 0xFF;
+| if xored & 0xFF0000 | !c1 {
+| if c1 <u c2 { return -1 as &i31; }
+| if c1 >u c2 { return 1 as &i31; }
+| return 0 as &i31;
+| }
+| c1 = w1 >>u 24;
+| c2 = w2 >>u 24;
+| if c1 <u c2 { return -1 as &i31; }
+| if c1 >u c2 { return 1 as &i31; }
+| return 0 as &i31;
+| } /* 'loop */
+| } /* 'done */
| 'loop: loop {
| if i <u len {
| c1 = dv_get_ui8(v1, pos1 + i);
| c2 = dv_get_ui8(v2, pos2 + i);
| i = i + 1;
| if c1 <u c2 { return -1 as &i31; }
| if c1 >u c2 { return 1 as &i31; }
| if c1 == 0 { return 0 as &i31; }
| br 'loop;
| }
| } /* 'loop */
| 0 as &i31;
|}
|#[export = "caml_bigstring_blit_bytes_to_ba"]
|#[export = "caml_bigstring_blit_string_to_ba"]
|fn caml_bigstring_blit_bytes_to_ba |
|
|
||
| const on_windows = isNode && globalThis.process.platform === "win32"; | ||
|
|
||
| const isV8 = new Error().stack?.includes("\n at ") ?? false; |
There was a problem hiding this comment.
Can you add a source for this trick? Should we add a test somewhere to spot if it ever changes?
| dv_get_f32: call.bind(DV.getFloat32), | ||
| dv_get_i64: call.bind(DV.getBigInt64), | ||
| dv_get_i32: call.bind(DV.getInt32), | ||
| dv_get_i32: isV8 ? call.bind(DV.getInt32) : (x, y, z) => x.getInt32(y, z), |
There was a problem hiding this comment.
We need a comment in the code explaining this, together with a date or the versions for which this was true. And maybe a tiny benchmark that shows the difference between the two, so that one can check the optimization is still accurate.
No description provided.