Skip to content

Comments

Bigstring optimizations#2144

Open
vouillon wants to merge 3 commits into master from
bigstring
Open

Bigstring optimizations#2144
vouillon wants to merge 3 commits into master from
bigstring

Conversation

@vouillon
Copy link
Member

No description provided.

Read several characters at a time.
@vouillon vouillon added the wasm label Jan 20, 2026
…engines

Importing this function via Function.prototype.call.bind(DataView.prototype.getInt32) is
optimized in V8, but in other browsers it is much slower than calling it through a
JavaScript wrapper function.
@adrien-n
Copy link
Contributor

Hi and thanks a lot for your work.

Since my chromium is borked on my laptop, I did my tests on a raspberry pi 4.

| | Chromium | Firefox |
|---|---|---|
| WASM baseline | 20s | 67s |
| WASM this PR | 13s | 18s |
| JS | 5s | 13s |

This PR results in a very large speedup as you can see.

I've profiled the worker in both Firefox and Chromium. The set of functions called changes a lot, and in Firefox I'm not seeing references to functions under JS::, only to wasm-function.

On my laptop, the runtime gets pretty close to the runtime of the JS version (+20% or so compared to +400% before). Thanks again!

@vouillon vouillon marked this pull request as ready for review January 20, 2026 16:19
@hhugo hhugo mentioned this pull request Jan 30, 2026
9 tasks
@hhugo
Copy link
Member

hhugo commented Feb 2, 2026

@vouillon, what's the status of https://github.com/vouillon/wax ? Can it be used to review this change ?

@vouillon
Copy link
Member Author

vouillon commented Feb 2, 2026

I think it is usable if you want to give it a try. It does not understand conditional directives (`@if`) yet, but `bigstring.wat` does not contain any. It is also stricter than Binaryen, so you get an error with `wax ~/js_of_ocaml/runtime/wasm/bigstring.wat -v -f wax` if you don't fix these two lines:

--- a/runtime/wasm/bigstring.wat
+++ b/runtime/wasm/bigstring.wat
@@ -154,7 +154,8 @@
                (local.set $i (i32.add (local.get $i) (i32.const 1)))
                (br_if $loop (i32.eq (local.get $c1) (local.get $c2)))
                (return
-                  (select (ref.i31 (i32.const -1)) (ref.i31 (i32.const 1))
+                  (select (result (ref eq))
+                     (ref.i31 (i32.const -1)) (ref.i31 (i32.const 1))
                      (i32.lt_u (local.get $c1) (local.get $c2)))))))
       (ref.i31 (i32.const 0)))
 
@@ -183,7 +184,8 @@
                (local.set $i (i32.add (local.get $i) (i32.const 1)))
                (br_if $loop (i32.eq (local.get $c1) (local.get $c2)))
                (return
-                  (select (ref.i31 (i32.const -1)) (ref.i31 (i32.const 1))
+                  (select (result (ref eq))
+                     (ref.i31 (i32.const -1)) (ref.i31 (i32.const 1))
                      (i32.lt_u (local.get $c1) (local.get $c2)))))))
       (ref.i31 (i32.const 0)))

@hhugo
Copy link
Member

hhugo commented Feb 15, 2026

Here is the diff after wax

$ patdiff before.rst after.rst 
------ before.rst
++++++ after.rst
@|-86,132 +86,278 ============================================================
 |}
 |#[export = "caml_bigstring_memset"]
 |fn caml_bigstring_memset(s: &eq, pos: &eq, len: &eq, v: &eq) -> &eq {
 |    become caml_ba_fill(caml_ba_sub(s, pos, len), v);
 |}
 |#[export = "caml_bigstring_memcmp"]
 |fn caml_bigstring_memcmp(s1: &eq, vpos1: &eq, s2: &eq, vpos2: &eq, vlen: &eq)
 |-> &eq {
 |    let i: i32;
 |    let pos1: i32;
 |    let pos2: i32;
 |    let len: i32;
 |    let c1: i32;
 |    let c2: i32;
 |    let v1: &extern;
 |    let v2: &extern;
+|    let w1: i32;
+|    let w2: i32;
+|    let xored: i32;
 |    v1 = caml_ba_get_view(s1);
 |    pos1 = vpos1 as &i31 as i32_s;
 |    v2 = caml_ba_get_view(s2);
 |    pos2 = vpos2 as &i31 as i32_s;
 |    len = vlen as &i31 as i32_s;
+|    'done: do {
+|        'loop: loop {
+|            br_if 'done i + 4 >u len;
+|            w1 = dv_get_i32_unaligned(v1, pos1 + i, 1);
+|            w2 = dv_get_i32_unaligned(v2, pos2 + i, 1);
+|            if w1 == w2 { i = i + 4; br 'loop; }
+|            xored = w1 ^ w2;
+|            if xored & 0xFF {
+|                c1 = w1 & 0xFF;
+|                c2 = w2 & 0xFF;
+|                return c1 <u c2?-1 as &i31:1 as &i31;
+|            }
+|            if xored & 0xFF00 {
+|                c1 = w1 >>u 8 & 0xFF;
+|                c2 = w2 >>u 8 & 0xFF;
+|                return c1 <u c2?-1 as &i31:1 as &i31;
+|            }
+|            if xored & 0xFF0000 {
+|                c1 = w1 >>u 16 & 0xFF;
+|                c2 = w2 >>u 16 & 0xFF;
+|                return c1 <u c2?-1 as &i31:1 as &i31;
+|            }
+|            c1 = w1 >>u 24;
+|            c2 = w2 >>u 24;
+|            return c1 <u c2?-1 as &i31:1 as &i31;
+|        } /* 'loop */
+|    } /* 'done */
 |    'loop: loop {
 |        if i <u len {
 |            c1 = dv_get_ui8(v1, pos1 + i);
 |            c2 = dv_get_ui8(v2, pos2 + i);
 |            i = i + 1;
 |            br_if 'loop c1 == c2;
 |            return c1 <u c2?-1 as &i31:1 as &i31;
 |        }
 |    } /* 'loop */
 |    0 as &i31;
 |}
 |#[export = "caml_bigstring_memcmp_string"]
 |fn caml_bigstring_memcmp_string
 |(s1: &eq, vpos1: &eq, vs2: &eq, vpos2: &eq, vlen: &eq) -> &eq {
 |    let i: i32;
 |    let pos1: i32;
 |    let pos2: i32;
 |    let len: i32;
 |    let c1: i32;
 |    let c2: i32;
 |    let v1: &extern;
 |    let s2: &bytes;
+|    let w1: i32;
+|    let w2: i32;
+|    let xored: i32;
+|    let j: i32;
 |    v1 = caml_ba_get_view(s1);
 |    pos1 = vpos1 as &i31 as i32_s;
 |    s2 = vs2 as &bytes;
 |    pos2 = vpos2 as &i31 as i32_s;
 |    len = vlen as &i31 as i32_s;
+|    'done: do {
+|        'loop: loop {
+|            br_if 'done i + 4 >u len;
+|            w1 = dv_get_i32_unaligned(v1, pos1 + i, 1);
+|            j = pos2 + i;
+|            w2 =
+|                s2[j] as i32_u | s2[j + 1] as i32_u << 8 |
+|                    (s2[j + 2] as i32_u << 16 | s2[j + 3] as i32_u << 24);
+|            if w1 == w2 { i = i + 4; br 'loop; }
+|            xored = w1 ^ w2;
+|            if xored & 0xFF {
+|                c1 = w1 & 0xFF;
+|                c2 = w2 & 0xFF;
+|                return c1 <u c2?-1 as &i31:1 as &i31;
+|            }
+|            if xored & 0xFF00 {
+|                c1 = w1 >>u 8 & 0xFF;
+|                c2 = w2 >>u 8 & 0xFF;
+|                return c1 <u c2?-1 as &i31:1 as &i31;
+|            }
+|            if xored & 0xFF0000 {
+|                c1 = w1 >>u 16 & 0xFF;
+|                c2 = w2 >>u 16 & 0xFF;
+|                return c1 <u c2?-1 as &i31:1 as &i31;
+|            }
+|            c1 = w1 >>u 24;
+|            c2 = w2 >>u 24;
+|            return c1 <u c2?-1 as &i31:1 as &i31;
+|        } /* 'loop */
+|    } /* 'done */
 |    'loop: loop {
 |        if i <u len {
 |            c1 = dv_get_ui8(v1, pos1 + i);
 |            c2 = s2[pos2 + i] as i32_u;
 |            i = i + 1;
 |            br_if 'loop c1 == c2;
 |            return c1 <u c2?-1 as &i31:1 as &i31;
 |        }
 |    } /* 'loop */
 |    0 as &i31;
 |}
 |#[export = "caml_bigstring_memchr"]
 |fn caml_bigstring_memchr(s: &eq, vc: &eq, vpos: &eq, vlen: &eq) -> &eq {
 |    let pos: i32;
 |    let len: i32;
 |    let c: i32;
 |    let v: &extern;
+|    let mask: i32;
+|    let word: i32;
+|    let xored: i32;
 |    c = vc as &i31 as i32_s;
 |    pos = vpos as &i31 as i32_s;
 |    len = vlen as &i31 as i32_s;
 |    v = caml_ba_get_view(s);
+|    mask = c * 0x01010101;
+|    'done: do {
+|        'loop: loop {
+|            br_if 'done len <s 4;
+|            word = dv_get_i32_unaligned(v, pos, 1);
+|            xored = word ^ mask;
+|            if xored - 0x01010101 & (xored ^ -1) & 0x80808080 {
+|                if !(xored & 0xFF) { return pos as &i31; }
+|                if !(xored & 0xFF00) { return (pos + 1) as &i31; }
+|                if !(xored & 0xFF0000) { return (pos + 2) as &i31; }
+|                return (pos + 3) as &i31;
+|            }
+|            pos = pos + 4;
+|            len = len - 4;
+|            br 'loop;
+|        } /* 'loop */
+|    } /* 'done */
 |    'loop: loop {
 |        if len >s 0 {
 |            if c == dv_get_ui8(v, pos) { return pos as &i31; }
 |            len = len - 1;
 |            pos = pos + 1;
 |            br 'loop;
 |        }
 |    } /* 'loop */
 |    -1 as &i31;
 |}
 |#[export = "caml_bigstring_memrchr"]
 |fn caml_bigstring_memrchr(s: &eq, vc: &eq, vpos: &eq, vlen: &eq) -> &eq {
 |    let pos: i32;
 |    let len: i32;
 |    let c: i32;
 |    let cur: i32;
 |    let v: &extern;
+|    let mask: i32;
+|    let word: i32;
+|    let xored: i32;
 |    c = vc as &i31 as i32_s;
 |    pos = vpos as &i31 as i32_s;
 |    len = vlen as &i31 as i32_s;
 |    v = caml_ba_get_view(s);
 |    cur = pos + len - 1;
+|    mask = c * 0x01010101;
+|    'loop: loop {
+|        if cur - pos >=s 3 {
+|            word = dv_get_i32_unaligned(v, cur - 3, 1);
+|            xored = word ^ mask;
+|            if xored - 0x01010101 & (xored ^ -1) & 0x80808080 {
+|                if !(xored & 0xFF000000) { return cur as &i31; }
+|                if !(xored & 0xFF0000) { return (cur - 1) as &i31; }
+|                if !(xored & 0xFF00) { return (cur - 2) as &i31; }
+|                return (cur - 3) as &i31;
+|            }
+|            cur = cur - 4;
+|            br 'loop;
+|        }
+|    } /* 'loop */
 |    'loop: loop {
 |        if cur >=s pos {
 |            if c == dv_get_ui8(v, cur) { return cur as &i31; }
 |            cur = cur - 1;
 |            br 'loop;
 |        }
 |    } /* 'loop */
 |    -1 as &i31;
 |}
 |#[export = "caml_bigstring_strncmp"]
 |fn caml_bigstring_strncmp
 |(vs1: &eq, vpos1: &eq, vs2: &eq, vpos2: &eq, vlen: &eq) -> &eq {
 |    let v1: &extern;
 |    let v2: &extern;
 |    let pos1: i32;
 |    let pos2: i32;
 |    let len: i32;
 |    let i: i32;
 |    let c1: i32;
 |    let c2: i32;
+|    let w1: i32;
+|    let w2: i32;
+|    let xored: i32;
 |    v1 = caml_ba_get_view(vs1);
 |    v2 = caml_ba_get_view(vs2);
 |    pos1 = vpos1 as &i31 as i32_s;
 |    pos2 = vpos2 as &i31 as i32_s;
 |    len = vlen as &i31 as i32_s;
+|    'done: do {
+|        'loop: loop {
+|            br_if 'done i + 4 >u len;
+|            w1 = dv_get_i32_unaligned(v1, pos1 + i, 1);
+|            w2 = dv_get_i32_unaligned(v2, pos2 + i, 1);
+|            if w1 == w2 {
+|                if w1 - 0x01010101 & (w1 ^ -1) & 0x80808080 {
+|                    return 0 as &i31;
+|                }
+|                i = i + 4;
+|                br 'loop;
+|            }
+|            xored = w1 ^ w2;
+|            c1 = w1 & 0xFF;
+|            c2 = w2 & 0xFF;
+|            if xored & 0xFF | !c1 {
+|                if c1 <u c2 { return -1 as &i31; }
+|                if c1 >u c2 { return 1 as &i31; }
+|                return 0 as &i31;
+|            }
+|            c1 = w1 >>u 8 & 0xFF;
+|            c2 = w2 >>u 8 & 0xFF;
+|            if xored & 0xFF00 | !c1 {
+|                if c1 <u c2 { return -1 as &i31; }
+|                if c1 >u c2 { return 1 as &i31; }
+|                return 0 as &i31;
+|            }
+|            c1 = w1 >>u 16 & 0xFF;
+|            c2 = w2 >>u 16 & 0xFF;
+|            if xored & 0xFF0000 | !c1 {
+|                if c1 <u c2 { return -1 as &i31; }
+|                if c1 >u c2 { return 1 as &i31; }
+|                return 0 as &i31;
+|            }
+|            c1 = w1 >>u 24;
+|            c2 = w2 >>u 24;
+|            if c1 <u c2 { return -1 as &i31; }
+|            if c1 >u c2 { return 1 as &i31; }
+|            return 0 as &i31;
+|        } /* 'loop */
+|    } /* 'done */
 |    'loop: loop {
 |        if i <u len {
 |            c1 = dv_get_ui8(v1, pos1 + i);
 |            c2 = dv_get_ui8(v2, pos2 + i);
 |            i = i + 1;
 |            if c1 <u c2 { return -1 as &i31; }
 |            if c1 >u c2 { return 1 as &i31; }
 |            if c1 == 0 { return 0 as &i31; }
 |            br 'loop;
 |        }
 |    } /* 'loop */
 |    0 as &i31;
 |}
 |#[export = "caml_bigstring_blit_bytes_to_ba"]
 |#[export = "caml_bigstring_blit_string_to_ba"]
 |fn caml_bigstring_blit_bytes_to_ba


const on_windows = isNode && globalThis.process.platform === "win32";

const isV8 = new Error().stack?.includes("\n at ") ?? false;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a source for this trick? Should we add a test somewhere to spot if it ever changes?

dv_get_f32: call.bind(DV.getFloat32),
dv_get_i64: call.bind(DV.getBigInt64),
dv_get_i32: call.bind(DV.getInt32),
dv_get_i32: isV8 ? call.bind(DV.getInt32) : (x, y, z) => x.getInt32(y, z),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need a comment in the code explaining this, together with a date or the engine versions for which this was true. A tiny benchmark that shows the difference between the two would also help, so that one can check the optimization is still accurate.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants