Consider using ctz for iteration. #19

modulovalue · 2023-12-22T13:03:50Z

Consider the following implementation of iteration over all set bits in a bit_set:

https://github.com/RoaringBitmap/CRoaring/blob/13407ae912cbe16d28f196966a1989b714b4996d/include/roaring/bitset/bitset.h#L255-L269

inline bool bitset_for_each(const bitset_t *b, bitset_iterator iterator,
                            void *ptr) {
    size_t base = 0;
    for (size_t i = 0; i < b->arraysize; ++i) {
        uint64_t w = b->array[i];
        while (w != 0) {
            uint64_t t = w & (~w + 1);
            int r = roaring_trailing_zeroes(w);
            if (!iterator(r + base, ptr)) return false;
            w ^= t;
        }
        base += 64;
    }
    return true;
}

it uses count trailing zeroes to improve efficiency.

Blocked by #17 & dart-lang/sdk#52673, or maybe there's an efficient-enough way to simulate count trailing zeroes (e.g. https://stackoverflow.com/questions/31233609/what-is-the-most-efficient-to-count-trailing-zeroes-in-an-integer).

modulovalue · 2023-12-22T23:35:14Z

Good news. That way to iterate appears to be much faster.

One benchmark takes 11 seconds with .asIntIterable() and 9 seconds with bitarrayForEach:

abstract class BitArrayOps {
  // reference: https://github.com/RoaringBitmap/CRoaring/blob/13407ae912cbe16d28f196966a1989b714b4996d/include/roaring/bitset/bitset.h#L255-L269
  static void bitarrayForEach(final BitArray a, final void Function(int) iterator) {
    int base = 0;
    for (int i = 0; i < a._data.length; ++i) {
        int w = a._data[i];
        while (w != 0) {
            int t = w & (~w + 1);
            int r = w.trailingZeros;
            iterator(r + base);
            w ^= t;
        }
        base += 32;
    }
  }
}

// https://github.com/imoldfella/query/blob/3c327b034d81b51132b3aecc7c9c8ecb63166ec2/lib/src/ef/select.dart#L19
extension on int {
  // https://stackoverflow.com/questions/45221914/number-of-trailing-zeroes
  int get trailingZeros {
    int bits = 0, x = this;
    if (x > 0) {
      /* assuming `x` has 32 bits: lets count the low order 0 bits in batches */
      /* mask the 16 low order bits, add 16 and shift them out if they are all 0 */
      if (0 == (x & 0x0000FFFF)) {
        bits += 16;
        x >>= 16;
      }
      /* mask the 8 low order bits, add 8 and shift them out if they are all 0 */
      if (0 == (x & 0x000000FF)) {
        bits += 8;
        x >>= 8;
      }
      /* mask the 4 low order bits, add 4 and shift them out if they are all 0 */
      if (0 == (x & 0x0000000F)) {
        bits += 4;
        x >>= 4;
      }
      /* mask the 2 low order bits, add 2 and shift them out if they are all 0 */
      if (0 == (x & 0x00000003)) {
        bits += 2;
        x >>= 2;
      }
      /* mask the low order bit and add 1 if it is 0 */
      bits += (x & 1) ^ 1;
    } else {
      return 32;
    }
    return bits;
  }
}

modulovalue · 2023-12-23T15:22:52Z

That stack overflow solution appears to use the "Count the consecutive zero bits (trailing) on the right by binary search" trick from Bit Twiddling hacks.

I think it might be worth investigating the other bit tricks for calculating ctz.

modulovalue · 2023-12-23T15:43:23Z

Here are some results from comparing different ctz implementations from bit twiddling hacks:

Measure: run: took: 6552.89700 ms
Measure: run: took: 6534.60500 ms
Measure: run: took: 6518.90600 ms

  // https://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup
  int get trailingZeros {
    const MultiplyDeBruijnBitPosition = [
      0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
      31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
    ];
    return MultiplyDeBruijnBitPosition[(((this & -this) * 0x077CB531).toUnsigned(32)) >> 27];
  }

Measure: run: took: 7823.70000 ms
Measure: run: took: 6757.05700 ms
Measure: run: took: 6757.66400 ms
Measure: run: took: 6739.62400 ms

// https://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightModLookup
int get trailingZeros {
  return const [
    32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13, 4,
    7, 17, 0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9, 5,
    20, 8, 19, 18
  ][(-this & this) % 37];
}

Measure: run: took: 6370.41900 ms
Measure: run: took: 6391.09100 ms
Measure: run: took: 6359.07100 ms
Measure: run: took: 6344.87400 ms

// https://stackoverflow.com/questions/45221914/number-of-trailing-zeroes
int get trailingZeros {
  int bits = 0;
  int x = this;
  if (x > 0) {
    /* assuming `x` has 32 bits: lets count the low order 0 bits in batches */
    /* mask the 16 low order bits, add 16 and shift them out if they are all 0 */
    if (0 == (x & 0x0000FFFF)) {
      bits += 16;
      x >>= 16;
    }
    /* mask the 8 low order bits, add 8 and shift them out if they are all 0 */
    if (0 == (x & 0x000000FF)) {
      bits += 8;
      x >>= 8;
    }
    /* mask the 4 low order bits, add 4 and shift them out if they are all 0 */
    if (0 == (x & 0x0000000F)) {
      bits += 4;
      x >>= 4;
    }
    /* mask the 2 low order bits, add 2 and shift them out if they are all 0 */
    if (0 == (x & 0x00000003)) {
      bits += 2;
      x >>= 2;
    }
    /* mask the low order bit and add 1 if it is 0 */
    bits += (x & 1) ^ 1;
  } else {
    return 32;
  }
  return bits;
}

Measure: run: took: 6307.44100 ms
Measure: run: took: 6323.07000 ms
Measure: run: took: 6313.07700 ms
Measure: run: took: 6323.30600 ms

// https://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightBinSearch
int get trailingZeros {
  int v = this; // 32-bit word input to count zero bits on right
  int c; // c will be the number of zero bits on the right,
  // so if v is 1101000 (base 2), then c will be 3
  // NOTE: if 0 == v, then c = 31.
  if ((v & 0x1) > 0) {
    // special case for odd v (assumed to happen half of the time)
    c = 0;
  } else {
    c = 1;
    if ((v & 0xffff) == 0) {
      v >>= 16;
      c += 16;
    }
    if ((v & 0xff) == 0) {
      v >>= 8;
      c += 8;
    }
    if ((v & 0xf) == 0) {
      v >>= 4;
      c += 4;
    }
    if ((v & 0x3) == 0) {
      v >>= 2;
      c += 2;
    }
    c -= v & 0x1;
  }
  return c;
}

Not a huge difference.

modulovalue · 2023-12-23T16:44:42Z

Lemire uses ctz for his forEach in CRoaring and popcount for his forEach in FastBitSet:

https://github.com/lemire/FastBitSet.js/blob/d673f8ed91b4eb4e0104881aa1ff0402c3203648/FastBitSet.js#L183-L194

It would probably make sense to compare both approaches. Various approaches to popcount in Dart can be found in #20.

modulovalue · 2023-12-23T17:55:27Z

Using popcount (as displayed below) appears to be slower than ctz:

// BitArray
// ...
 void forEachSet(
    void Function(int index) fn,
  ) {
    final c = _data.length;
    for (int k = 0; k < c; ++k) {
      int w = _data[k];
      while (w != 0) {
        final t = w & -w;
        fn((k << 5) + _count_bits_parallel_32_bits((t - 1) | 0));
        w ^= t;
      }
    }
  }
}

int _count_bits_parallel_32_bits(
  final int value,
) {
  // In binary: 01010101010101010101010101010101
  const alternating_1 = 0x55555555;
  // In binary: 00110011001100110011001100110011
  const alternating_2 = 0x33333333;
  // In binary: 00001111000011110000111100001111
  const alternating_3 = 0x0F0F0F0F;
  // In binary: 00000000111111110000000011111111
  const alternating_4 = 0x00FF00FF;
  // In binary: 00000000000000001111111111111111
  const alternating_5 = 0x0000FFFF;
  final a = value >> 1;
  final b = value - (a & alternating_1);
  final c = ((b >> 2) & alternating_2) + (b & alternating_2);
  final d = ((c >> 4) + c) & alternating_3;
  final e = ((d >> 8) + d) & alternating_4;
  final f = ((e >> 16) + e) & alternating_5;
  return f;
}

modulovalue mentioned this issue Dec 23, 2023

Performance tracking issue #24

Open

modulovalue mentioned this issue Dec 23, 2023

Investigate loop unrolling. #25

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Consider using ctz for iteration. #19

Consider using ctz for iteration. #19

modulovalue commented Dec 22, 2023

modulovalue commented Dec 22, 2023

modulovalue commented Dec 23, 2023

modulovalue commented Dec 23, 2023

modulovalue commented Dec 23, 2023

modulovalue commented Dec 23, 2023

Consider using ctz for iteration. #19

Consider using ctz for iteration. #19

Comments

modulovalue commented Dec 22, 2023

modulovalue commented Dec 22, 2023

modulovalue commented Dec 23, 2023

modulovalue commented Dec 23, 2023

modulovalue commented Dec 23, 2023

modulovalue commented Dec 23, 2023