Skip to content

Commit c8ec41f

Browse files
authored
Add random access for row objects, get methods for compatibility (#5)
* feat: Add get methods, inc. row object.
* test: Add random access perf tests.
1 parent cae4e17 commit c8ec41f

File tree

5 files changed

+114
-45
lines changed

5 files changed

+114
-45
lines changed

perf/perf-test.js

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,20 @@ function iterateValues(table) {
2828
names.forEach(name => Array.from(table.getChild(name)));
2929
}
3030

31+
// random access to each column value
32+
// this will be slower if there are multiple record batches
33+
// due to the need for binary search over the offsets array
34+
function randomAccess(table) {
35+
const { numRows, numCols } = table;
36+
const vals = Array(numCols);
37+
for (let j = 0; j < numCols; ++j) {
38+
const col = table.getChildAt(j);
39+
for (let i = 0; i < numRows; ++i) {
40+
vals[j] = col.at(i);
41+
}
42+
}
43+
}
44+
3145
// generate row objects, access each property
3246
function visitObjects(table) {
3347
const nr = table.numRows;
@@ -58,6 +72,7 @@ async function run(file) {
5872
trial('Parse Table from IPC', file, bytes, parseIPC, 10);
5973
trial('Extract Arrays', file, bytes, extractArrays, 10);
6074
trial('Iterate Values', file, bytes, iterateValues, 10);
75+
trial('Random Access', file, bytes, randomAccess, 10);
6176
trial('Visit Row Objects', file, bytes, visitObjects, 5);
6277
console.log();
6378
}

src/column.js

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { isDirectBatch } from './batch.js';
2+
import { bisectOffsets } from './util.js';
23

34
/**
45
* Build up a column from batches.
@@ -99,25 +100,25 @@ export class Column {
99100
* lookup less efficient than a standard array access. If making a full
100101
* scan of a column, consider extracting arrays via `toArray()` or using an
101102
* iterator (`for (const value of column) {...}`).
102-
* @param {number} index The index
103+
* @param {number} index The row index.
103104
* @returns {T | null} The value.
104105
*/
105106
at(index) {
106107
// NOTE: if there is only one batch, this method is replaced with an
107-
// optimized version within the Column constructor.
108+
// optimized version in the Column constructor.
108109
const { data, offsets } = this;
110+
const i = bisectOffsets(offsets, index);
111+
return data[i]?.at(index - offsets[i]); // undefined if out of range
112+
}
109113

110-
// binary search for batch index
111-
let a = 0;
112-
let b = offsets.length;
113-
do {
114-
const mid = (a + b) >>> 1;
115-
if (offsets[mid] <= index) a = mid + 1;
116-
else b = mid;
117-
} while (a < b);
118-
119-
// returns undefined if index is out of range
120-
return data[--a]?.at(index - offsets[a]);
114+
/**
115+
* Return the column value at the given index. This method is the same as
116+
* `at()` and is provided for better compatibility with Apache Arrow JS.
117+
* @param {number} index The row index.
118+
* @returns {T | null} The value.
119+
*/
120+
get(index) {
121+
return this.at(index);
121122
}
122123

123124
/**

src/table.js

Lines changed: 43 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import { bisectOffsets } from './util.js';
2+
13
/**
24
* A table consists of a collection of named columns (or 'children').
35
* To work with table data directly in JavaScript, use `toColumns()`
@@ -110,42 +112,57 @@ export class Table {
110112
return cols;
111113
}
112114

115+
/**
116+
* Return an array of objects representing the rows of this table.
117+
* @returns {Record<string, any>[]}
118+
*/
119+
toArray() {
120+
const { children, numRows, names } = this;
121+
const data = children[0]?.data ?? [];
122+
const output = Array(numRows);
123+
for (let b = 0, row = -1; b < data.length; ++b) {
124+
for (let i = 0; i < data[b].length; ++i) {
125+
output[++row] = rowObject(names, children, b, i);
126+
}
127+
}
128+
return output;
129+
}
130+
113131
/**
114132
* Return an iterator over objects representing the rows of this table.
115133
* @returns {Generator<Record<string, any>, any, null>}
116134
*/
117135
*[Symbol.iterator]() {
118136
const { children, names } = this;
119-
const batches = children[0]?.data.length ?? 0;
120-
// for each batch...
121-
for (let b = 0; b < batches; ++b) {
122-
const data = children.map(c => c.data[b]);
123-
const rows = data[0].length;
124-
// for each row...
125-
for (let i = 0; i < rows; ++i) {
126-
yield rowObject(names, data, i);
137+
const data = children[0]?.data ?? [];
138+
for (let b = 0; b < data.length; ++b) {
139+
for (let i = 0; i < data[b].length; ++i) {
140+
yield rowObject(names, children, b, i);
127141
}
128142
}
129143
}
130144

131145
/**
132-
* Return an array of objects representing the rows of this table.
133-
* @returns {Record<string, any>[]}
146+
* Return a row object for the given index.
147+
* @param {number} index The row index.
148+
* @returns {Record<string, any> | null} The row object, or null if the index is out of range.
134149
*/
135-
toArray() {
136-
const { children, numRows, names } = this;
137-
const batches = children[0]?.data.length ?? 0;
138-
const output = Array(numRows);
139-
// for each batch...
140-
for (let b = 0, row = -1; b < batches; ++b) {
141-
const data = children.map(c => c.data[b]);
142-
const rows = data?.[0].length;
143-
// for each row...
144-
for (let i = 0; i < rows; ++i) {
145-
output[++row] = rowObject(names, data, i);
146-
}
147-
}
148-
return output;
150+
at(index) {
151+
const { names, children, numRows } = this;
152+
if (index < 0 || index >= numRows) return null;
153+
const [{ offsets }] = children;
154+
const i = bisectOffsets(offsets, index);
155+
return rowObject(names, children, i, index - offsets[i]);
156+
}
157+
158+
/**
159+
* Return a row object for the given index. This method is the same as
160+
* `at()` and is provided for better compatibility with Apache Arrow JS.
161+
* @param {number} index The row index.
162+
* @returns {Record<string, any> | null} The row object, or null if the index is out of range.
163+
*/
164+
get(index) {
165+
return this.at(index);
149166
}
150167
}
151168

@@ -155,11 +172,10 @@ function renameField(field, name) {
155172
: field;
156173
}
157174

158-
function rowObject(names, data, index) {
175+
function rowObject(names, children, batch, index) {
159176
const o = {};
160-
// for each column...
161177
for (let j = 0; j < names.length; ++j) {
162-
o[names[j]] = data[j].at(index);
178+
o[names[j]] = children[j].data[batch].at(index);
163179
}
164180
return o;
165181
}

src/util.js

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,29 @@ export function divide(num, div) {
5757
return toNumber(num / div) + toNumber(num % div) / toNumber(div);
5858
}
5959

60+
/**
61+
* Determine the correct index into an offset array for a given
62+
* full column row index.
63+
* @param {Int32Array} offsets The offsets array.
64+
* @param {number} index The full column row index.
65+
*/
66+
export function bisectOffsets(offsets, index) {
67+
// binary search for batch index
68+
// we use a fast unsigned bit shift for division by two
69+
// this assumes offsets.length <= Math.pow(2, 31), which seems safe
70+
// otherwise that is a whole lotta record batches to handle in JS...
71+
let a = 0;
72+
let b = offsets.length;
73+
do {
74+
const mid = (a + b) >>> 1;
75+
if (offsets[mid] <= index) a = mid + 1;
76+
else b = mid;
77+
} while (a < b);
78+
79+
// decrement to the desired offset array index
80+
return --a;
81+
}
82+
6083
// -- flatbuffer utilities -----
6184

6285
/**

test/table-test.js

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,28 +12,42 @@ const values = [
1212
const table = tableFromIPC(await arrowFromDuckDB(values));
1313

1414
describe('Table', () => {
15-
it('provides row count', async () => {
15+
it('provides row count', () => {
1616
assert.deepStrictEqual(table.numRows, 3);
1717
});
1818

19-
it('provides column count', async () => {
19+
it('provides column count', () => {
2020
assert.deepStrictEqual(table.numCols, 1);
2121
});
2222

23-
it('provides child column accessors', async () => {
23+
it('provides child column accessors', () => {
2424
const col = table.getChild('value');
2525
assert.strictEqual(col, table.getChildAt(0));
2626
assert.deepStrictEqual(col.toArray(), values);
2727
});
2828

29-
it('provides object array', async () => {
29+
it('provides object array', () => {
3030
assert.deepStrictEqual(table.toArray(), values.map(value => ({ value })));
3131
});
3232

33-
it('provides column array map', async () => {
33+
it('provides column array map', () => {
3434
assert.deepStrictEqual(table.toColumns(), { value: values });
3535
});
3636

37+
it('provides random access via at/get', () => {
38+
const idx = [0, 1, 2];
39+
40+
// table object random access
41+
const obj = values.map(value => ({ value }));
42+
assert.deepStrictEqual(idx.map(i => table.at(i)), obj);
43+
assert.deepStrictEqual(idx.map(i => table.get(i)), obj);
44+
45+
// column value random access
46+
const col = table.getChildAt(0);
47+
assert.deepStrictEqual(idx.map(i => col.at(i)), values);
48+
assert.deepStrictEqual(idx.map(i => col.get(i)), values);
49+
});
50+
3751
it('provides select by index', async () => {
3852
const sel = table.selectAt([0, 0]);
3953
const col = table.getChild('value');

0 commit comments

Comments
 (0)