Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions perf/perf-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,20 @@ function iterateValues(table) {
names.forEach(name => Array.from(table.getChild(name)));
}

// Benchmark: fetch every value of every column through indexed lookups.
// With multiple record batches each lookup is expected to cost more,
// because the owning batch is found by binary search over the offsets.
function randomAccess(table) {
  const { numRows: rowCount, numCols: columnCount } = table;
  // one slot per column; each holds the most recently read value
  const latest = new Array(columnCount);
  let c = 0;
  while (c < columnCount) {
    const column = table.getChildAt(c);
    let r = 0;
    while (r < rowCount) {
      latest[c] = column.at(r);
      r += 1;
    }
    c += 1;
  }
}

// generate row objects, access each property
function visitObjects(table) {
const nr = table.numRows;
Expand Down Expand Up @@ -58,6 +72,7 @@ async function run(file) {
trial('Parse Table from IPC', file, bytes, parseIPC, 10);
trial('Extract Arrays', file, bytes, extractArrays, 10);
trial('Iterate Values', file, bytes, iterateValues, 10);
trial('Random Access', file, bytes, randomAccess, 10);
trial('Visit Row Objects', file, bytes, visitObjects, 5);
console.log();
}
Expand Down
27 changes: 14 additions & 13 deletions src/column.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { isDirectBatch } from './batch.js';
import { bisectOffsets } from './util.js';

/**
* Build up a column from batches.
Expand Down Expand Up @@ -99,25 +100,25 @@ export class Column {
* lookup less efficient than a standard array access. If making a full
* scan of a column, consider extracting arrays via `toArray()` or using an
* iterator (`for (const value of column) {...}`).
* @param {number} index The index
* @param {number} index The row index.
* @returns {T | null} The value.
*/
at(index) {
// NOTE: if there is only one batch, this method is replaced with an
// optimized version within the Column constructor.
// optimized version in the Column constructor.
const { data, offsets } = this;
const i = bisectOffsets(offsets, index);
return data[i]?.at(index - offsets[i]); // undefined if out of range
}

// binary search for batch index
let a = 0;
let b = offsets.length;
do {
const mid = (a + b) >>> 1;
if (offsets[mid] <= index) a = mid + 1;
else b = mid;
} while (a < b);

// returns undefined if index is out of range
return data[--a]?.at(index - offsets[a]);
/**
* Return the column value at the given index. This method is the same as
* `at()` and is provided for better compatibility with Apache Arrow JS.
* @param {number} index The row index.
* @returns {T | null} The value.
*/
get(index) {
return this.at(index);
}

/**
Expand Down
70 changes: 43 additions & 27 deletions src/table.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { bisectOffsets } from './util.js';

/**
* A table consists of a collection of named columns (or 'children').
* To work with table data directly in JavaScript, use `toColumns()`
Expand Down Expand Up @@ -110,42 +112,57 @@ export class Table {
return cols;
}

/**
* Return an array of objects representing the rows of this table.
* @returns {Record<string, any>[]}
*/
toArray() {
const { children, numRows, names } = this;
const data = children[0]?.data ?? [];
const output = Array(numRows);
for (let b = 0, row = -1; b < data.length; ++b) {
for (let i = 0; i < data[b].length; ++i) {
output[++row] = rowObject(names, children, b, i);
}
}
return output;
}

/**
* Return an iterator over objects representing the rows of this table.
* @returns {Generator<Record<string, any>, any, null>}
*/
*[Symbol.iterator]() {
const { children, names } = this;
const batches = children[0]?.data.length ?? 0;
// for each batch...
for (let b = 0; b < batches; ++b) {
const data = children.map(c => c.data[b]);
const rows = data[0].length;
// for each row...
for (let i = 0; i < rows; ++i) {
yield rowObject(names, data, i);
const data = children[0]?.data ?? [];
for (let b = 0; b < data.length; ++b) {
for (let i = 0; i < data[b].length; ++i) {
yield rowObject(names, children, b, i);
}
}
}

/**
* Return an array of objects representing the rows of this table.
* @returns {Record<string, any>[]}
* Return a row object for the given index.
* @param {number} index The row index.
* @returns {Record<string, any>} The row object.
*/
toArray() {
const { children, numRows, names } = this;
const batches = children[0]?.data.length ?? 0;
const output = Array(numRows);
// for each batch...
for (let b = 0, row = -1; b < batches; ++b) {
const data = children.map(c => c.data[b]);
const rows = data?.[0].length;
// for each row...
for (let i = 0; i < rows; ++i) {
output[++row] = rowObject(names, data, i);
}
}
return output;
at(index) {
const { names, children, numRows } = this;
if (index < 0 || index >= numRows) return null;
const [{ offsets }] = children;
const i = bisectOffsets(offsets, index);
return rowObject(names, children, i, index - offsets[i]);
}

/**
* Return a row object for the given index. This method is the same as
* `at()` and is provided for better compatibility with Apache Arrow JS.
* @param {number} index The row index.
* @returns {Record<string, any>} The row object.
*/
get(index) {
return this.at(index);
}
}

Expand All @@ -155,11 +172,10 @@ function renameField(field, name) {
: field;
}

function rowObject(names, data, index) {
function rowObject(names, children, batch, index) {
const o = {};
// for each column...
for (let j = 0; j < names.length; ++j) {
o[names[j]] = data[j].at(index);
o[names[j]] = children[j].data[batch].at(index);
}
return o;
}
23 changes: 23 additions & 0 deletions src/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,29 @@ export function divide(num, div) {
return toNumber(num / div) + toNumber(num % div) / toNumber(div);
}

/**
 * Locate the entry of an offsets array that covers a given row index of
 * the full column.
 * @param {Int32Array} offsets The offsets array.
 * @param {number} index The full column row index.
 * @returns {number} For ascending offsets, the position of the last
 *  offset that is less than or equal to the index, or -1 if there is none.
 */
export function bisectOffsets(offsets, index) {
  // Binary search for the first offset strictly greater than the index.
  // Halving is done with an unsigned right shift, which is fast and relies
  // on offsets.length <= Math.pow(2, 31); more record batches than that
  // is not a realistic workload in JS.
  let lo = 0;
  let hi = offsets.length;
  while (lo < hi) {
    const mid = (lo + hi) >>> 1;
    if (offsets[mid] <= index) {
      lo = mid + 1;
    } else {
      hi = mid;
    }
  }

  // step back from the first greater offset to the covering one
  return lo - 1;
}

// -- flatbuffer utilities -----

/**
Expand Down
24 changes: 19 additions & 5 deletions test/table-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,42 @@ const values = [
const table = tableFromIPC(await arrowFromDuckDB(values));

describe('Table', () => {
it('provides row count', async () => {
it('provides row count', () => {
assert.deepStrictEqual(table.numRows, 3);
});

it('provides column count', async () => {
it('provides column count', () => {
assert.deepStrictEqual(table.numCols, 1);
});

it('provides child column accessors', async () => {
it('provides child column accessors', () => {
const col = table.getChild('value');
assert.strictEqual(col, table.getChildAt(0));
assert.deepStrictEqual(col.toArray(), values);
});

it('provides object array', async () => {
it('provides object array', () => {
assert.deepStrictEqual(table.toArray(), values.map(value => ({ value })));
});

it('provides column array map', async () => {
it('provides column array map', () => {
assert.deepStrictEqual(table.toColumns(), { value: values });
});

it('provides random access via at/get', () => {
const idx = [0, 1, 2];

// table object random access
const obj = values.map(value => ({ value }));
assert.deepStrictEqual(idx.map(i => table.at(i)), obj);
assert.deepStrictEqual(idx.map(i => table.get(i)), obj);

// column value random access
const col = table.getChildAt(0);
assert.deepStrictEqual(idx.map(i => col.at(i)), values);
assert.deepStrictEqual(idx.map(i => col.get(i)), values);
});

it('provides select by index', async () => {
const sel = table.selectAt([0, 0]);
const col = table.getChild('value');
Expand Down