Commit 6ef187f

feat!: auto-shard based on node size (#171)
JS counterpart to ipfs/kubo#8114. Changes the `shardSplitThreshold` parameter to mean the size of the final DAGNode (including link names, sizes, etc.) instead of the number of entries in the directory.

Fixes: #149

BREAKING CHANGE: the `shardSplitThreshold` option has been renamed to `shardSplitThresholdBytes` and now reflects the DAGNode size at which sharding may kick in.
1 parent 5ebc923

16 files changed: +212 -47 lines
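
Before the per-file diffs, a minimal sketch of the renamed option in use. Assumptions: an ESM context with top-level await, `MemoryBlockstore` from blockstore-core (a devDependency of these packages) standing in for whatever blockstore the caller provides, and 262144 being the 256KiB default mentioned in the README change below.

```js
import { importer } from 'ipfs-unixfs-importer'
import { MemoryBlockstore } from 'blockstore-core/memory'

const blockstore = new MemoryBlockstore()

// shardSplitThresholdBytes replaces shardSplitThreshold: it bounds the
// serialized size of the directory DAGNode rather than its entry count
for await (const entry of importer([{
  path: 'dir/file.txt',
  content: new TextEncoder().encode('hello world')
}], blockstore, {
  wrapWithDirectory: true,
  shardSplitThresholdBytes: 262144 // 256 KiB, the new default
})) {
  console.log(entry.path, entry.cid.toString())
}
```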

Diff for: packages/ipfs-unixfs-exporter/package.json (-3)

```diff
@@ -169,17 +169,14 @@
   },
   "devDependencies": {
     "@types/sinon": "^10.0.0",
-    "abort-controller": "^3.0.0",
     "aegir": "^38.1.2",
     "blockstore-core": "^3.0.0",
-    "crypto-browserify": "^3.12.0",
     "delay": "^5.0.0",
     "ipfs-unixfs-importer": "^12.0.0",
     "it-all": "^2.0.0",
     "it-buffer-stream": "^3.0.0",
     "it-first": "^2.0.0",
     "merge-options": "^3.0.4",
-    "native-abort-controller": "^1.0.3",
     "sinon": "^15.0.0"
   },
   "browser": {
```

Diff for: packages/ipfs-unixfs-exporter/test/exporter-sharded.spec.js (+4 -3)

```diff
@@ -45,7 +45,7 @@ describe('exporter sharded', function () {
    */
   const createShardWithFiles = async (files) => {
     const result = await last(importer(files, block, {
-      shardSplitThreshold: SHARD_SPLIT_THRESHOLD,
+      shardSplitThresholdBytes: SHARD_SPLIT_THRESHOLD,
       wrapWithDirectory: true
     }))
 
@@ -60,7 +60,8 @@ describe('exporter sharded', function () {
     /** @type {{ [key: string]: { content: Uint8Array, cid?: CID }}} */
     const files = {}
 
-    for (let i = 0; i < (SHARD_SPLIT_THRESHOLD + 1); i++) {
+    // needs to result in a block that is larger than SHARD_SPLIT_THRESHOLD bytes
+    for (let i = 0; i < 100; i++) {
       files[`file-${Math.random()}.txt`] = {
         content: uint8ArrayConcat(await all(randomBytes(100)))
       }
@@ -71,7 +72,7 @@ describe('exporter sharded', function () {
       content: asAsyncIterable(files[path].content)
     })), block, {
       wrapWithDirectory: true,
-      shardSplitThreshold: SHARD_SPLIT_THRESHOLD
+      shardSplitThresholdBytes: SHARD_SPLIT_THRESHOLD
     }))
 
     const dirCid = imported.pop()?.cid
```

Diff for: packages/ipfs-unixfs-exporter/test/exporter.spec.js (-1)

```diff
@@ -14,7 +14,6 @@ import all from 'it-all'
 import last from 'it-last'
 import first from 'it-first'
 import randomBytes from 'it-buffer-stream'
-import { AbortController } from 'native-abort-controller'
 import blockApi from './helpers/block.js'
 import { concat as uint8ArrayConcat } from 'uint8arrays/concat'
 import { fromString as uint8ArrayFromString } from 'uint8arrays/from-string'
```

Diff for: packages/ipfs-unixfs-exporter/test/import-export-dir-sharding.spec.js (+4 -4)

```diff
@@ -26,7 +26,7 @@ describe('builder: directory sharding', () => {
       path: 'a/b',
       content: asAsyncIterable(content)
     }], block, {
-      shardSplitThreshold: Infinity // never shard
+      shardSplitThresholdBytes: Infinity // never shard
     }))
 
     expect(nodes.length).to.equal(2)
@@ -62,7 +62,7 @@ describe('builder: directory sharding', () => {
       path: 'a/b',
       content: asAsyncIterable(uint8ArrayFromString('i have the best bytes'))
     }], block, {
-      shardSplitThreshold: 0 // always shard
+      shardSplitThresholdBytes: 0 // always shard
     }))
 
     expect(nodes.length).to.equal(2)
@@ -84,7 +84,7 @@ describe('builder: directory sharding', () => {
       path: 'a/b',
       content: asAsyncIterable(uint8ArrayFromString(content))
     }], block, {
-      shardSplitThreshold: Infinity // never shard
+      shardSplitThresholdBytes: Infinity // never shard
     }))
 
     const nonShardedHash = nodes[1].cid
@@ -121,7 +121,7 @@ describe('builder: directory sharding', () => {
       path: 'a/b',
       content: asAsyncIterable(uint8ArrayFromString(content))
     }], block, {
-      shardSplitThreshold: 0 // always shard
+      shardSplitThresholdBytes: 0 // always shard
     }))
 
     const shardedHash = nodes[1].cid
```

Diff for: packages/ipfs-unixfs-exporter/test/importer.spec.js (+2 -2)

```diff
@@ -724,7 +724,7 @@ strategies.forEach((strategy) => {
       const options = {
         cidVersion: 1,
         // Ensures we use DirSharded for the data below
-        shardSplitThreshold: 3
+        shardSplitThresholdBytes: 3
       }
 
       const files = await all(importer(inputFiles.map(file => ({
@@ -941,7 +941,7 @@ strategies.forEach((strategy) => {
       }, {
         path: '/foo/qux'
       }], block, {
-        shardSplitThreshold: 0
+        shardSplitThresholdBytes: 0
       }))
 
       const nodes = await all(recursive(entries[entries.length - 1].cid, block))
```

Diff for: packages/ipfs-unixfs-importer/README.md (+1 -1)

```diff
@@ -120,7 +120,7 @@ The input's file paths and directory structure will be preserved in the [`dag-pb
 `options` is an JavaScript option that might include the following keys:
 
 - `wrapWithDirectory` (boolean, defaults to false): if true, a wrapping node will be created
-- `shardSplitThreshold` (positive integer, defaults to 1000): the number of directory entries above which we decide to use a sharding directory builder (instead of the default flat one)
+- `shardSplitThresholdBytes` (positive integer, defaults to 256KiB): if the serialized node is larger than this it might be converted to a HAMT sharded directory
 - `chunker` (string, defaults to `"fixed"`): the chunking strategy. Supports:
   - `fixed`
   - `rabin`
```

Diff for: packages/ipfs-unixfs-importer/package.json (+1 -1)

```diff
@@ -169,9 +169,9 @@
   },
   "devDependencies": {
     "aegir": "^38.1.2",
-    "assert": "^2.0.0",
     "blockstore-core": "^3.0.0",
     "it-buffer-stream": "^3.0.0",
+    "it-last": "^2.0.0",
     "wherearewe": "^2.0.1"
   },
   "browser": {
```

Diff for: packages/ipfs-unixfs-importer/src/dir-flat.js (+30 -18)

```diff
@@ -1,6 +1,6 @@
 import { encode, prepare } from '@ipld/dag-pb'
 import { UnixFS } from 'ipfs-unixfs'
-import Dir from './dir.js'
+import { Dir, CID_V0, CID_V1 } from './dir.js'
 import persist from './utils/persist.js'
 
 /**
@@ -21,8 +21,8 @@ class DirFlat extends Dir {
   constructor (props, options) {
     super(props, options)
 
-    /** @type {{ [key: string]: InProgressImportResult | Dir }} */
-    this._children = {}
+    /** @type {Map<string, InProgressImportResult | Dir>} */
+    this._children = new Map()
   }
 
   /**
@@ -32,53 +32,65 @@ class DirFlat extends Dir {
   async put (name, value) {
     this.cid = undefined
     this.size = undefined
+    this.nodeSize = undefined
 
-    this._children[name] = value
+    this._children.set(name, value)
   }
 
   /**
    * @param {string} name
    */
   get (name) {
-    return Promise.resolve(this._children[name])
+    return Promise.resolve(this._children.get(name))
   }
 
   childCount () {
-    return Object.keys(this._children).length
+    return this._children.size
   }
 
   directChildrenCount () {
     return this.childCount()
   }
 
   onlyChild () {
-    return this._children[Object.keys(this._children)[0]]
+    return this._children.values().next().value
   }
 
   async * eachChildSeries () {
-    const keys = Object.keys(this._children)
+    for (const [key, child] of this._children.entries()) {
+      yield {
+        key,
+        child
+      }
+    }
+  }
+
+  estimateNodeSize () {
+    if (this.nodeSize !== undefined) {
+      return this.nodeSize
+    }
 
-    for (let i = 0; i < keys.length; i++) {
-      const key = keys[i]
+    this.nodeSize = 0
 
-      yield {
-        key: key,
-        child: this._children[key]
+    // estimate size only based on DAGLink name and CID byte lengths
+    // https://github.com/ipfs/go-unixfsnode/blob/37b47f1f917f1b2f54c207682f38886e49896ef9/data/builder/directory.go#L81-L96
+    for (const [name, child] of this._children.entries()) {
+      if (child.size != null && child.cid) {
+        this.nodeSize += name.length + (this.options.cidVersion === 1 ? CID_V1.bytes.byteLength : CID_V0.bytes.byteLength)
       }
     }
+
+    return this.nodeSize
   }
 
   /**
    * @param {Blockstore} block
    * @returns {AsyncIterable<ImportResult>}
    */
   async * flush (block) {
-    const children = Object.keys(this._children)
     const links = []
 
-    for (let i = 0; i < children.length; i++) {
-      let child = this._children[children[i]]
-
+    for (let [name, child] of this._children.entries()) {
       if (child instanceof Dir) {
         for await (const entry of child.flush(block)) {
           child = entry
@@ -89,7 +101,7 @@ class DirFlat extends Dir {
 
       if (child.size != null && child.cid) {
         links.push({
-          Name: children[i],
+          Name: name,
          Tsize: child.size,
          Hash: child.cid
        })
```
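
The estimate above deliberately counts only link names and CID byte lengths. To make the arithmetic concrete, here is a standalone sketch; `estimateFlatNodeSize` is a hypothetical name, and the 34/36-byte figures assume the `CID_V0`/`CID_V1` placeholders are sha2-256 CIDs, which encode to those lengths.

```js
// Standalone sketch of the DirFlat.estimateNodeSize() arithmetic.
// Assumption: placeholder CIDs are sha2-256, encoding to 34 bytes
// (CIDv0) and 36 bytes (CIDv1 dag-pb) respectively.
const CID_V0_LENGTH = 34
const CID_V1_LENGTH = 36

function estimateFlatNodeSize (childNames, cidVersion = 0) {
  const cidLength = cidVersion === 1 ? CID_V1_LENGTH : CID_V0_LENGTH

  // per child: link name length + CID byte length, nothing else
  return childNames.reduce((total, name) => total + name.length + cidLength, 0)
}

console.log(estimateFlatNodeSize(['a.txt', 'b.txt'])) // 2 * (5 + 34) = 78
```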

Diff for: packages/ipfs-unixfs-importer/src/dir-sharded.js (+89 -2)

```diff
@@ -1,6 +1,6 @@
 import { encode, prepare } from '@ipld/dag-pb'
 import { UnixFS } from 'ipfs-unixfs'
-import Dir from './dir.js'
+import { Dir, CID_V0, CID_V1 } from './dir.js'
 import persist from './utils/persist.js'
 import { createHAMT, Bucket } from 'hamt-sharding'
 
@@ -35,6 +35,10 @@ class DirSharded extends Dir {
    * @param {InProgressImportResult | Dir} value
    */
   async put (name, value) {
+    this.cid = undefined
+    this.size = undefined
+    this.nodeSize = undefined
+
     await this._bucket.put(name, value)
   }
 
@@ -66,6 +70,16 @@ class DirSharded extends Dir {
     }
   }
 
+  estimateNodeSize () {
+    if (this.nodeSize !== undefined) {
+      return this.nodeSize
+    }
+
+    this.nodeSize = calculateSize(this._bucket, this, this.options)
+
+    return this.nodeSize
+  }
+
   /**
    * @param {Blockstore} blockstore
    * @returns {AsyncIterable<ImportResult>}
@@ -85,7 +99,7 @@ export default DirSharded
 /**
  * @param {Bucket<?>} bucket
  * @param {Blockstore} blockstore
- * @param {*} shardRoot
+ * @param {DirSharded | null} shardRoot
  * @param {ImporterOptions} options
  * @returns {AsyncIterable<ImportResult>}
  */
@@ -183,3 +197,76 @@ async function * flush (bucket, blockstore, shardRoot, options) {
     size
   }
 }
+
+/**
+ * @param {Bucket<?>} bucket
+ * @param {DirSharded | null} shardRoot
+ * @param {ImporterOptions} options
+ */
+function calculateSize (bucket, shardRoot, options) {
+  const children = bucket._children
+  const links = []
+
+  for (let i = 0; i < children.length; i++) {
+    const child = children.get(i)
+
+    if (!child) {
+      continue
+    }
+
+    const labelPrefix = i.toString(16).toUpperCase().padStart(2, '0')
+
+    if (child instanceof Bucket) {
+      const size = calculateSize(child, null, options)
+
+      links.push({
+        Name: labelPrefix,
+        Tsize: size,
+        Hash: options.cidVersion === 0 ? CID_V0 : CID_V1
+      })
+    } else if (typeof child.value.flush === 'function') {
+      const dir = child.value
+      const size = dir.nodeSize()
+
+      links.push({
+        Name: labelPrefix + child.key,
+        Tsize: size,
+        Hash: options.cidVersion === 0 ? CID_V0 : CID_V1
+      })
+    } else {
+      const value = child.value
+
+      if (!value.cid) {
+        continue
+      }
+
+      const label = labelPrefix + child.key
+      const size = value.size
+
+      links.push({
+        Name: label,
+        Tsize: size,
+        Hash: value.cid
+      })
+    }
+  }
+
+  // go-ipfs uses little endian, that's why we have to
+  // reverse the bit field before storing it
+  const data = Uint8Array.from(children.bitField().reverse())
+  const dir = new UnixFS({
+    type: 'hamt-sharded-directory',
+    data,
+    fanout: bucket.tableSize(),
+    hashType: options.hamtHashCode,
+    mtime: shardRoot && shardRoot.mtime,
+    mode: shardRoot && shardRoot.mode
+  })
+
+  const buffer = encode(prepare({
+    Data: dir.marshal(),
+    Links: links
+  }))
+
+  return buffer.length
+}
```
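
Both directory builders now expose `estimateNodeSize()`, which is presumably what the importer compares against the new option when deciding whether a flat directory should become a HAMT shard. That decision point is not part of this diff; a hedged sketch of the comparison, where `shouldShard` is a hypothetical name:

```js
// Hypothetical helper, not part of this commit: the actual check lives in
// the importer's flat-to-shard conversion path.
function shouldShard (dir, options) {
  return dir.estimateNodeSize() > options.shardSplitThresholdBytes
}
```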
