Skip to content

Commit 2454a7a

Browse files
fs-eirekleiti
authored and committed
[js/webgpu] support using uniform buffer (microsoft#17803)
### Description Support using uniform buffers. This PR allows uniform buffers to be used in shader programs, so that some runtime information (e.g. input/output shapes) no longer needs to be hardcoded into the shader code. There are 2 commits in this PR: - [667f31c](microsoft@667f31c): framework changes to support uniform buffers, as well as updates to the program manager, GPU data manager and indices helper. - [09e1d2a](microsoft@09e1d2a): an example change for operator `Transpose` to use only the input's rank, instead of its dims, as the shader key. With this change, the number of shader compilations for model mobilenetv2-12 dropped from 71 to 52.
1 parent 06afe7b commit 2454a7a

41 files changed

Lines changed: 1213 additions & 1205 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

js/web/lib/wasm/jsep/backend-webgpu.ts

Lines changed: 148 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,44 @@ import {createView, TensorView} from './tensor-view';
88
import {createGpuDataManager, downloadGpuData, GpuDataManager} from './webgpu/gpu-data-manager';
99
import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules';
1010
import {ProgramManager} from './webgpu/program-manager';
11-
import {ComputeContext, GpuData, ProgramInfo, ProgramInfoLoader} from './webgpu/types';
11+
import {ComputeContext, GpuData, ProgramInfo, ProgramInputTensorInfoDependency} from './webgpu/types';
12+
13+
/**
 * build the input-related portion of a program's cache key.
 *
 * each input tensor contributes one segment and the segments are joined by '|'. the content of a segment is decided
 * by the dependency declared for that input:
 * - 'none': an empty segment
 * - 'type': `<dataType>`
 * - 'rank': `<dataType>;<rank>`
 * - 'dims': `<dataType>;<dim0>,<dim1>,...`
 *
 * @param inputTensors the input tensors of the program.
 * @param inputDependencies one dependency per input tensor, in the same order as `inputTensors`.
 * @returns the joined key segments.
 * @throws Error if the two arrays differ in length, or if a dependency is not one of the four values listed above.
 */
const getProgramInputTensorInfoDependencyKey =
    (inputTensors: readonly TensorView[], inputDependencies: readonly ProgramInputTensorInfoDependency[]): string => {
      const tensorCount = inputTensors.length;
      const dependencyCount = inputDependencies.length;
      if (dependencyCount !== tensorCount) {
        throw new Error(`inputDependencies length ${dependencyCount} is not equal to inputTensors length ${tensorCount}.`);
      }

      const segments: string[] = [];
      for (const [index, tensor] of inputTensors.entries()) {
        // the data type is read up front for every input, including those whose dependency is 'none'.
        const dataType = tensor.dataType;
        const dependency = inputDependencies[index];

        if (dependency === 'none') {
          segments.push('');
          continue;
        }
        if (dependency === 'type') {
          segments.push(`${dataType}`);
          continue;
        }
        if (dependency === 'rank') {
          segments.push(`${dataType};${tensor.dims.length}`);
          continue;
        }
        if (dependency === 'dims') {
          segments.push(`${dataType};${tensor.dims.join(',')}`);
          continue;
        }

        // reached only when the value is outside the declared union (e.g. passed from untyped code).
        throw new Error(`unsupported input dependency: ${dependency}`);
      }

      return segments.join('|');
    };
1249

1350
/**
1451
* get a unique key representing the program from the program info, input shapes and types.
@@ -17,18 +54,20 @@ import {ComputeContext, GpuData, ProgramInfo, ProgramInfoLoader} from './webgpu/
1754
* program. if the key is the same, the program shader source should be the same, so we can reuse the program.
1855
*
1956
*/
20-
const getProgramInfoUniqueKey =
21-
(programInfo: ProgramInfo|ProgramInfoLoader, inputTensors: readonly TensorView[]): string => {
22-
// final key format:
23-
// <PROGRAM_NAME>[<PROGRAM_CUSTOM_CACHE_HINT>]:<INPUTS_INFO_0>|<INPUTS_INFO_1>|...
24-
const inputInfos = inputTensors.map(tensor => `${tensor.dataType};${tensor.dims.join(',')}`).join('|');
25-
let key = programInfo.name;
26-
if (programInfo.cacheHint) {
27-
key += '[' + programInfo.cacheHint + ']';
28-
}
29-
key += ':' + inputInfos;
30-
return key;
31-
};
57+
const getProgramInfoUniqueKey = (programInfo: ProgramInfo, inputTensors: readonly TensorView[]): string => {
  // final key format:
  // <PROGRAM_NAME>[<PROGRAM_CUSTOM_CACHE_HINT>]:<INPUTS_INFO_0>|<INPUTS_INFO_1>|...
  const shaderCache = programInfo.shaderCache;

  // the bracketed hint is emitted only when a non-empty hint is present.
  const hintPart = shaderCache?.hint ? `[${shaderCache.hint}]` : '';

  // when the program declares no input dependencies, every input is keyed on its full dims.
  const inputDependencies = shaderCache?.inputDependencies ??
      new Array<ProgramInputTensorInfoDependency>(inputTensors.length).fill('dims');
  const inputsPart = getProgramInputTensorInfoDependencyKey(inputTensors, inputDependencies);

  return `${programInfo.name}${hintPart}:${inputsPart}`;
};
3271

3372
/**
3473
* this class is designed to store status and to be used as a singleton for JSEP. It will be passed to jsepInit() as
@@ -208,55 +247,53 @@ export class WebGpuBackend {
208247

209248
/**
210249
* run a WebGPU program.
211-
* @param program either a ProgramInfo instance containing metadata including the shader code, or a function that
212-
* can be called and return a ProgramInfo instance
213-
* @param inputs a TensorView array. each element represents a value already exists in GPU.
250+
* @param program a ProgramInfo instance
251+
* @param inputTensorViews a TensorView array. each element represents a value that already exists on the GPU.
214252
* @param outputIndices an indices array. each element can be either -1 (temporary data), -2 (persistent data) or an
215253
* index to the kernel's output.
216254
* @param createKernelOutput a callback function that creates a value for the kernel's output with the given index
217255
* @param createIntermediateOutput a callback function that creates an intermediate value, either temporary
218256
* or persistent (owned by the current kernel)
219257
* @returns a TensorView array representing the result.
220258
*/
221-
run(program: ProgramInfoLoader|ProgramInfo, inputs: readonly TensorView[], outputIndices: readonly number[],
259+
run(program: ProgramInfo, inputTensorViews: readonly TensorView[], outputIndices: readonly number[],
222260
createKernelOutput: (index: number, dataType: number, dims: readonly number[]) => TensorView,
223261
createIntermediateOutput: (dataType: number, dims: readonly number[]) => TensorView): TensorView[] {
224-
if (inputs.length !== program.inputTypes.length) {
262+
if (inputTensorViews.length !== program.inputTypes.length) {
225263
throw new Error(`Input size must be equal to ${program.inputTypes.length}.`);
226264
}
227265

228266
// create info for inputs
229267
const inputDatas: GpuData[] = [];
230-
for (let i = 0; i < inputs.length; ++i) {
231-
const gpuData = this.gpuDataManager.get(inputs[i].data);
268+
for (let i = 0; i < inputTensorViews.length; ++i) {
269+
const gpuData = this.gpuDataManager.get(inputTensorViews[i].data);
232270
if (!gpuData) {
233-
throw new Error(`no GPU data for input: ${inputs[i].data}`);
271+
throw new Error(`no GPU data for input: ${inputTensorViews[i].data}`);
234272
}
235273
inputDatas[i] = gpuData;
236274
}
237275

238-
const key = getProgramInfoUniqueKey(program, inputs);
276+
// get program info
277+
const key = getProgramInfoUniqueKey(program, inputTensorViews);
239278
let artifact = this.programManager.getArtifact(key);
240-
const programInfo = artifact ?
241-
artifact.programInfo :
242-
(typeof (program as ProgramInfoLoader).get === 'function' ? (program as ProgramInfoLoader).get() :
243-
(program as ProgramInfo));
279+
280+
const {outputs, dispatchGroup, variables} = program.getRunData(inputTensorViews);
244281

245282
// check output indices
246-
const validatedOutputIndices = outputIndices.length === 0 ? programInfo.outputs.map((_, i) => i) : outputIndices;
247-
if (validatedOutputIndices.length !== programInfo.outputs.length) {
248-
throw new Error(`Output size ${validatedOutputIndices.length} must be equal to ${programInfo.outputs.length}.`);
283+
const validatedOutputIndices = outputIndices.length === 0 ? outputs.map((_, i) => i) : outputIndices;
284+
if (validatedOutputIndices.length !== outputs.length) {
285+
throw new Error(`Output size ${validatedOutputIndices.length} must be equal to ${outputs.length}.`);
249286
}
250287

251288
// create info for outputs
252289
const outputTensorViews: TensorView[] = [];
253290
const outputDatas: GpuData[] = [];
254-
for (let i = 0; i < programInfo.outputs.length; ++i) {
291+
for (let i = 0; i < outputs.length; ++i) {
255292
// value -1 and -2 are used for creating temporary and persistent outputs.
256293
// value -3 is used for placeholder output. So -3, -2, -1 and 0, 1, 2, ... are valid
257294
// output indices. see type definition of ComputeContextInputsOutputsMapping for more details.
258295
if (!Number.isInteger(validatedOutputIndices[i]) || validatedOutputIndices[i] < -3 ||
259-
validatedOutputIndices[i] >= programInfo.outputs.length) {
296+
validatedOutputIndices[i] >= outputs.length) {
260297
throw new Error(`Invalid output index: ${validatedOutputIndices[i]}`);
261298
}
262299
if (validatedOutputIndices[i] === -3) {
@@ -265,8 +302,8 @@ export class WebGpuBackend {
265302
const isTemporary = validatedOutputIndices[i] === -1;
266303
const isPersistent = validatedOutputIndices[i] === -2;
267304
const tensorView = (isTemporary || isPersistent) ?
268-
createIntermediateOutput(programInfo.outputs[i].dataType, programInfo.outputs[i].dims) :
269-
createKernelOutput(validatedOutputIndices[i], programInfo.outputs[i].dataType, programInfo.outputs[i].dims);
305+
createIntermediateOutput(outputs[i].dataType, outputs[i].dims) :
306+
createKernelOutput(validatedOutputIndices[i], outputs[i].dataType, outputs[i].dims);
270307
const gpuData = this.gpuDataManager.get(tensorView.data);
271308
if (!gpuData) {
272309
throw new Error(`no GPU data for output: ${tensorView.data}`);
@@ -286,18 +323,92 @@ export class WebGpuBackend {
286323
outputDatas.push(gpuData);
287324
}
288325

289-
const normalizedDispatchGroup = this.programManager.normalizeDispatchGroupSize(programInfo.dispatchGroup(inputs));
326+
327+
// load uniforms
328+
// TODO: add cache for uniform (is it necessary?)
329+
//
330+
let uniformBufferBinding: GPUBindingResource|undefined;
331+
if (variables) {
332+
let currentOffset = 0;
333+
let preLength = 0;
334+
const offsets: number[] = [];
335+
let maxAlignmentOfField = 1;
336+
variables.forEach(v => {
337+
const data = typeof v.data === 'number' ? [v.data] : v.data;
338+
// https://www.w3.org/TR/WGSL/#alignof
339+
let baseAlignment: number;
340+
switch (data.length) {
341+
case 1:
342+
baseAlignment = 4;
343+
break;
344+
case 2:
345+
baseAlignment = 8;
346+
break;
347+
case 3:
348+
baseAlignment = 16;
349+
break;
350+
case 4:
351+
baseAlignment = 16;
352+
break;
353+
case 5:
354+
baseAlignment = 16;
355+
break;
356+
case 6:
357+
baseAlignment = 16;
358+
break;
359+
default:
360+
throw new Error(`unsupported data length: ${data.length}`);
361+
}
362+
363+
if (preLength === 5 || preLength === 6) {
364+
baseAlignment = 16;
365+
}
366+
if (baseAlignment > maxAlignmentOfField) {
367+
maxAlignmentOfField = baseAlignment;
368+
}
369+
currentOffset = Math.ceil(currentOffset / baseAlignment) * baseAlignment;
370+
preLength = data.length;
371+
offsets.push(currentOffset);
372+
currentOffset += data.length * 4;
373+
});
374+
375+
currentOffset = Math.ceil(currentOffset / maxAlignmentOfField) * maxAlignmentOfField;
376+
const arrayBuffer = new ArrayBuffer(currentOffset);
377+
variables.forEach((v, i) => {
378+
const offset = offsets[i];
379+
const data = typeof v.data === 'number' ? [v.data] : v.data;
380+
if (v.type === 'int32') {
381+
new Int32Array(arrayBuffer, offset, data.length).set(data);
382+
} else if (v.type === 'uint32') {
383+
new Uint32Array(arrayBuffer, offset, data.length).set(data);
384+
} else {
385+
new Float32Array(arrayBuffer, offset, data.length).set(data);
386+
}
387+
});
388+
389+
const uniformBufferData =
390+
// eslint-disable-next-line no-bitwise
391+
this.gpuDataManager.create(currentOffset, GPUBufferUsage.COPY_DST | GPUBufferUsage.UNIFORM);
392+
this.device.queue.writeBuffer(uniformBufferData.buffer, 0, arrayBuffer, 0, currentOffset);
393+
this.gpuDataManager.release(uniformBufferData.id);
394+
uniformBufferBinding = {offset: 0, size: currentOffset, buffer: uniformBufferData.buffer};
395+
}
396+
397+
398+
const normalizedDispatchGroup = this.programManager.normalizeDispatchGroupSize(dispatchGroup);
290399

291400
if (!artifact) {
292-
artifact = this.programManager.build(programInfo, normalizedDispatchGroup);
401+
artifact = this.programManager.build(program, normalizedDispatchGroup);
293402
this.programManager.setArtifact(key, artifact);
294403
}
295404

296405
LOG_DEBUG(
297406
'info',
298-
() => `[ProgramManager] run "${programInfo.name}" (key=${key}) with ${normalizedDispatchGroup[0]}x${
407+
() => `[ProgramManager] run "${program.name}" (key=${key}) with ${normalizedDispatchGroup[0]}x${
299408
normalizedDispatchGroup[1]}x${normalizedDispatchGroup[2]}`);
300-
this.programManager.run(artifact, inputs, inputDatas, outputDatas, normalizedDispatchGroup);
409+
this.programManager.run(
410+
artifact, inputTensorViews, outputTensorViews, inputDatas, outputDatas, normalizedDispatchGroup,
411+
uniformBufferBinding);
301412

302413
return outputTensorViews;
303414
}

js/web/lib/wasm/jsep/init.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import {WebGpuBackend} from './backend-webgpu';
1010
import {LOG_DEBUG} from './log';
1111
import {TensorView} from './tensor-view';
1212
import {ShapeUtil} from './util';
13-
import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo, ProgramInfoLoader} from './webgpu/types';
13+
import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo} from './webgpu/types';
1414

1515
/* eslint-disable no-bitwise */
1616

@@ -90,8 +90,7 @@ class ComputeContextImpl implements ComputeContext {
9090
this.inputs = inputs;
9191
}
9292

93-
compute(program: ProgramInfoLoader|ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping):
94-
TensorView[] {
93+
compute(program: ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping): TensorView[] {
9594
// prepare inputs. inputs should always be valid data.
9695
const mappedInputs =
9796
inputsOutputsMapping?.inputs?.map(i => typeof i === 'number' ? this.inputs[i] : i) ?? this.inputs;

js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -133,13 +133,16 @@ class GpuDataManagerImpl implements GpuDataManager {
133133

134134
// The reusable storage buffers for computing.
135135
private freeBuffers: Map<number, GPUBuffer[]>;
136+
// The reusable uniform buffers
137+
private freeUniformBuffers: Map<number, GPUBuffer[]>;
136138

137139
// The external buffers registered users for IO Binding.
138140
private externalBuffers: Map<GPUBuffer, GpuDataId>;
139141

140142
constructor(private backend: WebGpuBackend) {
141143
this.storageCache = new Map();
142144
this.freeBuffers = new Map();
145+
this.freeUniformBuffers = new Map();
143146
this.buffersForUploadingPending = [];
144147
this.buffersPending = [];
145148
this.externalBuffers = new Map();
@@ -247,11 +250,15 @@ class GpuDataManagerImpl implements GpuDataManager {
247250
let gpuBuffer;
248251
// Currently, only storage buffers and uniform buffers are reused.
249252
// eslint-disable-next-line no-bitwise
250-
if ((usage & GPUBufferUsage.STORAGE) === GPUBufferUsage.STORAGE) {
251-
let buffers = this.freeBuffers.get(bufferSize);
253+
const isStorage = (usage & GPUBufferUsage.STORAGE) === GPUBufferUsage.STORAGE;
254+
// eslint-disable-next-line no-bitwise
255+
const isUniform = (usage & GPUBufferUsage.UNIFORM) === GPUBufferUsage.UNIFORM;
256+
if (isStorage || isUniform) {
257+
const freeBuffers = isStorage ? this.freeBuffers : this.freeUniformBuffers;
258+
let buffers = freeBuffers.get(bufferSize);
252259
if (!buffers) {
253260
buffers = [];
254-
this.freeBuffers.set(bufferSize, buffers);
261+
freeBuffers.set(bufferSize, buffers);
255262
}
256263
if (buffers.length > 0) {
257264
gpuBuffer = buffers.pop() as GPUBuffer;
@@ -310,6 +317,10 @@ class GpuDataManagerImpl implements GpuDataManager {
310317
if ((buffer.usage & GPUBufferUsage.STORAGE) === GPUBufferUsage.STORAGE) {
311318
// Put the pending buffer to freeBuffers list instead of really destroying it for buffer reusing.
312319
this.freeBuffers.get(buffer.size)!.push(buffer);
320+
// eslint-disable-next-line no-bitwise
321+
} else if ((buffer.usage & GPUBufferUsage.UNIFORM) === GPUBufferUsage.UNIFORM) {
322+
// Put the pending buffer to freeUniformBuffers list instead of really destroying it for buffer reusing.
323+
this.freeUniformBuffers.get(buffer.size)!.push(buffer);
313324
} else {
314325
buffer.destroy();
315326
}
@@ -323,13 +334,19 @@ class GpuDataManagerImpl implements GpuDataManager {
323334
buffer.destroy();
324335
});
325336
});
337+
this.freeUniformBuffers.forEach((buffers) => {
338+
buffers.forEach(buffer => {
339+
buffer.destroy();
340+
});
341+
});
326342

327343
this.storageCache.forEach((storage) => {
328344
storage.gpuData.buffer.destroy();
329345
});
330346

331347
this.storageCache = new Map();
332348
this.freeBuffers = new Map();
349+
this.freeUniformBuffers = new Map();
333350
}
334351
}
335352

js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
import {LOG_DEBUG} from '../../../log';
2323
import {TensorView} from '../../../tensor-view';
2424
import {ShapeUtil} from '../../../util';
25-
import {GpuDataType, ProgramInfo, ProgramMetadata} from '../../types';
25+
import {GpuDataType, ProgramInfo} from '../../types';
2626
import {tensorTypeToWsglStorageType} from '../common';
2727
import {ConvAttributes} from '../conv';
2828

@@ -154,9 +154,8 @@ const conv2dCommonSnippet =
154154
};
155155

156156
export const createConv2DMatMulProgramInfo =
157-
(inputs: readonly TensorView[], metadata: ProgramMetadata, attributes: ConvAttributes,
158-
outputShape: readonly number[], dimAOuter: number, dimBOuter: number, dimInner: number, hasBias: boolean,
159-
sequentialAccessByThreads: boolean): ProgramInfo => {
157+
(inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[], dimAOuter: number,
158+
dimBOuter: number, dimInner: number, hasBias: boolean, sequentialAccessByThreads: boolean): ProgramInfo => {
160159
const isChannelsLast = attributes.format === 'NHWC';
161160
const inChannels = isChannelsLast ? inputs[0].dims[3] : inputs[0].dims[1];
162161
const batchSize = outputShape[0];
@@ -213,9 +212,14 @@ export const createConv2DMatMulProgramInfo =
213212
}
214213

215214
return {
216-
...metadata,
217-
outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}],
218-
dispatchGroup: () => ({x: dispatch[0], y: dispatch[1], z: dispatch[2]}),
215+
name: 'Conv2DMatMul',
216+
inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] :
217+
[GpuDataType.default, GpuDataType.default],
218+
shaderCache: {hint: attributes.cacheKey},
219+
getRunData: () => ({
220+
outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}],
221+
dispatchGroup: {x: dispatch[0], y: dispatch[1], z: dispatch[2]},
222+
}),
219223
getShaderSource: () => `
220224
${utilFunctions}
221225
//struct Uniforms { xShape : vec4<i32>, wShape : vec4<i32>, outShape : vec4<i32>,

0 commit comments

Comments
 (0)