kleiti
diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts
Lines changed: 148 additions & 37 deletions
diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts
Lines changed: 2 additions & 3 deletions
diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts
Lines changed: 20 additions & 3 deletions
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
Lines changed: 11 additions & 7 deletions
@@ -8,7 +8,44 @@ import {createView, TensorView} from './tensor-view';
 import {createGpuDataManager, downloadGpuData, GpuDataManager} from './webgpu/gpu-data-manager';
 import {RunFunction, WEBGPU_OP_RESOLVE_RULES} from './webgpu/op-resolve-rules';
 import {ProgramManager} from './webgpu/program-manager';
-import {ComputeContext, GpuData, ProgramInfo, ProgramInfoLoader} from './webgpu/types';
+import {ComputeContext, GpuData, ProgramInfo, ProgramInputTensorInfoDependency} from './webgpu/types';
+
+const getProgramInputTensorInfoDependencyKey =
+    (inputTensors: readonly TensorView[], inputDependencies: readonly ProgramInputTensorInfoDependency[]): string => {
+      if (inputDependencies.length !== inputTensors.length) {
+        throw new Error(`inputDependencies length ${inputDependencies.length} is not equal to inputTensors length ${
+            inputTensors.length}.`);
+      }
+
+      const inputInfos: string[] = [];
+      for (let i = 0; i < inputTensors.length; ++i) {
+        const type = inputTensors[i].dataType;
+        switch (inputDependencies[i]) {
+          case 'none': {
+            inputInfos.push('');
+            break;
+          }
+          case 'type': {
+            inputInfos.push(`${type}`);
+            break;
+          }
+          case 'rank': {
+            const rank = inputTensors[i].dims.length;
+            inputInfos.push(`${type};${rank}`);
+            break;
+          }
+          case 'dims': {
+            const dims = inputTensors[i].dims.join(',');
+            inputInfos.push(`${type};${dims}`);
+            break;
+          }
+          default:
+            throw new Error(`unsupported input dependency: ${inputDependencies[i]}`);
+        }
+      }
+
+      return inputInfos.join('|');
+    };
 
 /**
  * get a unique key representing the program from the program info, input shapes and types.
@@ -17,18 +54,20 @@ import {ComputeContext, GpuData, ProgramInfo, ProgramInfoLoader} from './webgpu/
  * program. if the key is the same, the program shader source should be the same, so we can reuse the program.
  *
  */
-const getProgramInfoUniqueKey =
-    (programInfo: ProgramInfo|ProgramInfoLoader, inputTensors: readonly TensorView[]): string => {
-      // final key format:
-      // <PROGRAM_NAME>[<PROGRAM_CUSTOM_CACHE_HINT>]:<INPUTS_INFO_0>|<INPUTS_INFO_1>|...
-      const inputInfos = inputTensors.map(tensor => `${tensor.dataType};${tensor.dims.join(',')}`).join('|');
-      let key = programInfo.name;
-      if (programInfo.cacheHint) {
-        key += '[' + programInfo.cacheHint + ']';
-      }
-      key += ':' + inputInfos;
-      return key;
-    };
+const getProgramInfoUniqueKey = (programInfo: ProgramInfo, inputTensors: readonly TensorView[]): string => {
+  // final key format:
+  // <PROGRAM_NAME>[<PROGRAM_CUSTOM_CACHE_HINT>]:<INPUTS_INFO_0>|<INPUTS_INFO_1>|...
+  let key = programInfo.name;
+  if (programInfo.shaderCache?.hint) {
+    key += '[' + programInfo.shaderCache.hint + ']';
+  }
+  key += `:${
+      getProgramInputTensorInfoDependencyKey(
+          inputTensors,
+          programInfo.shaderCache?.inputDependencies ??
+              new Array<ProgramInputTensorInfoDependency>(inputTensors.length).fill('dims'))}`;
+  return key;
+};
 
 /**
  * this class is designed to store status and being used as a singleton for JSEP. It will be passed to jsepInit() as
@@ -208,55 +247,53 @@ export class WebGpuBackend {
 
   /**
    * run a WebGPU program.
-   * @param program either a ProgramInfo instance containing metadata including the shader code, or a function that
-   * can be called and return a ProgramInfo instance
-   * @param inputs a TensorView array. each element represents a value already exists in GPU.
+   * @param program a ProgramInfo instance
+   * @param inputTensorViews a TensorView array. each element represents a value that already exists on the GPU.
    * @param outputIndices an indices array. each element can be either -1 (temporary data), -2 (persistent data) or an
    * index to the kernel's output.
    * @param createKernelOutput a callback function that create a value to kernel's output with the given index
    * @param createIntermediateOutput a callback function that create a value as a intermediate value, either temporary
    * or persistent (owned by the current kernel)
    * @returns a TensorView array representing the result.
    */
-  run(program: ProgramInfoLoader|ProgramInfo, inputs: readonly TensorView[], outputIndices: readonly number[],
+  run(program: ProgramInfo, inputTensorViews: readonly TensorView[], outputIndices: readonly number[],
       createKernelOutput: (index: number, dataType: number, dims: readonly number[]) => TensorView,
       createIntermediateOutput: (dataType: number, dims: readonly number[]) => TensorView): TensorView[] {
-    if (inputs.length !== program.inputTypes.length) {
+    if (inputTensorViews.length !== program.inputTypes.length) {
       throw new Error(`Input size must be equal to ${program.inputTypes.length}.`);
     }
 
     // create info for inputs
     const inputDatas: GpuData[] = [];
-    for (let i = 0; i < inputs.length; ++i) {
-      const gpuData = this.gpuDataManager.get(inputs[i].data);
+    for (let i = 0; i < inputTensorViews.length; ++i) {
+      const gpuData = this.gpuDataManager.get(inputTensorViews[i].data);
       if (!gpuData) {
-        throw new Error(`no GPU data for input: ${inputs[i].data}`);
+        throw new Error(`no GPU data for input: ${inputTensorViews[i].data}`);
       }
       inputDatas[i] = gpuData;
     }
 
-    const key = getProgramInfoUniqueKey(program, inputs);
+    // build the program's unique key and look up a previously built artifact for it
+    const key = getProgramInfoUniqueKey(program, inputTensorViews);
     let artifact = this.programManager.getArtifact(key);
-    const programInfo = artifact ?
-        artifact.programInfo :
-        (typeof (program as ProgramInfoLoader).get === 'function' ? (program as ProgramInfoLoader).get() :
-                                                                    (program as ProgramInfo));
+
+    const {outputs, dispatchGroup, variables} = program.getRunData(inputTensorViews);
 
     // check output indices
-    const validatedOutputIndices = outputIndices.length === 0 ? programInfo.outputs.map((_, i) => i) : outputIndices;
-    if (validatedOutputIndices.length !== programInfo.outputs.length) {
-      throw new Error(`Output size ${validatedOutputIndices.length} must be equal to ${programInfo.outputs.length}.`);
+    const validatedOutputIndices = outputIndices.length === 0 ? outputs.map((_, i) => i) : outputIndices;
+    if (validatedOutputIndices.length !== outputs.length) {
+      throw new Error(`Output size ${validatedOutputIndices.length} must be equal to ${outputs.length}.`);
     }
 
     // create info for outputs
     const outputTensorViews: TensorView[] = [];
     const outputDatas: GpuData[] = [];
-    for (let i = 0; i < programInfo.outputs.length; ++i) {
+    for (let i = 0; i < outputs.length; ++i) {
       // value -1 and -2 are used for creating temporary and persistent outputs.
       // value -3 is used for placeholder output. So -3, -2, -1 and 0, 1, 2, ... are valid
       // output indices. see type definition of ComputeContextInputsOutputsMapping for more details.
       if (!Number.isInteger(validatedOutputIndices[i]) || validatedOutputIndices[i] < -3 ||
-          validatedOutputIndices[i] >= programInfo.outputs.length) {
+          validatedOutputIndices[i] >= outputs.length) {
         throw new Error(`Invalid output index: ${validatedOutputIndices[i]}`);
       }
       if (validatedOutputIndices[i] === -3) {
@@ -265,8 +302,8 @@ export class WebGpuBackend {
       const isTemporary = validatedOutputIndices[i] === -1;
       const isPersistent = validatedOutputIndices[i] === -2;
       const tensorView = (isTemporary || isPersistent) ?
-          createIntermediateOutput(programInfo.outputs[i].dataType, programInfo.outputs[i].dims) :
-          createKernelOutput(validatedOutputIndices[i], programInfo.outputs[i].dataType, programInfo.outputs[i].dims);
+          createIntermediateOutput(outputs[i].dataType, outputs[i].dims) :
+          createKernelOutput(validatedOutputIndices[i], outputs[i].dataType, outputs[i].dims);
       const gpuData = this.gpuDataManager.get(tensorView.data);
       if (!gpuData) {
         throw new Error(`no GPU data for output: ${tensorView.data}`);
@@ -286,18 +323,92 @@ export class WebGpuBackend {
       outputDatas.push(gpuData);
     }
 
-    const normalizedDispatchGroup = this.programManager.normalizeDispatchGroupSize(programInfo.dispatchGroup(inputs));
+
+    // load uniforms
+    // TODO: add cache for uniform (is it necessary?)
+    //
+    let uniformBufferBinding: GPUBindingResource|undefined;
+    if (variables) {
+      let currentOffset = 0;
+      let preLength = 0;
+      const offsets: number[] = [];
+      let maxAlignmentOfField = 1;
+      variables.forEach(v => {
+        const data = typeof v.data === 'number' ? [v.data] : v.data;
+        // https://www.w3.org/TR/WGSL/#alignof
+        let baseAlignment: number;
+        switch (data.length) {
+          case 1:
+            baseAlignment = 4;
+            break;
+          case 2:
+            baseAlignment = 8;
+            break;
+          case 3:
+            baseAlignment = 16;
+            break;
+          case 4:
+            baseAlignment = 16;
+            break;
+          case 5:
+            baseAlignment = 16;
+            break;
+          case 6:
+            baseAlignment = 16;
+            break;
+          default:
+            throw new Error(`unsupported data length: ${data.length}`);
+        }
+
+        if (preLength === 5 || preLength === 6) {
+          baseAlignment = 16;
+        }
+        if (baseAlignment > maxAlignmentOfField) {
+          maxAlignmentOfField = baseAlignment;
+        }
+        currentOffset = Math.ceil(currentOffset / baseAlignment) * baseAlignment;
+        preLength = data.length;
+        offsets.push(currentOffset);
+        currentOffset += data.length * 4;
+      });
+
+      currentOffset = Math.ceil(currentOffset / maxAlignmentOfField) * maxAlignmentOfField;
+      const arrayBuffer = new ArrayBuffer(currentOffset);
+      variables.forEach((v, i) => {
+        const offset = offsets[i];
+        const data = typeof v.data === 'number' ? [v.data] : v.data;
+        if (v.type === 'int32') {
+          new Int32Array(arrayBuffer, offset, data.length).set(data);
+        } else if (v.type === 'uint32') {
+          new Uint32Array(arrayBuffer, offset, data.length).set(data);
+        } else {
+          new Float32Array(arrayBuffer, offset, data.length).set(data);
+        }
+      });
+
+      const uniformBufferData =
+          // eslint-disable-next-line no-bitwise
+          this.gpuDataManager.create(currentOffset, GPUBufferUsage.COPY_DST | GPUBufferUsage.UNIFORM);
+      this.device.queue.writeBuffer(uniformBufferData.buffer, 0, arrayBuffer, 0, currentOffset);
+      this.gpuDataManager.release(uniformBufferData.id);
+      uniformBufferBinding = {offset: 0, size: currentOffset, buffer: uniformBufferData.buffer};
+    }
+
+
+    const normalizedDispatchGroup = this.programManager.normalizeDispatchGroupSize(dispatchGroup);
 
     if (!artifact) {
-      artifact = this.programManager.build(programInfo, normalizedDispatchGroup);
+      artifact = this.programManager.build(program, normalizedDispatchGroup);
       this.programManager.setArtifact(key, artifact);
     }
 
     LOG_DEBUG(
         'info',
-        () => `[ProgramManager] run "${programInfo.name}" (key=${key}) with ${normalizedDispatchGroup[0]}x${
+        () => `[ProgramManager] run "${program.name}" (key=${key}) with ${normalizedDispatchGroup[0]}x${
             normalizedDispatchGroup[1]}x${normalizedDispatchGroup[2]}`);
-    this.programManager.run(artifact, inputs, inputDatas, outputDatas, normalizedDispatchGroup);
+    this.programManager.run(
+        artifact, inputTensorViews, outputTensorViews, inputDatas, outputDatas, normalizedDispatchGroup,
+        uniformBufferBinding);
 
     return outputTensorViews;
   }
 
@@ -10,7 +10,7 @@ import {WebGpuBackend} from './backend-webgpu';
 import {LOG_DEBUG} from './log';
 import {TensorView} from './tensor-view';
 import {ShapeUtil} from './util';
-import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo, ProgramInfoLoader} from './webgpu/types';
+import {ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo} from './webgpu/types';
 
 /* eslint-disable no-bitwise */
 
@@ -90,8 +90,7 @@ class ComputeContextImpl implements ComputeContext {
     this.inputs = inputs;
   }
 
-  compute(program: ProgramInfoLoader|ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping):
-      TensorView[] {
+  compute(program: ProgramInfo, inputsOutputsMapping?: ComputeContextInputsOutputsMapping): TensorView[] {
     // prepare inputs. inputs should always be valid data.
     const mappedInputs =
         inputsOutputsMapping?.inputs?.map(i => typeof i === 'number' ? this.inputs[i] : i) ?? this.inputs;
 
@@ -133,13 +133,16 @@ class GpuDataManagerImpl implements GpuDataManager {
 
   // The reusable storage buffers for computing.
   private freeBuffers: Map<number, GPUBuffer[]>;
+  // The reusable uniform buffers, keyed by buffer size.
+  private freeUniformBuffers: Map<number, GPUBuffer[]>;
 
   // The external buffers registered users for IO Binding.
   private externalBuffers: Map<GPUBuffer, GpuDataId>;
 
   constructor(private backend: WebGpuBackend) {
     this.storageCache = new Map();
     this.freeBuffers = new Map();
+    this.freeUniformBuffers = new Map();
     this.buffersForUploadingPending = [];
     this.buffersPending = [];
     this.externalBuffers = new Map();
@@ -247,11 +250,15 @@ class GpuDataManagerImpl implements GpuDataManager {
     let gpuBuffer;
-    // Currently, only storage buffers are reused.
+    // Storage buffers and uniform buffers are reused, each from its own free list.
     // eslint-disable-next-line no-bitwise
-    if ((usage & GPUBufferUsage.STORAGE) === GPUBufferUsage.STORAGE) {
-      let buffers = this.freeBuffers.get(bufferSize);
+    const isStorage = (usage & GPUBufferUsage.STORAGE) === GPUBufferUsage.STORAGE;
+    // eslint-disable-next-line no-bitwise
+    const isUniform = (usage & GPUBufferUsage.UNIFORM) === GPUBufferUsage.UNIFORM;
+    if (isStorage || isUniform) {
+      const freeBuffers = isStorage ? this.freeBuffers : this.freeUniformBuffers;
+      let buffers = freeBuffers.get(bufferSize);
       if (!buffers) {
         buffers = [];
-        this.freeBuffers.set(bufferSize, buffers);
+        freeBuffers.set(bufferSize, buffers);
       }
       if (buffers.length > 0) {
         gpuBuffer = buffers.pop() as GPUBuffer;
@@ -310,6 +317,10 @@ class GpuDataManagerImpl implements GpuDataManager {
       if ((buffer.usage & GPUBufferUsage.STORAGE) === GPUBufferUsage.STORAGE) {
         // Put the pending buffer to freeBuffers list instead of really destroying it for buffer reusing.
         this.freeBuffers.get(buffer.size)!.push(buffer);
+        // eslint-disable-next-line no-bitwise
+      } else if ((buffer.usage & GPUBufferUsage.UNIFORM) === GPUBufferUsage.UNIFORM) {
+        // Put the pending buffer into the freeUniformBuffers list instead of destroying it, so it can be reused.
+        this.freeUniformBuffers.get(buffer.size)!.push(buffer);
       } else {
         buffer.destroy();
       }
@@ -323,13 +334,19 @@ class GpuDataManagerImpl implements GpuDataManager {
         buffer.destroy();
       });
     });
+    this.freeUniformBuffers.forEach((buffers) => {
+      buffers.forEach(buffer => {
+        buffer.destroy();
+      });
+    });
 
     this.storageCache.forEach((storage) => {
       storage.gpuData.buffer.destroy();
     });
 
     this.storageCache = new Map();
     this.freeBuffers = new Map();
+    this.freeUniformBuffers = new Map();
   }
 }
 
 
@@ -22,7 +22,7 @@
 import {LOG_DEBUG} from '../../../log';
 import {TensorView} from '../../../tensor-view';
 import {ShapeUtil} from '../../../util';
-import {GpuDataType, ProgramInfo, ProgramMetadata} from '../../types';
+import {GpuDataType, ProgramInfo} from '../../types';
 import {tensorTypeToWsglStorageType} from '../common';
 import {ConvAttributes} from '../conv';
 
@@ -154,9 +154,8 @@ const conv2dCommonSnippet =
     };
 
 export const createConv2DMatMulProgramInfo =
-    (inputs: readonly TensorView[], metadata: ProgramMetadata, attributes: ConvAttributes,
-     outputShape: readonly number[], dimAOuter: number, dimBOuter: number, dimInner: number, hasBias: boolean,
-     sequentialAccessByThreads: boolean): ProgramInfo => {
+    (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[], dimAOuter: number,
+     dimBOuter: number, dimInner: number, hasBias: boolean, sequentialAccessByThreads: boolean): ProgramInfo => {
       const isChannelsLast = attributes.format === 'NHWC';
       const inChannels = isChannelsLast ? inputs[0].dims[3] : inputs[0].dims[1];
       const batchSize = outputShape[0];
@@ -213,9 +212,14 @@ export const createConv2DMatMulProgramInfo =
       }
 
       return {
-        ...metadata,
-        outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}],
-        dispatchGroup: () => ({x: dispatch[0], y: dispatch[1], z: dispatch[2]}),
+        name: 'Conv2DMatMul',
+        inputTypes: hasBias ? [GpuDataType.default, GpuDataType.default, GpuDataType.default] :
+                              [GpuDataType.default, GpuDataType.default],
+        shaderCache: {hint: attributes.cacheKey},
+        getRunData: () => ({
+          outputs: [{dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default}],
+          dispatchGroup: {x: dispatch[0], y: dispatch[1], z: dispatch[2]},
+        }),
         getShaderSource: () => `
         ${utilFunctions}
         //struct Uniforms { xShape : vec4<i32>, wShape : vec4<i32>, outShape : vec4<i32>,