diff --git a/noxfile.py b/noxfile.py index f3aa7656ae..1b050561e8 100644 --- a/noxfile.py +++ b/noxfile.py @@ -26,8 +26,8 @@ "pytest!=7.1.0", "pyyaml", ) -ONNX = "onnx==1.14.1" -ONNX_RUNTIME = "onnxruntime==1.16.1" +ONNX = "onnx==1.15.0" +ONNX_RUNTIME = "onnxruntime==1.17.1" PYTORCH = "torch==2.1.0" TORCHVISON = "torchvision==0.16" ONNX_RUNTIME_NIGHTLY_DEPENDENCIES = ( diff --git a/onnxscript/onnx_opset/__init__.py b/onnxscript/onnx_opset/__init__.py index 7396ddd928..c84d95c0cd 100644 --- a/onnxscript/onnx_opset/__init__.py +++ b/onnxscript/onnx_opset/__init__.py @@ -36,9 +36,11 @@ from onnxscript.onnx_opset._impl.opset17 import Opset17 from onnxscript.onnx_opset._impl.opset18 import Opset18 from onnxscript.onnx_opset._impl.opset19 import Opset19 +from onnxscript.onnx_opset._impl.opset20 import Opset20 from onnxscript.onnx_opset._impl.opset_ai_onnx_ml1 import Opset_ai_onnx_ml1 from onnxscript.onnx_opset._impl.opset_ai_onnx_ml2 import Opset_ai_onnx_ml2 from onnxscript.onnx_opset._impl.opset_ai_onnx_ml3 import Opset_ai_onnx_ml3 +from onnxscript.onnx_opset._impl.opset_ai_onnx_ml4 import Opset_ai_onnx_ml4 from onnxscript.onnx_opset._impl.opset_ai_onnx_preview_training1 import ( Opset_ai_onnx_preview_training1, ) @@ -65,9 +67,11 @@ "opset17", "opset18", "opset19", + "opset20", "opset_ai_onnx_ml1", "opset_ai_onnx_ml2", "opset_ai_onnx_ml3", + "opset_ai_onnx_ml4", "opset_ai_onnx_preview_training1", ] @@ -97,9 +101,11 @@ opset17 = Opset17() opset18 = Opset18() opset19 = Opset19() +opset20 = Opset20() opset_ai_onnx_ml1 = Opset_ai_onnx_ml1() opset_ai_onnx_ml2 = Opset_ai_onnx_ml2() opset_ai_onnx_ml3 = Opset_ai_onnx_ml3() +opset_ai_onnx_ml4 = Opset_ai_onnx_ml4() opset_ai_onnx_preview_training1 = Opset_ai_onnx_preview_training1() all_opsets: Mapping[Tuple[str, int], Opset] = { ( @@ -178,6 +184,10 @@ "", 19, ): opset19, + ( + "", + 20, + ): opset20, ( "ai.onnx.ml", 1, @@ -190,6 +200,10 @@ "ai.onnx.ml", 3, ): opset_ai_onnx_ml3, + ( + "ai.onnx.ml", + 4, + ): opset_ai_onnx_ml4, ( "ai.onnx.preview.training", 1, diff --git a/onnxscript/onnx_opset/_impl/opset1.py b/onnxscript/onnx_opset/_impl/opset1.py index e5fab9d3ad..756cc5a150 100644 --- a/onnxscript/onnx_opset/_impl/opset1.py +++ b/onnxscript/onnx_opset/_impl/opset1.py @@ -1495,7 +1495,7 @@ def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> If conditional Args: - cond: Condition for the if + cond: Condition for the if. The tensor must contain a single element. else_branch: Graph to run if condition is false. Has N outputs: values you wish to be live-out to the enclosing scope. The number of outputs must @@ -3011,7 +3011,8 @@ def ReduceL1( Computes the L1 norm of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. + the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are + valid. Reduction over an empty set of values yields 0. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3040,7 +3041,8 @@ def ReduceL2( Computes the L2 norm of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. + the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are + valid. 
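The keepdims and empty-set reduction behavior described in these Reduce* docstrings can be sanity-checked against numpy, which the text explicitly compares to (numpy's only divergence is that it defaults keepdims to False). A minimal sketch using numpy as a stand-in for the runtime:

```python
import numpy as np

x = np.array([[1.0, -2.0], [3.0, -4.0]])

# keepdims=True mirrors the ONNX default (keepdims=1): the reduced axis is
# retained with size 1, so the output keeps the input's rank.
print(np.sum(np.abs(x), axis=1, keepdims=True))   # [[3.] [7.]], shape (2, 1)

# keepdims=False corresponds to keepdims=0: the reduced axis is pruned.
print(np.sum(np.abs(x), axis=1, keepdims=False))  # [3. 7.], shape (2,)

# "Reduction over an empty set of values yields 0" (ReduceL1/ReduceL2/ReduceSum):
print(np.sum(np.abs(np.zeros((0,)))))             # 0.0
```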
Reduction over an empty set of values yields 0. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3071,7 +3073,8 @@ def ReduceLogSum( Computes the log sum of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. + the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are + valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3106,7 +3109,8 @@ def ReduceLogSumExp( Computes the log sum exponent of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. + the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are + valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3135,7 +3139,8 @@ def ReduceMax( Computes the max of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. + the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are + valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or the minimum value of the data type otherwise. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3166,7 +3171,8 @@ def ReduceMean( Computes the mean of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. + the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are + valid. Reduction over an empty set of values yields undefined. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3195,7 +3201,8 @@ def ReduceMin( Computes the min of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. + the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are + valid. Reduction over an empty set of values yields plus infinity (if supported by the datatype) or the maximum value of the data type otherwise. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3226,7 +3233,8 @@ def ReduceProd( Computes the product of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. + the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are + valid. 
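ReduceLogSumExp above is log(sum(exp(x))) over the chosen axes; a direct evaluation overflows for large inputs, so reference implementations typically shift by the maximum first. A hedged numpy sketch of that identity (not the actual kernel):

```python
import numpy as np

def reduce_log_sum_exp(x, axis, keepdims=True):
    # Stable log(sum(exp(x))): shift by the max so exp() cannot overflow.
    m = np.max(x, axis=axis, keepdims=True)
    out = m + np.log(np.sum(np.exp(x - m), axis=axis, keepdims=True))
    return out if keepdims else np.squeeze(out, axis=axis)

x = np.array([[1000.0, 1000.0], [0.0, 0.0]])
print(reduce_log_sum_exp(x, axis=1))  # [[1000.6931...] [0.6931...]]
```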
Reduction over an empty set of values yields 1. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3255,7 +3263,8 @@ def ReduceSum( Computes the sum of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. + the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are + valid. Reduction over an empty set of values yields 0. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3290,7 +3299,8 @@ def ReduceSumSquare( Computes the sum square of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. + the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are + valid. Reduction over an empty set of values yields 0. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -3903,10 +3913,10 @@ def TopK(self, X: T_TopK, *, axis: int = -1, k: int) -> Tuple[T_TopK, I_TopK]: Retrieve the top-K elements along a specified axis. Given an input tensor of - shape [a_1, a_2, ..., a_n, r] and integer argument k, return two outputs: - -Value tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] + shape [a_0, a_1, ..., a_{n-1}] and integer argument k, return two outputs: + -Value tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] which contains the values of the top k elements along the specified axis - -Index tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] which + -Index tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] which contains the indices of the top k elements (original indices from the input tensor). Given two equivalent values, this operator uses the indices along the axis as @@ -3914,7 +3924,7 @@ def TopK(self, X: T_TopK, *, axis: int = -1, k: int) -> Tuple[T_TopK, I_TopK]: Args: - X: Tensor of shape [a_1, a_2, ..., a_n, r] + X: Tensor of shape [a_0, a_1, ..., a_{n-1}] axis: Dimension on which to do the sort. diff --git a/onnxscript/onnx_opset/_impl/opset10.py b/onnxscript/onnx_opset/_impl/opset10.py index 634f45077d..65ea0013e3 100644 --- a/onnxscript/onnx_opset/_impl/opset10.py +++ b/onnxscript/onnx_opset/_impl/opset10.py @@ -1202,10 +1202,10 @@ def TopK(self, X: T_TopK, K: INT64, *, axis: int = -1) -> Tuple[T_TopK, I_TopK]: Retrieve the top-K elements along a specified axis. Given an input tensor of - shape [a_1, a_2, ..., a_n, r] and integer argument k, return two outputs: - -Value tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] + shape [a_0, a_1, ..., a_{n-1}] and integer argument k, return two outputs: + -Value tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] which contains the values of the top k elements along the specified axis - -Index tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] which + -Index tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] which contains the indices of the top k elements (original indices from the input tensor). 
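The corrected TopK shape convention — input `[a_0, ..., a_{n-1}]`, with dimension `axis` replaced by `k` in both outputs — can be emulated in numpy. The helper below is illustrative only; it uses a stable sort so that ties keep the lower index first, as the docstring specifies:

```python
import numpy as np

def topk(x, k, axis=-1, largest=True):
    # Both outputs have shape [a_0, ..., a_{axis-1}, k, a_{axis+1}, ..., a_{n-1}];
    # a stable sort keeps the lower index first on ties, per the docstring.
    order = np.argsort(-x if largest else x, axis=axis, kind="stable")
    indices = np.take(order, range(k), axis=axis)
    return np.take_along_axis(x, indices, axis=axis), indices

x = np.array([[3, 1, 2], [6, 5, 4]])
values, indices = topk(x, k=2, axis=1)
print(values)   # [[3 2] [6 5]]
print(indices)  # [[0 2] [0 1]]
```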
@@ -1214,7 +1214,7 @@ def TopK(self, X: T_TopK, K: INT64, *, axis: int = -1) -> Tuple[T_TopK, I_TopK]: Args: - X: Tensor of shape [a_1, a_2, ..., a_n, r] + X: Tensor of shape [a_0, a_1, ..., a_{n-1}] K: A 1-D tensor containing a single positive value corresponding to the number of top elements to retrieve diff --git a/onnxscript/onnx_opset/_impl/opset11.py b/onnxscript/onnx_opset/_impl/opset11.py index 113b3cd203..bb54cbeb02 100644 --- a/onnxscript/onnx_opset/_impl/opset11.py +++ b/onnxscript/onnx_opset/_impl/opset11.py @@ -66,6 +66,7 @@ def ArgMax(self, data: T_ArgMax, *, axis: int = 0, keepdims: int = 1) -> INT64: Computes the indices of the max elements of the input tensor's element along the provided axis. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then the resulting tensor has the reduced dimension pruned. + The input tensor must not be empty. The type of the output tensor is integer. Args: @@ -104,6 +105,7 @@ def ArgMin(self, data: T_ArgMin, *, axis: int = 0, keepdims: int = 1) -> INT64: Computes the indices of the min elements of the input tensor's element along the provided axis. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then the resulting tensor has the reduced dimension pruned. + The input tensor must not be empty. The type of the output tensor is integer. Args: @@ -154,11 +156,17 @@ def AveragePool( * pad_shape[i] is sum of pads along axis i ``` - `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following: + `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: ``` VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i]) SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) ``` + or when ceil_mode is disabled: + ``` + VALID: output_spatial_shape[i] = floor((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i]) + SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = floor(input_spatial_shape[i] / strides_spatial_shape[i]) + ``` + And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: ``` pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) - input_spatial_shape[i] @@ -642,7 +650,9 @@ def ConvTranspose( output_shape: The shape of the output can be explicitly set which will cause pads values to be auto generated. If output_shape is specified pads - values are ignored. See doc for details for equations to generate pads + values are ignored. See doc for details for equations to generate pads. + Note that the output_shape attribute value should not include dimensions + for batch size and channels, which are automatically inferred. pads: Padding for the beginning and ending along each spatial axis, it can take any value greater than or equal to 0. The value represent the @@ -821,16 +831,16 @@ def DynamicQuantizeLinear( r"""[🌐 DynamicQuantizeLinear(11)](https://onnx.ai/onnx/operators/onnx__DynamicQuantizeLinear.html#dynamicquantizelinear-11 "Online Documentation") - A Function to fuse calculation for Scale, Zero Point and FP32->8Bit convertion of FP32 Input data. + A Function to fuse calculation for Scale, Zero Point and FP32->8Bit conversion of FP32 Input data. 
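The AveragePool auto_pad formulas above (ceil_mode enabled vs. disabled) reduce to a small per-axis computation. A sketch with illustrative names, not onnxscript API:

```python
import math

def auto_pad_out_size(in_size, kernel, stride, dilation, auto_pad, ceil_mode):
    # One spatial axis of the deprecated auto_pad formulas quoted above.
    rnd = math.ceil if ceil_mode else math.floor
    eff_kernel = (kernel - 1) * dilation + 1  # kernel extent after dilation
    if auto_pad == "VALID":
        return rnd((in_size - eff_kernel + 1) / stride)
    return rnd(in_size / stride)  # SAME_UPPER or SAME_LOWER

print(auto_pad_out_size(10, 3, 2, 1, "VALID", ceil_mode=1))      # ceil(4.0)  = 4
print(auto_pad_out_size(9, 3, 2, 1, "VALID", ceil_mode=1))       # ceil(3.5)  = 4
print(auto_pad_out_size(9, 3, 2, 1, "VALID", ceil_mode=0))       # floor(3.5) = 3
print(auto_pad_out_size(9, 3, 2, 1, "SAME_UPPER", ceil_mode=0))  # floor(4.5) = 4
```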
Outputs Scale, ZeroPoint and Quantized Input for a given FP32 Input. Scale is calculated as: :: - y_scale = (max(x) - min(x))/(qmax - qmin) + y_scale = (maximum(0, max(x)) - minimum(0, min(x))) / (qmax - qmin) - * where qmax and qmin are max and min values for quantization range .i.e [0, 255] in case of uint8 + * where qmax and qmin are max and min values for quantization range i.e. [0, 255] in case of uint8 * data range is adjusted to include 0. Zero point is calculated as: @@ -1376,7 +1386,7 @@ def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> If conditional Args: - cond: Condition for the if + cond: Condition for the if. The tensor must contain a single element. else_branch: Graph to run if condition is false. Has N outputs: values you wish to be live-out to the enclosing scope. The number of outputs must @@ -1808,7 +1818,7 @@ def MaxUnpool( MaxUnpool essentially computes the partial inverse of the MaxPool op. The input information to this op is typically the output information from a MaxPool op. The first input tensor X is the tensor that needs to be unpooled, which is typically the pooled tensor (first output) - from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corrsponding + from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corresponding to the elements in the first input tensor X. Input tensor I is typically the second output of the MaxPool op. The third (optional) input is a tensor that specifies the output size of the unpooling operation. @@ -1821,7 +1831,7 @@ def MaxUnpool( known/predictable size. In addition to the inputs, MaxUnpool takes three attributes, namely kernel_shape, strides, and pads, - which define the exact unpooling op. The attributes typically have the same values as the corrsponding + which define the exact unpooling op. The attributes typically have the same values as the corresponding pooling op that the unpooling op is trying to invert. @@ -2363,7 +2373,8 @@ def ReduceMax( Computes the max of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. + the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are + valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or the minimum value of the data type otherwise. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -2425,7 +2436,8 @@ def ReduceMin( Computes the min of the input tensor's element along the provided axes. The resulting tensor has the same rank as the input if keepdims equals 1. If keepdims equal 0, then - the resulted tensor have the reduced dimension pruned. + the resulted tensor have the reduced dimension pruned. Input tensors of rank zero are + valid. Reduction over an empty set of values yields plus infinity (if supported by the datatype) or the maximum value of the data type otherwise. The above behavior is similar to numpy, with the exception that numpy defaults keepdims to False instead of True. @@ -2693,7 +2705,7 @@ def Round(self, X: T_Round) -> T_Round: Round takes one input Tensor and rounds the values, element-wise, meaning it finds the nearest integer for each value. - In case of halfs, the rule is to round them to the nearest even integer. 
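The DynamicQuantizeLinear scale and zero-point formulas above can be reproduced in a few lines of numpy for the uint8 case; this is a sketch of the math, not onnxruntime's kernel:

```python
import numpy as np

def dynamic_quantize_linear(x):
    # uint8 range; the data range is adjusted to include 0 so that zero is
    # exactly representable, per the formulas above.
    qmin, qmax = 0.0, 255.0
    rng_max = max(0.0, float(np.max(x)))
    rng_min = min(0.0, float(np.min(x)))
    y_scale = (rng_max - rng_min) / (qmax - qmin)
    y_zero_point = np.uint8(np.clip(round(qmin - rng_min / y_scale), qmin, qmax))
    y = np.clip(np.rint(x / y_scale) + y_zero_point, qmin, qmax).astype(np.uint8)
    return y, np.float32(y_scale), y_zero_point

y, scale, zp = dynamic_quantize_linear(np.array([-1.0, 0.0, 1.0, 3.0]))
print(scale, zp, y)  # ~0.0156863 64 [  0  64 128 255]
```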
+ In case of halves, the rule is to round them to the nearest even integer. If input x is integral, +0, -0, NaN, or infinite, x itself is returned. The output tensor has the same shape and type as the input. @@ -3752,11 +3764,11 @@ def TopK( Retrieve the top-K largest or smallest elements along a specified axis. Given an input tensor of - shape [a_1, a_2, ..., a_n, r] and integer argument k, return two outputs: + shape [a_0, a_1, ..., a_{n-1}] and integer argument k, return two outputs: - * Value tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] + * Value tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] which contains the values of the top k elements along the specified axis - * Index tensor of shape [a_1, a_2, ..., a_{axis-1}, k, a_{axis+1}, ... a_n] which + * Index tensor of shape [a_0, a_1, ..., a_{axis-1}, k, a_{axis+1}, ... a_{n-1}] which contains the indices of the top k elements (original indices from the input tensor). @@ -3769,7 +3781,7 @@ def TopK( Args: - X: (differentiable) Tensor of shape [a_1, a_2, ..., a_n, r] + X: (differentiable) Tensor of shape [a_0, a_1, ..., a_{n-1}] K: (non-differentiable) A 1-D tensor containing a single positive value corresponding to the number of top elements to retrieve @@ -3819,8 +3831,8 @@ def Unique( This operator returns the unique values or sliced unique subtensors of the input tensor and three optional outputs. The first output tensor 'Y' contains all unique values or subtensors of the input. - The second optional output tensor 'indices' contains indices of 'Y' elements' first occurance in 'X'.. - The third optional output tensor 'inverse_indices' contains, for elements of 'X', its corresponding indices in 'Y'. ". + The second optional output tensor 'indices' contains indices of 'Y' elements' first occurrence in 'X'. + The third optional output tensor 'inverse_indices' contains, for elements of 'X', its corresponding indices in 'Y'. The fourth optional output tensor 'counts' contains the count of each element of 'Y' in the input. Outputs are either sorted in ascending order or optionally in the order of the first occurrence of the values in the input. diff --git a/onnxscript/onnx_opset/_impl/opset12.py b/onnxscript/onnx_opset/_impl/opset12.py index b1a92dd4b6..ede4fb34a7 100644 --- a/onnxscript/onnx_opset/_impl/opset12.py +++ b/onnxscript/onnx_opset/_impl/opset12.py @@ -369,7 +369,7 @@ def Einsum(self, *Inputs: T_Einsum, equation: str) -> T_Einsum: :: - output[output-term] = reduce-sum( input1[term1] * input2[term] ) + output[output-term] = reduce-sum( input1[term1] * input2[term2] ) @@ -664,21 +664,28 @@ def MaxPool( the tensor according to kernel sizes, stride sizes, and pad lengths. max pooling consisting of computing the max on all values of a subset of the input tensor according to the kernel size and downsampling the - data into the output tensor Y for further processing. The output spatial shape will be following: + data into the output tensor Y for further processing. The output spatial shape is calculated differently + depending on whether explicit padding is used, where pads is employed, or auto padding is used, where auto_pad is utilized. 
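The corrected Einsum equation above (`output[output-term] = reduce-sum( input1[term1] * input2[term2] )`) is the standard einsum contraction, so numpy can illustrate it directly:

```python
import numpy as np

a = np.arange(6.0).reshape(2, 3)
b = np.arange(12.0).reshape(3, 4)

# "ij,jk->ik": output[i, k] = reduce-sum over j of a[i, j] * b[j, k]; the
# repeated index j is the one summed out, exactly as in the equation above.
print(np.einsum("ij,jk->ik", a, b))
print(np.allclose(np.einsum("ij,jk->ik", a, b), a @ b))  # True
```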
+ With explicit padding (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d): ``` - output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1) + output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) ``` or ``` - output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1) + output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) ``` - if ceil_mode is enabled `pad_shape[i]` is the sum of pads along axis `i`. + if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. Sliding windows that would start in the right padded region are ignored. - `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following: + `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: ``` VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i]) SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) ``` + or when ceil_mode is disabled (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AveragePooling2D): + ``` + VALID: output_spatial_shape[i] = floor((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i]) + 1 + SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = floor((input_spatial_shape[i] - 1) / strides_spatial_shape[i]) + 1 + ``` And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: ``` pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) - input_spatial_shape[i] @@ -1018,7 +1025,7 @@ def SoftmaxCrossEntropyLoss( shape(labels): (N) where each value is 0 <= labels[i] <= C-1, or (N, D1, D2,..., Dk), with K >= 1 in case of K-dimensional loss. - The loss for one sample, l_i, can caculated as follows: + The loss for one sample, l_i, can calculated as follows: l[i][d1][d2]...[dk] = -y[i][c][d1][d2]..[dk], where i is the index of classes. or l[i][d1][d2]...[dk] = -y[i][c][d1][d2]..[dk] * weights[c], if 'weights' is provided. diff --git a/onnxscript/onnx_opset/_impl/opset13.py b/onnxscript/onnx_opset/_impl/opset13.py index 011d4873ef..616fe5ff69 100644 --- a/onnxscript/onnx_opset/_impl/opset13.py +++ b/onnxscript/onnx_opset/_impl/opset13.py @@ -1108,55 +1108,58 @@ def GatherND(self, data: T_GatherND, indices: INT64, *, batch_dims: int = 0) -> This operator is the inverse of `ScatterND`. 
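The rewritten MaxPool explicit-padding formula above differs from the old form only in notation (`dilation[i] * (kernel_shape[i] - 1) - 1` versus the expanded kernel extent), but ceil_mode changes the result whenever the division is inexact. A one-axis sketch with illustrative names:

```python
import math

def maxpool_out_size(in_size, pad_sum, kernel, dilation, stride, ceil_mode):
    # The explicit-padding formula above, for one spatial axis; pad_sum is
    # pad_shape[i], the sum of the begin and end pads along that axis.
    rnd = math.ceil if ceil_mode else math.floor
    return rnd((in_size + pad_sum - dilation * (kernel - 1) - 1) / stride + 1)

print(maxpool_out_size(5, 0, 2, 1, 2, ceil_mode=0))  # floor(2.5) = 2
print(maxpool_out_size(5, 0, 2, 1, 2, ceil_mode=1))  # ceil(2.5)  = 3
```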
- `Example 1` + **Example 1** - batch_dims = 0 - - data = [[0,1],[2,3]] # data_shape = [2, 2] - - indices = [[0,0],[1,1]] # indices_shape = [2, 2] + :: - output = [0,3] # output_shape = [2] + batch_dims = 0 + data = [[0,1],[2,3]] # data_shape = [2, 2] + indices = [[0,0],[1,1]] # indices_shape = [2, 2] + output = [0,3] # output_shape = [2] - `Example 2` - batch_dims = 0 - data = [[0,1],[2,3]] # data_shape = [2, 2] + **Example 2** - indices = [[1],[0]] # indices_shape = [2, 1] + :: - output = [[2,3],[0,1]] # output_shape = [2, 2] + batch_dims = 0 + data = [[0,1],[2,3]] # data_shape = [2, 2] + indices = [[1],[0]] # indices_shape = [2, 1] + output = [[2,3],[0,1]] # output_shape = [2, 2] - `Example 3` - batch_dims = 0 - data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] + **Example 3** - indices = [[0,1],[1,0]] # indices_shape = [2, 2] + :: - output = [[2,3],[4,5]] # output_shape = [2, 2] + batch_dims = 0 + data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] + indices = [[0,1],[1,0]] # indices_shape = [2, 2] + output = [[2,3],[4,5]] # output_shape = [2, 2] - `Example 4` - batch_dims = 0 - data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] + **Example 4** - indices = [[[0,1]],[[1,0]]] # indices_shape = [2, 1, 2] + :: - output = [[[2,3]],[[4,5]]] # output_shape = [2, 1, 2] + batch_dims = 0 + data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] + indices = [[[0,1]],[[1,0]]] # indices_shape = [2, 1, 2] + output = [[[2,3]],[[4,5]]] # output_shape = [2, 1, 2] - `Example 5` - batch_dims = 1 - data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] + **Example 5** - indices = [[1],[0]] # indices_shape = [2, 1] + :: - output = [[2,3],[4,5]] # output_shape = [2, 2] + batch_dims = 1 + data = [[[0,1],[2,3]],[[4,5],[6,7]]] # data_shape = [2, 2, 2] + indices = [[1],[0]] # indices_shape = [2, 1] + output = [[2,3],[4,5]] # output_shape = [2, 2] @@ -1377,7 +1380,7 @@ def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> If conditional Args: - cond: Condition for the if + cond: Condition for the if. The tensor must contain a single element. else_branch: Graph to run if condition is false. Has N outputs: values you wish to be live-out to the enclosing scope. The number of outputs must @@ -1841,7 +1844,7 @@ def MeanVarianceNormalization( Args: X: (differentiable) Input tensor - axes: A list of integers, along which to reduce. The default is to caculate + axes: A list of integers, along which to reduce. The default is to calculate along axes [0,2,3] for calculating mean and variance along each channel. Two variables with the same C-coordinate are associated with the same mean and variance. @@ -2417,12 +2420,13 @@ def ReduceL1( Computes the L1 norm of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields 0. - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -2450,12 +2454,13 @@ def ReduceL2( Computes the L2 norm of the input tensor's elements along the provided axes. 
The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields 0. + - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -2483,12 +2488,13 @@ def ReduceLogSum( Computes the log sum of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -2520,12 +2526,13 @@ def ReduceLogSumExp( Computes the log sum exponent of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. + - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -2563,12 +2570,13 @@ def ReduceMax( Computes the max of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or the minimum value of the data type otherwise. - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -2596,12 +2604,13 @@ def ReduceMean( Computes the mean of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. 
Reduction over an empty set of values yields undefined. + - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -2639,12 +2648,13 @@ def ReduceMin( Computes the min of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields plus infinity (if supported by the datatype) or the maximum value of the data type otherwise. - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -2672,12 +2682,13 @@ def ReduceProd( Computes the product of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields 1. + - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -2710,12 +2721,13 @@ def ReduceSum( Computes the sum of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields 0. - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -2758,12 +2770,13 @@ def ReduceSumSquare( Computes the sum square of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields 0. + - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. 
@@ -3361,16 +3374,16 @@ def Slice( Slice uses the `starts`, `ends`, `axes` and `steps` inputs to select a sub-tensor of its input `data` tensor. - An effective `start[i]`, `end[i]`, and `step[i]` must be computed for each `i` + An effective `starts[i]`, `ends[i]`, and `steps[i]` must be computed for each `i` in `[0, ... r-1]` where `r = rank(input)` as follows: If `axes` are omitted, they are set to `[0, ..., r-1]`. If `steps` are omitted, they are set to `[1, ..., 1]` of length `len(starts)` - The effective values are initialized as `start[i] = 0`, `end[i] = dims[i]` where - `dims` are the dimensions of `input` and `step[i] = `1. + The effective values are initialized as `start[i] = 0`, `ends[i] = dims[i]` where + `dims` are the dimensions of `input` and `steps[i] = 1`. - All negative elements of `axes` are made non-negatve by adding `r` to them, where + All negative elements of `axes` are made non-negative by adding `r` to them, where `r =rank(input)`. All negative values in `starts[i]` and `ends[i]` have `dims[axes[i]]` added to them, @@ -3380,10 +3393,10 @@ def Slice( The clamping for the adjusted `ends[i]` depends on the sign of `steps[i]` and must accommodate copying 0 through `dims[axes[i]]` elements, so for positive stepping - `end[axes[i]]` is clamped to `[0, dims[axes[i]]]`, while for negative stepping it + `ends[axes[i]]` is clamped to `[0, dims[axes[i]]]`, while for negative stepping it is clamped to `[-1, dims[axes[i]]-1]`. - Finally, `step[axes[i]] = steps[i]`. + Finally, `steps[axes[i]] = steps[i]`. For slicing to the end of a dimension with unknown size, it is recommended to pass in `INT_MAX` when slicing forward and 'INT_MIN' when slicing backward. @@ -3506,7 +3519,7 @@ def SoftmaxCrossEntropyLoss( * shape(labels): (N) where each value is 0 <= labels[i] <= C-1, or (N, D1, D2,..., Dk), with K >= 1 in case of K-dimensional loss. - The loss for one sample, l_i, can caculated as follows: + The loss for one sample, l_i, can calculated as follows: :: l[i][d1][d2]...[dk] = -y[i][c][d1][d2]..[dk], where i is the index of classes. diff --git a/onnxscript/onnx_opset/_impl/opset15.py b/onnxscript/onnx_opset/_impl/opset15.py index cfaecef198..38c235bced 100644 --- a/onnxscript/onnx_opset/_impl/opset15.py +++ b/onnxscript/onnx_opset/_impl/opset15.py @@ -138,7 +138,7 @@ def BatchNormalization( running_mean = running_mean * momentum + mean * (1 - momentum). training_mode: If set to true, it indicates BatchNormalization is being used - for training, and outputs 1, 2, 3, and 4 would be populated. + for training, and outputs 1 and 2 are to be computed. """ schema = get_schema("BatchNormalization", 15, "") diff --git a/onnxscript/onnx_opset/_impl/opset16.py b/onnxscript/onnx_opset/_impl/opset16.py index f34826eeeb..c90392d582 100644 --- a/onnxscript/onnx_opset/_impl/opset16.py +++ b/onnxscript/onnx_opset/_impl/opset16.py @@ -325,7 +325,7 @@ def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> If conditional Args: - cond: Condition for the if + cond: Condition for the if. The tensor must contain a single element. else_branch: Graph to run if condition is false. Has N outputs: values you wish to be live-out to the enclosing scope. The number of outputs must @@ -657,7 +657,7 @@ def PRelu(self, X: T_PRelu, slope: T_PRelu) -> T_PRelu: Args: X: (differentiable) Input tensor - slope: (differentiable) Slope tensor. The shape of slope can be smaller then + slope: (differentiable) Slope tensor. 
The shape of slope can be smaller than first input X; if so, its shape must be unidirectional broadcastable to X """ diff --git a/onnxscript/onnx_opset/_impl/opset17.py b/onnxscript/onnx_opset/_impl/opset17.py index 2144bda1bd..80b4b457c0 100644 --- a/onnxscript/onnx_opset/_impl/opset17.py +++ b/onnxscript/onnx_opset/_impl/opset17.py @@ -115,26 +115,30 @@ def DFT( [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][1]. For complex input, the following shape is expected: [batch_idx][signal_dim1][signal_dim2]...[signal_dimN][2]. The first - dimension is the batch dimension. The following N dimentions correspond + dimension is the batch dimension. The following N dimensions correspond to the signal's dimensions. The final dimension represents the real and imaginary parts of the value in that order. - dft_length: (optional, non-differentiable) The length of the signal.If - greater than the axis dimension, the signal will be zero-padded up to - dft_length. If less than the axis dimension, only the first dft_length - values will be used as the signal. It's an optional value. + dft_length: (optional, non-differentiable) The length of the signal as a + scalar. If greater than the axis dimension, the signal will be + zero-padded up to dft_length. If less than the axis dimension, only the + first dft_length values will be used as the signal. It's an optional + value. axis: The axis on which to perform the DFT. By default this value is set to 1, which corresponds to the first dimension after the batch index. + Negative value means counting dimensions from the back. Accepted range + is $[-r, -2] \cup [0, r-2]$ where `r = rank(input)`. The last dimension + is for representing complex numbers and thus is an invalid axis. inverse: Whether to perform the inverse discrete fourier transform. By default this value is set to 0, which corresponds to false. onesided: If onesided is 1, only values for w in [0, 1, 2, ..., floor(n_fft/2) + 1] are returned because the real-to-complex Fourier - transform satisfies the conjugate symmetry, i.e., X[m, w] = - X[m,w]=X[m,n_fft-w]*. Note if the input or window tensors are complex, - then onesided output is not possible. Enabling onesided with real inputs + transform satisfies the conjugate symmetry, i.e., X[m, w] = X[m, + n_fft-w]*. Note if the input or window tensors are complex, then + onesided output is not possible. Enabling onesided with real inputs performs a Real-valued fast Fourier transform (RFFT). When invoked with real or complex valued input, the default value is 0. Values can be 0 or 1. @@ -300,7 +304,9 @@ def LayerNormalization( Let `d[i]` indicate the i-th dimension of `X`. If `X`'s shape is `[d[0], ..., d[axis-1], d[axis], ..., d[rank-1]]`, the shape of `Mean` and `InvStdDev` is `[d[0], ..., d[axis-1], 1, ..., 1]`. - `Y` and `X` have the same shape. + `Y` and `X` have the same shape. This operator supports unidirectional broadcasting + (tensors `Scale` and `B` should be unidirectional broadcastable to tensor `X`); + for more details please check `Broadcasting in ONNX `_. Args: @@ -311,7 +317,7 @@ def LayerNormalization( B: (optional) Bias tensor. axis: The first normalization dimension. If rank(X) is r, axis' allowed - range is [-r, r]. Negative value means counting dimensions from the + range is [-r, r). Negative value means counting dimensions from the back. epsilon: The epsilon value to use to avoid division by zero. 
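The LayerNormalization description above — normalize over dimensions `[axis, rank)`, keep size-1 placeholders in `Mean`/`InvStdDev`, then apply unidirectionally broadcast `Scale` and `B` — maps to a short numpy sketch. This mirrors the documented two-stage computation, not the generated opset17 function itself:

```python
import numpy as np

def layer_norm(x, scale, bias, axis=-1, epsilon=1e-5):
    # Normalize over dimensions [axis, rank); Mean and InvStdDev keep
    # size-1 placeholders for the normalized dimensions.
    axis = axis % x.ndim
    dims = tuple(range(axis, x.ndim))
    mean = x.mean(axis=dims, keepdims=True)
    inv_std = 1.0 / np.sqrt(x.var(axis=dims, keepdims=True) + epsilon)
    # Scale and B broadcast unidirectionally to X.
    return (x - mean) * inv_std * scale + bias, mean, inv_std

x = np.random.rand(2, 3, 4).astype(np.float32)
y, mean, inv_std = layer_norm(x, scale=np.ones(4), bias=np.zeros(4), axis=-1)
print(y.shape, mean.shape, inv_std.shape)  # (2, 3, 4) (2, 3, 1) (2, 3, 1)
```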
diff --git a/onnxscript/onnx_opset/_impl/opset18.py b/onnxscript/onnx_opset/_impl/opset18.py index 46f483b624..c4154635d9 100644 --- a/onnxscript/onnx_opset/_impl/opset18.py +++ b/onnxscript/onnx_opset/_impl/opset18.py @@ -826,12 +826,13 @@ def ReduceL1( Computes the L1 norm of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields 0. - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -875,12 +876,13 @@ def ReduceL2( Computes the L2 norm of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields 0. + - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -924,12 +926,13 @@ def ReduceLogSum( Computes the log sum of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. + - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -973,12 +976,13 @@ def ReduceLogSumExp( Computes the log sum exponent of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or undefined otherwise. - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -1032,12 +1036,13 @@ def ReduceMax( Computes the max of the input tensor's elements along the provided axes. 
The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or the minimum value of the data type otherwise. + - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -1081,12 +1086,13 @@ def ReduceMean( Computes the mean of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields undefined. - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -1140,12 +1146,13 @@ def ReduceMin( Computes the min of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields plus infinity (if supported by the datatype) or the maximum value of the data type otherwise. + - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -1189,12 +1196,13 @@ def ReduceProd( Computes the product of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. Reduction over an empty set of values yields 1. - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -1238,12 +1246,13 @@ def ReduceSumSquare( Computes the sum square of the input tensor's elements along the provided axes. The resulting - tensor has the same rank as the input if keepdims equals 1. If keepdims equals 0, then + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are - valid. + valid. 
Reduction over an empty set of values yields 0. + - The above behavior is similar to numpy, with the exception that numpy defaults keepdims to - False instead of True. + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. Args: data: (differentiable) An input tensor. @@ -1528,8 +1537,8 @@ def ScatterElements( When `reduction` is set to some reduction function `f`, the update corresponding to the [i][j] entry is performed as below: :: - output[indices[i][j]][j] += f(output[indices[i][j]][j], updates[i][j]) if axis = 0, - output[i][indices[i][j]] += f(output[i][indices[i][j]], updates[i][j]) if axis = 1, + output[indices[i][j]][j] = f(output[indices[i][j]][j], updates[i][j]) if axis = 0, + output[i][indices[i][j]] = f(output[i][indices[i][j]], updates[i][j]) if axis = 1, where the `f` is `+`, `*`, `max` or `min` as specified. diff --git a/onnxscript/onnx_opset/_impl/opset19.py b/onnxscript/onnx_opset/_impl/opset19.py index d485bd0659..467c23917e 100644 --- a/onnxscript/onnx_opset/_impl/opset19.py +++ b/onnxscript/onnx_opset/_impl/opset19.py @@ -70,21 +70,28 @@ def AveragePool( the tensor according to kernel sizes, stride sizes, and pad lengths. average pooling consisting of computing the average on all values of a subset of the input tensor according to the kernel size and downsampling the - data into the output tensor Y for further processing. The output spatial shape will be following: + data into the output tensor Y for further processing. The output spatial shape is calculated differently + depending on whether explicit padding is used, where pads is employed, or auto padding is used, where auto_pad is utilized. + With explicit padding (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html?highlight=maxpool#torch.nn.MaxPool2d): ``` - output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1) + output_spatial_shape[i] = floor((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) ``` or ``` - output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i] + 1) + output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) ``` - if ceil_mode is enabled `pad_shape[i]` is the sum of pads along axis `i`. + if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. Sliding windows that would start in the right padded region are ignored. - `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following: + `auto_pad` is a DEPRECATED attribute. 
If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: ``` VALID: output_spatial_shape[i] = ceil((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) + 1) / strides_spatial_shape[i]) SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides_spatial_shape[i]) ``` + or when ceil_mode is disabled (https://www.tensorflow.org/api_docs/python/tf/keras/layers/AveragePooling2D): + ``` + VALID: output_spatial_shape[i] = floor((input_spatial_shape[i] - ((kernel_spatial_shape[i] - 1) * dilations[i] + 1)) / strides_spatial_shape[i]) + 1 + SAME_UPPER or SAME_LOWER: output_spatial_shape[i] = floor((input_spatial_shape[i] - 1) / strides_spatial_shape[i]) + 1 + ``` And pad shape will be following if `SAME_UPPER` or `SAME_LOWER`: ``` pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i] + ((kernel_spatial_shape[i] - 1) * dilations[i] + 1) - input_spatial_shape[i] @@ -777,7 +784,7 @@ def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> If conditional Args: - cond: Condition for the if + cond: Condition for the if. The tensor must contain a single element. else_branch: Graph to run if condition is false. Has N outputs: values you wish to be live-out to the enclosing scope. The number of outputs must diff --git a/onnxscript/onnx_opset/_impl/opset20.py b/onnxscript/onnx_opset/_impl/opset20.py new file mode 100644 index 0000000000..e05b5018a4 --- /dev/null +++ b/onnxscript/onnx_opset/_impl/opset20.py @@ -0,0 +1,675 @@ +# -------------------------------------------------------------------------- +# ⚠️ WARNING - AUTO-GENERATED CODE - DO NOT EDIT ⚠️ +# ⚙️ Generated by 'python -m opgen' +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +# pylint: disable=W0221,W0222,R0901,W0237 +# mypy: disable-error-code=override +# ruff: noqa: N801,E741 +# ruff: noqa: D214,D402,D405,D411,D412,D416,D417 +# -------------------------------------------------------------------------- + +from __future__ import annotations + +from typing import Optional, Tuple, TypeVar, Union + +from onnx import TensorProto +from onnx.defs import get_schema +from typing_extensions import TypeAlias + +from onnxscript.onnx_opset._impl.opset19 import Opset19 +from onnxscript.onnx_types import ( + BFLOAT16, + BOOL, + COMPLEX64, + COMPLEX128, + DOUBLE, + FLOAT, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT16, + INT8, + INT16, + INT32, + INT64, + STRING, + UINT8, + UINT16, + UINT32, + UINT64, +) +from onnxscript.values import Op, Opset + + +class Opset20(Opset19): + def __new__(cls): + return Opset.__new__(cls, "", 20) + + T1_AffineGrid = TypeVar("T1_AffineGrid", BFLOAT16, DOUBLE, FLOAT, FLOAT16) + + T2_AffineGrid: TypeAlias = INT64 + + def AffineGrid( + self, theta: T1_AffineGrid, size: T2_AffineGrid, *, align_corners: int = 0 + ) -> T1_AffineGrid: + r"""[🌐 AffineGrid(20)](https://onnx.ai/onnx/operators/onnx__AffineGrid.html#affinegrid-20 "Online Documentation") + + + Generates a 2D or 3D flow field (sampling grid), given a batch of affine matrices theta + (https://pytorch.org/docs/stable/generated/torch.nn.functional.affine_grid.html). + An affine matrix `theta` is applied to a position tensor represented in its homogeneous expression. 
Here is an example in 3D: + :: + + [r00, r01, r02, t0] [x] [x'] + [r10, r11, r12, t1] * [y] = [y'] + [r20, r21, r22, t2] [z] [z'] + [0, 0, 0, 1 ] [1] [1 ] + + + where `(x, y, z)` is the position in the original space, `(x', y', z')` is the position in the output space. + The last row is always `[0, 0, 0, 1]` and is not stored in the affine matrix. Therefore we have `theta` of shape `(N, 2, 3)` for 2D or `(N, 3, 4)` for 3D. + + Input `size` is used to define grid of positions evenly spaced in the original 2D or 3D space, with dimensions ranging from `-1` to `1`. + The output `grid` contains positions in the output space. + + When `align_corners=1`, consider `-1` and `1` to refer to the centers of the corner pixels (mark `v` in illustration). + :: + + v v v v + |-------------------|------------------| + -1 0 1 + + + When `align_corners=0`, consider `-1` and `1` to refer to the outer edge of the corner pixels. + :: + + v v v v + |------------------|-------------------| + -1 0 1 + + + + + Args: + theta: (non-differentiable) input batch of affine matrices with shape (N, 2, + 3) for 2D or (N, 3, 4) for 3D + + size: (non-differentiable) the target output image size (N, C, H, W) for 2D + or (N, C, D, H, W) for 3D + + align_corners: if align_corners=1, consider -1 and 1 to refer to the centers + of the corner pixels. if align_corners=0, consider -1 and 1 to refer to + the outer edge the corner pixels. + """ + + schema = get_schema("AffineGrid", 20, "") + op = Op(self, "AffineGrid", schema) + return op(*self._prepare_inputs(schema, theta, size), align_corners=align_corners) + + T1_ConstantOfShape: TypeAlias = INT64 + + T2_ConstantOfShape: TypeAlias = Union[ + BFLOAT16, + BOOL, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + INT16, + INT32, + INT64, + INT8, + UINT16, + UINT32, + UINT64, + UINT8, + ] + + def ConstantOfShape( + self, input: T1_ConstantOfShape, *, value: Optional[TensorProto] = None + ) -> T2_ConstantOfShape: + r"""[🌐 ConstantOfShape(20)](https://onnx.ai/onnx/operators/onnx__ConstantOfShape.html#constantofshape-20 "Online Documentation") + + + Generate a tensor with given value and shape. + + + Args: + input: 1D tensor. The shape of the expected output tensor. If empty tensor + is given, the output would be a scalar. All values must be >= 0. + + value: (Optional) The value of the output elements.Should be a one-element + tensor. If not specified, it defaults to a tensor of value 0 and + datatype float32 + """ + + schema = get_schema("ConstantOfShape", 20, "") + op = Op(self, "ConstantOfShape", schema) + return op(*self._prepare_inputs(schema, input), value=value) + + T1_DFT = TypeVar("T1_DFT", BFLOAT16, DOUBLE, FLOAT, FLOAT16) + + T2_DFT = TypeVar("T2_DFT", INT32, INT64) + + def DFT( + self, + input: T1_DFT, + dft_length: Optional[T2_DFT] = None, + axis: Optional[INT64] = None, + *, + inverse: int = 0, + onesided: int = 0, + ) -> T1_DFT: + r"""[🌐 DFT(20)](https://onnx.ai/onnx/operators/onnx__DFT.html#dft-20 "Online Documentation") + + Computes the discrete Fourier Transform (DFT) of the input. + + Assuming the input has shape `[M, N]`, where `N` is the dimension over which the + DFT is computed and `M` denotes the conceptual "all other dimensions," + the DFT `y[m, k]` of shape `[M, N]` is defined as + + $$y[m, k] = \sum_{n=0}^{N-1} e^{-2 \pi j \frac{k n}{N} } x[m, n] ,$$ + + and the inverse transform is defined as + + $$x[m, n] = \frac{1}{N} \sum_{k=0}^{N-1} e^{2 \pi j \frac{k n}{N} } y[m, k] ,$$ + + where $j$ is the imaginary unit. 
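Since the forward formula above is the standard DFT, numpy's FFT can serve as a reference for the new opset 20 signature. The snippet below checks a naive evaluation of the sum against `np.fft.fft` and shows the ONNX real/imaginary packing (illustrative layout only):

```python
import numpy as np

x = np.random.rand(3, 8).astype(np.float32)  # [M, N] real signal, N = 8

# y[m, k] = sum_n exp(-2*pi*j*k*n/N) * x[m, n], evaluated naively.
naive = np.stack(
    [
        sum(x[:, n] * np.exp(-2j * np.pi * k * n / 8) for n in range(8))
        for k in range(8)
    ],
    axis=1,
)
reference = np.fft.fft(x, axis=1)
print(np.allclose(naive, reference, atol=1e-4))  # True

# ONNX-style layout: [M, N, 2] with real and imaginary parts in that order.
onnx_layout = np.stack([reference.real, reference.imag], axis=-1)
print(onnx_layout.shape)                          # (3, 8, 2)
```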
+ + The actual shape of the output is specified in the "output" section. + + Reference: https://docs.scipy.org/doc/scipy/tutorial/fft.html + + + Args: + input: (non-differentiable) For real input, the following shape is expected: + `[signal_dim0][signal_dim1][signal_dim2]...[signal_dimN][1]`. For + complex input, the following shape is expected: + `[signal_dim0][signal_dim1][signal_dim2]...[signal_dimN][2]`. The final + dimension represents the real and imaginary parts of the value in that + order. + + dft_length: (optional, non-differentiable) The length of the signal as a + scalar. If greater than the axis dimension, the signal will be + zero-padded up to `dft_length`. If less than the axis dimension, only + the first `dft_length` values will be used as the signal. + + axis: (optional, non-differentiable) The axis as a scalar on which to + perform the DFT. Default is `-2` (last signal axis). Negative value + means counting dimensions from the back. Accepted range is $[-r, -2] + \cup [0, r-2]$ where `r = rank(input)`. The last dimension is for + representing complex numbers and thus is an invalid axis. + + inverse: Whether to perform the inverse discrete Fourier Transform. Default + is 0, which corresponds to `false`. + + onesided: If `onesided` is `1` and input is real, only values for `k` in + `[0, 1, 2, ..., floor(n_fft/2) + 1]` are returned because the + real-to-complex Fourier transform satisfies the conjugate symmetry, + i.e., `X[m, k] = X[m, n_fft-k]*`, where `m` denotes "all other + dimensions" DFT was not applied on. If the input tensor is complex, + onesided output is not possible. Value can be `0` or `1`. Default is + `0`. + """ + + schema = get_schema("DFT", 20, "") + op = Op(self, "DFT", schema) + return op( + *self._prepare_inputs(schema, input, dft_length, axis), + inverse=inverse, + onesided=onesided, + ) + + T_Gelu = TypeVar("T_Gelu", BFLOAT16, DOUBLE, FLOAT, FLOAT16) + + def Gelu(self, X: T_Gelu, *, approximate: str = "none") -> T_Gelu: + r"""[🌐 Gelu(20)](https://onnx.ai/onnx/operators/onnx__Gelu.html#gelu-20 "Online Documentation") + + + Gelu takes one input data (Tensor) and produces one + output data (Tensor) where the gaussian error linear units function, + $y = 0.5 * x * (1 + erf(x/sqrt(2)))$ is applied to the tensor elementwise. + If the attribute "approximate" is set to "tanh", the function estimation, + $y = 0.5 * x * (1 + Tanh(sqrt(2/\pi) * (x + 0.044715 * x^3)))$ is used and applied + to the tensor elementwise. + + + + Args: + X: (differentiable) Input tensor + + approximate: Gelu approximation algorithm: `"tanh"`, + `"none"`(default).`"none"`: do not use approximation.`"tanh"`: use tanh + approximation. + """ + + schema = get_schema("Gelu", 20, "") + op = Op(self, "Gelu", schema) + return op(*self._prepare_inputs(schema, X), approximate=approximate) + + T1_GridSample = TypeVar( + "T1_GridSample", + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + INT16, + INT32, + INT64, + INT8, + STRING, + UINT16, + UINT32, + UINT64, + UINT8, + ) + + T2_GridSample = TypeVar("T2_GridSample", DOUBLE, FLOAT, FLOAT16) + + def GridSample( + self, + X: T1_GridSample, + grid: T2_GridSample, + *, + align_corners: int = 0, + mode: str = "linear", + padding_mode: str = "zeros", + ) -> T1_GridSample: + r"""[🌐 GridSample(20)](https://onnx.ai/onnx/operators/onnx__GridSample.html#gridsample-20 "Online Documentation") + + + Given an input `X` and a flow-field `grid`, computes the output `Y` using `X` values and pixel locations from the `grid`. 
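# Illustrative 1D numpy sketch of the sentence above (an assumption-laden
# simplification, not the operator): values of X are sampled at normalized
# grid locations, here with nearest rounding and align_corners=1 semantics.
import numpy as np

X = np.array([10.0, 20.0, 30.0, 40.0, 50.0])  # one spatial axis, W = 5
grid = np.array([-1.0, 0.0, 1.0])             # normalized positions in [-1, 1]
ix = np.rint((grid + 1.0) / 2.0 * (len(X) - 1)).astype(int)
Y = X[ix]                                     # -> [10., 30., 50.]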
+ For spatial input `X` with shape (N, C, H, W), the `grid` will have shape (N, H_out, W_out, 2), + the output `Y` will have shape (N, C, H_out, W_out). For volumetric input `X` with shape (N, C, D, H, W), + the `grid` will have shape (N, D_out, H_out, W_out, 3), the output `Y` will have shape (N, C, D_out, H_out, W_out). + More generally, for an input `X` of rank r+2 with shape (N, C, d1, d2, ..., dr), + the `grid` will have shape (N, D1_out, D2_out, ..., Dr_out, r), the output `Y` will have shape (N, C, D1_out, D2_out, ..., Dr_out). + + The tensor `X` contains values at centers of square pixels (voxels, etc) locations such as (n, c, d1_in, d2_in, ..., dr_in). + The (n, d1_out, d2_out, ..., dr_out, :) values from the tensor `grid` are the normalized positions for interpolating the values + at the (n, c, d1_out, d2_out, ..., dr_out) locations from the output tensor `Y` using a specified interpolation method (the mode) + and a padding mode (for `grid` positions falling outside the 2-dimensional image). + + For example, the values in `grid[n, h_out, w_out, :]` are size-2 vectors specifying normalized positions in the 2-dimensional space of `X`. + They are used to interpolate output values of `Y[n, c, h_out, w_out]`. + + The GridSample operator is often used in doing grid generator and sampler in the + [Spatial Transformer Networks](https://arxiv.org/abs/1506.02025). + See also in [torch.nn.functional.grid_sample](https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html). + + + Args: + X: (differentiable) Input tensor of rank r+2 that has shape (N, C, D1, D2, + ..., Dr), where N is the batch size, C is the number of channels, D1, + D2, ..., Dr are the spatial dimensions. + + grid: (non-differentiable) Input offset of shape (N, D1_out, D2_out, ..., + Dr_out, r), where D1_out, D2_out, ..., Dr_out are the spatial dimensions + of the grid and output, and r is the number of spatial dimensions. Grid + specifies the sampling locations normalized by the input spatial + dimensions. Therefore, it should have most values in the range of [-1, + 1]. If the grid has values outside the range of [-1, 1], the + corresponding outputs will be handled as defined by padding_mode. + Following computer vision convention, the coordinates in the length-r + location vector are listed from the innermost tensor dimension to the + outermost, the opposite of regular tensor indexing. + + align_corners: If align_corners=1, the extrema (-1 and 1) are considered as + referring to the center points of the input's corner pixels (voxels, + etc.). If align_corners=0, they are instead considered as referring to + the corner points of the input's corner pixels (voxels, etc.), making + the sampling more resolution agnostic. + + mode: Three interpolation modes: linear (default), nearest and cubic. The + "linear" mode includes linear and N-linear interpolation modes depending + on the number of spatial dimensions of the input tensor (i.e. linear for + 1 spatial dimension, bilinear for 2 spatial dimensions, etc.). The + "cubic" mode also includes N-cubic interpolation modes following the + same rules. The "nearest" mode rounds to the nearest even index when the + sampling point falls halfway between two indices. + + padding_mode: Support padding modes for outside grid values: + `zeros`(default), `border`, `reflection`. 
zeros: use 0 for out-of-bound + grid locations, border: use border values for out-of-bound grid + locations, reflection: use values at locations reflected by the border + for out-of-bound grid locations. If index 0 represents the margin pixel, + the reflected value at index -1 will be the same as the value at index + 1. For location far away from the border, it will keep being reflected + until becoming in bound. If pixel location x = -3.5 reflects by border + -1 and becomes x' = 1.5, then reflects by border 1 and becomes x'' = + 0.5. + """ + + schema = get_schema("GridSample", 20, "") + op = Op(self, "GridSample", schema) + return op( + *self._prepare_inputs(schema, X, grid), + align_corners=align_corners, + mode=mode, + padding_mode=padding_mode, + ) + + T1_ImageDecoder: TypeAlias = UINT8 + + T2_ImageDecoder: TypeAlias = UINT8 + + def ImageDecoder( + self, encoded_stream: T1_ImageDecoder, *, pixel_format: str = "RGB" + ) -> T2_ImageDecoder: + r"""[🌐 ImageDecoder(20)](https://onnx.ai/onnx/operators/onnx__ImageDecoder.html#imagedecoder-20 "Online Documentation") + + Loads and decodes and image from a file. If it can't decode for any reason (e.g. corrupted encoded + stream, invalid format, it will return an empty matrix). + The following image formats are supported: + * BMP + * JPEG (note: Lossless JPEG support is optional) + * JPEG2000 + * TIFF + * PNG + * WebP + * Portable image format (PBM, PGM, PPM, PXM, PNM) + Decoded images follow a channel-last layout: (Height, Width, Channels). + **JPEG chroma upsampling method:** + When upsampling the chroma components by a factor of 2, the pixels are linearly interpolated so that the + centers of the output pixels are 1/4 and 3/4 of the way between input pixel centers. + When rounding, 0.5 is rounded down and up at alternative pixels locations to prevent bias towards + larger values (ordered dither pattern). + Considering adjacent input pixels A, B, and C, B is upsampled to pixels B0 and B1 so that + :: + + B0 = round_half_down((1/4) * A + (3/4) * B) + B1 = round_half_up((3/4) * B + (1/4) * C) + + + This method, is the default chroma upsampling method in the well-established libjpeg-turbo library, + also referred as "smooth" or "fancy" upsampling. + + + Args: + encoded_stream: (non-differentiable) Encoded stream + + pixel_format: Pixel format. Can be one of "RGB", "BGR", or "Grayscale". + """ + + schema = get_schema("ImageDecoder", 20, "") + op = Op(self, "ImageDecoder", schema) + return op(*self._prepare_inputs(schema, encoded_stream), pixel_format=pixel_format) + + T1_IsInf = TypeVar( + "T1_IsInf", + BFLOAT16, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + ) + + T2_IsInf: TypeAlias = BOOL + + def IsInf( + self, X: T1_IsInf, *, detect_negative: int = 1, detect_positive: int = 1 + ) -> T2_IsInf: + r"""[🌐 IsInf(20)](https://onnx.ai/onnx/operators/onnx__IsInf.html#isinf-20 "Online Documentation") + + Map infinity to true and other values to false. + + Args: + X: (non-differentiable) input + + detect_negative: (Optional) Whether map negative infinity to true. Default + to 1 so that negative infinity induces true. Set this attribute to 0 if + negative infinity should be mapped to false. + + detect_positive: (Optional) Whether map positive infinity to true. Default + to 1 so that positive infinity induces true. Set this attribute to 0 if + positive infinity should be mapped to false. 
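# Hedged numpy sketch of the attribute semantics described above
# (illustrative only, not the runtime implementation): the two detect flags
# combine into a single boolean mask.
import numpy as np

def isinf_ref(x, detect_negative=1, detect_positive=1):
    mask = np.zeros(x.shape, dtype=bool)
    if detect_positive:
        mask |= np.isposinf(x)
    if detect_negative:
        mask |= np.isneginf(x)
    return mask

print(isinf_ref(np.array([-np.inf, 1.0, np.inf]), detect_negative=0))
# -> [False False  True]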
+ """ + + schema = get_schema("IsInf", 20, "") + op = Op(self, "IsInf", schema) + return op( + *self._prepare_inputs(schema, X), + detect_negative=detect_negative, + detect_positive=detect_positive, + ) + + T1_IsNaN = TypeVar( + "T1_IsNaN", + BFLOAT16, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + ) + + T2_IsNaN: TypeAlias = BOOL + + def IsNaN(self, X: T1_IsNaN) -> T2_IsNaN: + r"""[🌐 IsNaN(20)](https://onnx.ai/onnx/operators/onnx__IsNaN.html#isnan-20 "Online Documentation") + + Returns which elements of the input are NaN. + + Args: + X: (non-differentiable) input + """ + + schema = get_schema("IsNaN", 20, "") + op = Op(self, "IsNaN", schema) + return op(*self._prepare_inputs(schema, X)) + + T_ReduceMax = TypeVar( + "T_ReduceMax", + BFLOAT16, + BOOL, + DOUBLE, + FLOAT, + FLOAT16, + INT32, + INT64, + INT8, + UINT32, + UINT64, + UINT8, + ) + + def ReduceMax( + self, + data: T_ReduceMax, + axes: Optional[INT64] = None, + *, + keepdims: int = 1, + noop_with_empty_axes: int = 0, + ) -> T_ReduceMax: + r"""[🌐 ReduceMax(20)](https://onnx.ai/onnx/operators/onnx__ReduceMax.html#reducemax-20 "Online Documentation") + + + Computes the max of the input tensor's elements along the provided axes. The resulting + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are + valid. Reduction over an empty set of values yields minus infinity (if supported by the datatype) or the minimum value of the data type otherwise. + + + If the input data type is Boolean, the comparison should consider `False < True`. + + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. + + Args: + data: (differentiable) An input tensor. + + axes: (optional, non-differentiable) Optional input list of integers, along + which to reduce. The default is to reduce over all the dimensions of the + input tensor if 'noop_with_empty_axes' is false, else act as an Identity + op when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] + where r = rank(data). + + keepdims: Keep the reduced dimension or not, default 1 means keep reduced + dimension. + + noop_with_empty_axes: Defines behavior if 'axes' is empty. Default behavior + with 'false' is to reduce all axes. When axes is empty and this + attribute is set to true, input tensor will not be reduced,and the + output tensor would be equivalent to input tensor. + """ + + schema = get_schema("ReduceMax", 20, "") + op = Op(self, "ReduceMax", schema) + return op( + *self._prepare_inputs(schema, data, axes), + keepdims=keepdims, + noop_with_empty_axes=noop_with_empty_axes, + ) + + T_ReduceMin = TypeVar( + "T_ReduceMin", + BFLOAT16, + BOOL, + DOUBLE, + FLOAT, + FLOAT16, + INT32, + INT64, + INT8, + UINT32, + UINT64, + UINT8, + ) + + def ReduceMin( + self, + data: T_ReduceMin, + axes: Optional[INT64] = None, + *, + keepdims: int = 1, + noop_with_empty_axes: int = 0, + ) -> T_ReduceMin: + r"""[🌐 ReduceMin(20)](https://onnx.ai/onnx/operators/onnx__ReduceMin.html#reducemin-20 "Online Documentation") + + + Computes the min of the input tensor's elements along the provided axes. The resulting + tensor has the same rank as the input if `keepdims` equals 1. If `keepdims` equals 0, then + the resulting tensor has the reduced dimension pruned. Input tensors of rank zero are + valid. 
Reduction over an empty set of values yields plus infinity (if supported by the datatype) or the maximum value of the data type otherwise. + + + If the input data type is Boolean, the comparison should consider `False < True`. + + The above behavior is similar to numpy, with the exception that numpy defaults `keepdims` + to `False` instead of `True`. + + Args: + data: (differentiable) An input tensor. + + axes: (optional, non-differentiable) Optional input list of integers, along + which to reduce. The default is to reduce over all the dimensions of the + input tensor if 'noop_with_empty_axes' is false, else act as an Identity + op when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] + where r = rank(data). + + keepdims: Keep the reduced dimension or not, default 1 means keep reduced + dimension. + + noop_with_empty_axes: Defines behavior if 'axes' is empty. Default behavior + with 'false' is to reduce all axes. When axes is empty and this + attribute is set to true, input tensor will not be reduced,and the + output tensor would be equivalent to input tensor. + """ + + schema = get_schema("ReduceMin", 20, "") + op = Op(self, "ReduceMin", schema) + return op( + *self._prepare_inputs(schema, data, axes), + keepdims=keepdims, + noop_with_empty_axes=noop_with_empty_axes, + ) + + T1_RegexFullMatch: TypeAlias = STRING + + T2_RegexFullMatch: TypeAlias = BOOL + + def RegexFullMatch( + self, X: T1_RegexFullMatch, *, pattern: Optional[str] = None + ) -> T2_RegexFullMatch: + r"""[🌐 RegexFullMatch(20)](https://onnx.ai/onnx/operators/onnx__RegexFullMatch.html#regexfullmatch-20 "Online Documentation") + + RegexFullMatch performs a full regex match on each element of the input tensor. If an element fully matches the regex pattern specified as an attribute, the corresponding element in the output is True and it is False otherwise. [RE2](https://github.com/google/re2/wiki/Syntax) regex syntax is used. + + Args: + X: (non-differentiable) Tensor with strings to match on. + + pattern: Regex pattern to match on. This must be valid RE2 syntax. + """ + + schema = get_schema("RegexFullMatch", 20, "") + op = Op(self, "RegexFullMatch", schema) + return op(*self._prepare_inputs(schema, X), pattern=pattern) + + T_StringConcat: TypeAlias = STRING + + def StringConcat(self, X: T_StringConcat, Y: T_StringConcat) -> T_StringConcat: + r"""[🌐 StringConcat(20)](https://onnx.ai/onnx/operators/onnx__StringConcat.html#stringconcat-20 "Online Documentation") + + StringConcat concatenates string tensors elementwise (with NumPy-style broadcasting support) + + Args: + X: (non-differentiable) Tensor to prepend in concatenation + + Y: (non-differentiable) Tensor to append in concatenation + """ + + schema = get_schema("StringConcat", 20, "") + op = Op(self, "StringConcat", schema) + return op(*self._prepare_inputs(schema, X, Y)) + + T1_StringSplit: TypeAlias = STRING + + T2_StringSplit: TypeAlias = STRING + + T3_StringSplit: TypeAlias = INT64 + + def StringSplit( + self, + X: T1_StringSplit, + *, + delimiter: Optional[str] = None, + maxsplit: Optional[int] = None, + ) -> Tuple[T2_StringSplit, T3_StringSplit]: + r"""[🌐 StringSplit(20)](https://onnx.ai/onnx/operators/onnx__StringSplit.html#stringsplit-20 "Online Documentation") + + StringSplit splits a string tensor's elements into substrings based on a delimiter attribute and a maxsplit attribute. + + The first output of this operator is a tensor of strings representing the substrings from splitting each input string on the `delimiter` substring. 
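# Illustrative Python sketch of the first output just described (a hedged
# reference, not the operator implementation): split each element, then pad
# with empty strings so the final dimension has a common length.
x = ["a,b,c", "d,e"]
parts = [s.split(",") for s in x]                      # [['a','b','c'], ['d','e']]
width = max(len(p) for p in parts)
padded = [p + [""] * (width - len(p)) for p in parts]  # [['a','b','c'], ['d','e','']]
counts = [len(p) for p in parts]                       # second output: [3, 2]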
This tensor has one additional rank compared to the input tensor in order to store the substrings for each input element (where the input tensor is not empty). Note that, in order to ensure the same number of elements are present in the final dimension, this tensor will pad empty strings as illustrated in the examples below. Consecutive delimiters are not grouped together and are deemed to delimit empty strings, except if the `delimiter` is unspecified or is the empty string (""). In the case where the `delimiter` is unspecified or the empty string, consecutive whitespace characters are regarded as a single separator and leading or trailing whitespace is removed in the output. + + The second output tensor represents the number of substrings generated. `maxsplit` can be used to limit the number of splits performed - after the `maxsplit`th split if the string is not fully split, the trailing suffix of input string after the final split point is also added. For elements where fewer splits are possible than specified in `maxsplit`, it has no effect. + + Args: + X: (non-differentiable) Tensor of strings to split. + + delimiter: Delimiter to split on. If left unset or set to the empty string + (""), the input is split on consecutive whitespace. + + maxsplit: Maximum number of splits (from left to right). If left unset (or + if the number of possible splits are less than maxsplit), it will make + as many splits as possible. Note that the maximum possible number of + substrings returned with `maxsplit` specified is `maxsplit+1` since the + remaining suffix after the `maxsplit`th split is included in the output. + """ + + schema = get_schema("StringSplit", 20, "") + op = Op(self, "StringSplit", schema) + return op(*self._prepare_inputs(schema, X), delimiter=delimiter, maxsplit=maxsplit) diff --git a/onnxscript/onnx_opset/_impl/opset7.py b/onnxscript/onnx_opset/_impl/opset7.py index 09ecf1d19b..e584d06c5a 100644 --- a/onnxscript/onnx_opset/_impl/opset7.py +++ b/onnxscript/onnx_opset/_impl/opset7.py @@ -941,7 +941,7 @@ def PRelu(self, X: T_PRelu, slope: T_PRelu) -> T_PRelu: Args: X: Input tensor - slope: Slope tensor. The shape of slope can be smaller then first input X; + slope: Slope tensor. The shape of slope can be smaller than first input X; if so, its shape must be unidirectional broadcastable to X """ diff --git a/onnxscript/onnx_opset/_impl/opset9.py b/onnxscript/onnx_opset/_impl/opset9.py index 26fd984973..7d99f002ff 100644 --- a/onnxscript/onnx_opset/_impl/opset9.py +++ b/onnxscript/onnx_opset/_impl/opset9.py @@ -666,7 +666,7 @@ def MaxUnpool( MaxUnpool essentially computes the partial inverse of the MaxPool op. The input information to this op is typically the output information from a MaxPool op. The first input tensor X is the tensor that needs to be unpooled, which is typically the pooled tensor (first output) - from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corrsponding + from MaxPool. The second input tensor, I, contains the indices to the (locally maximal) elements corresponding to the elements in the first input tensor X. Input tensor I is typically the second output of the MaxPool op. The third (optional) input is a tensor that specifies the output size of the unpooling operation. @@ -679,7 +679,7 @@ def MaxUnpool( known/predictable size. In addition to the inputs, MaxUnpool takes three attributes, namely kernel_shape, strides, and pads, - which define the exact unpooling op. 
The attributes typically have the same values as the corrsponding + which define the exact unpooling op. The attributes typically have the same values as the corresponding pooling op that the unpooling op is trying to invert. @@ -747,7 +747,7 @@ def MeanVarianceNormalization( Args: X: Input tensor - axes: A list of integers, along which to reduce. The default is to caculate + axes: A list of integers, along which to reduce. The default is to calculate along axes [0,2,3] for calculating mean and variance along each channel. Two variables with the same C-coordinate are associated with the same mean and variance. @@ -907,7 +907,7 @@ def PRelu(self, X: T_PRelu, slope: T_PRelu) -> T_PRelu: Args: X: (differentiable) Input tensor - slope: (differentiable) Slope tensor. The shape of slope can be smaller then + slope: (differentiable) Slope tensor. The shape of slope can be smaller than first input X; if so, its shape must be unidirectional broadcastable to X """ diff --git a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml1.py b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml1.py index a6e1ef38e1..a190eb17f9 100644 --- a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml1.py +++ b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml1.py @@ -238,7 +238,7 @@ def FeatureVectorizer( Concatenates input tensors into one continuous output. - All input shapes are 2-D and are concatenated along the second dimention. 1-D tensors are treated as [1,C]. + All input shapes are 2-D and are concatenated along the second dimension. 1-D tensors are treated as [1,C]. Inputs are copied to the output maintaining the order of the input arguments. All inputs must be integers or floats, while the output will be all floating point values. @@ -773,7 +773,7 @@ def TreeEnsembleClassifier( nodes_missing_value_tracks_true: For each node, define what to do in the presence of a missing value: if a value is missing (NaN), use the 'true' or 'false' branch based on the value in this array.
This attribute - may be left undefined, and the defalt value is false (0) for all nodes. + may be left undefined, and the default value is false (0) for all nodes. nodes_modes: The node kind, that is, the comparison to make at the node. There is no comparison to make at a leaf node.
One of 'BRANCH_LEQ', @@ -882,7 +882,7 @@ def TreeEnsembleRegressor( nodes_missing_value_tracks_true: For each node, define what to do in the presence of a NaN: use the 'true' (if the attribute value is 1) or 'false' (if the attribute value is 0) branch based on the value in this - array.
This attribute may be left undefined and the defalt value is + array.
This attribute may be left undefined and the default value is false (0) for all nodes. nodes_modes: The node kind, that is, the comparison to make at the node. diff --git a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml3.py b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml3.py index 8e2821a924..0092b4fd40 100644 --- a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml3.py +++ b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml3.py @@ -119,7 +119,7 @@ def TreeEnsembleClassifier( nodes_missing_value_tracks_true: For each node, define what to do in the presence of a missing value: if a value is missing (NaN), use the 'true' or 'false' branch based on the value in this array.
This attribute - may be left undefined, and the defalt value is false (0) for all nodes. + may be left undefined, and the default value is false (0) for all nodes. nodes_modes: The node kind, that is, the comparison to make at the node. There is no comparison to make at a leaf node.
One of 'BRANCH_LEQ', @@ -224,13 +224,13 @@ def TreeEnsembleRegressor( aggregate_function: Defines how to aggregate leaf values within a target.
One of 'AVERAGE,' 'SUM,' 'MIN,' 'MAX.' - base_values: Base values for classification, added to final class score; the - size must be the same as the classes or can be left unassigned (assumed - 0) + base_values: Base values for regression, added to final prediction after + applying aggregate_function; the size must be the same as the classes or + can be left unassigned (assumed 0) - base_values_as_tensor: Base values for classification, added to final class - score; the size must be the same as the classes or can be left - unassigned (assumed 0) + base_values_as_tensor: Base values for regression, added to final prediction + after applying aggregate_function; the size must be the same as the + classes or can be left unassigned (assumed 0) n_targets: The total number of targets. @@ -247,7 +247,7 @@ def TreeEnsembleRegressor( nodes_missing_value_tracks_true: For each node, define what to do in the presence of a NaN: use the 'true' (if the attribute value is 1) or 'false' (if the attribute value is 0) branch based on the value in this - array.
This attribute may be left undefined and the defalt value is + array.
This attribute may be left undefined and the default value is false (0) for all nodes. nodes_modes: The node kind, that is, the comparison to make at the node. diff --git a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml4.py b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml4.py new file mode 100644 index 0000000000..552e545d75 --- /dev/null +++ b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml4.py @@ -0,0 +1,129 @@ +# -------------------------------------------------------------------------- +# ⚠️ WARNING - AUTO-GENERATED CODE - DO NOT EDIT ⚠️ +# ⚙️ Generated by 'python -m opgen' +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +# pylint: disable=W0221,W0222,R0901,W0237 +# mypy: disable-error-code=override +# ruff: noqa: N801,E741 +# ruff: noqa: D214,D402,D405,D411,D412,D416,D417 +# -------------------------------------------------------------------------- + +from __future__ import annotations + +from typing import Optional, Sequence, TypeVar, Union + +from onnx import TensorProto +from onnx.defs import get_schema +from typing_extensions import TypeAlias + +from onnxscript.onnx_opset._impl.opset_ai_onnx_ml3 import Opset_ai_onnx_ml3 +from onnxscript.onnx_types import DOUBLE, FLOAT, INT16, INT32, INT64, STRING +from onnxscript.values import Op, Opset + + +class Opset_ai_onnx_ml4(Opset_ai_onnx_ml3): + def __new__(cls): + return Opset.__new__(cls, "ai.onnx.ml", 4) + + T1_LabelEncoder = TypeVar("T1_LabelEncoder", DOUBLE, FLOAT, INT16, INT32, INT64, STRING) + + T2_LabelEncoder: TypeAlias = Union[DOUBLE, FLOAT, INT16, INT32, INT64, STRING] + + def LabelEncoder( + self, + X: T1_LabelEncoder, + *, + default_float: float = -0.0, + default_int64: int = -1, + default_string: str = "_Unused", + default_tensor: Optional[TensorProto] = None, + keys_floats: Optional[Sequence[float]] = None, + keys_int64s: Optional[Sequence[int]] = None, + keys_strings: Optional[Sequence[str]] = None, + keys_tensor: Optional[TensorProto] = None, + values_floats: Optional[Sequence[float]] = None, + values_int64s: Optional[Sequence[int]] = None, + values_strings: Optional[Sequence[str]] = None, + values_tensor: Optional[TensorProto] = None, + ) -> T2_LabelEncoder: + r"""[🌐 ai.onnx.ml::LabelEncoder(4)](https://onnx.ai/onnx/operators/onnx_aionnxml_LabelEncoder.html#labelencoder-4 "Online Documentation") + + + Maps each element in the input tensor to another value. + + The mapping is determined by the two parallel attributes, 'keys_*' and + 'values_*' attribute. The i-th value in the specified 'keys_*' attribute + would be mapped to the i-th value in the specified 'values_*' attribute. It + implies that input's element type and the element type of the specified + 'keys_*' should be identical while the output type is identical to the + specified 'values_*' attribute. Note that the 'keys_*' and 'values_*' attributes + must have the same length. If an input element can not be found in the + specified 'keys_*' attribute, the 'default_*' that matches the specified + 'values_*' attribute may be used as its output value. The type of the 'default_*' + attribute must match the 'values_*' attribute chosen. + + Let's consider an example which maps a string tensor to an integer tensor. + Assume and 'keys_strings' is ["Amy", "Sally"], 'values_int64s' is [5, 6], + and 'default_int64' is '-1'. 
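# A hedged Python sketch of the lookup just described (illustrative only):
# the parallel keys/values behave like a dict with a default for misses.
mapping = dict(zip(["Amy", "Sally"], [5, 6]))
out = [mapping.get(name, -1) for name in ["Dori", "Amy", "Amy", "Sally", "Sally"]]
# out == [-1, 5, 5, 6, 6], matching the result stated next.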
The input ["Dori", "Amy", "Amy", "Sally", + "Sally"] would be mapped to [-1, 5, 5, 6, 6]. + + Since this operator is an one-to-one mapping, its input and output shapes + are the same. Notice that only one of 'keys_*'/'values_*' can be set. + + Float keys with value 'NaN' match any input 'NaN' value regardless of bit + value. If a key is repeated, the last key takes precedence. + + + Args: + X: Input data. It must have the same element type as the keys_* attribute + set. + + default_float: A float. + + default_int64: An integer. + + default_string: A string. + + default_tensor: A default tensor. {"_Unused"} if values_* has string type, + {-1} if values_* has integral type, and {-0.f} if values_* has float + type. + + keys_floats: A list of floats. + + keys_int64s: A list of ints. + + keys_strings: A list of strings. + + keys_tensor: Keys encoded as a 1D tensor. One and only one of 'keys_*'s + should be set. + + values_floats: A list of floats. + + values_int64s: A list of ints. + + values_strings: A list of strings. + + values_tensor: Values encoded as a 1D tensor. One and only one of + 'values_*'s should be set. + """ + + schema = get_schema("LabelEncoder", 4, "ai.onnx.ml") + op = Op(self, "LabelEncoder", schema) + return op( + *self._prepare_inputs(schema, X), + default_float=default_float, + default_int64=default_int64, + default_string=default_string, + default_tensor=default_tensor, + keys_floats=keys_floats, + keys_int64s=keys_int64s, + keys_strings=keys_strings, + keys_tensor=keys_tensor, + values_floats=values_floats, + values_int64s=values_int64s, + values_strings=values_strings, + values_tensor=values_tensor, + ) diff --git a/pyproject.toml b/pyproject.toml index e27bbc48d9..9b708d3002 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,11 +23,10 @@ classifiers = [ "Programming Language :: Python :: 3.11", "License :: OSI Approved :: MIT License", ] -dependencies = ["numpy", "onnx>=1.14", "typing_extensions"] +dependencies = ["numpy", "onnx>=1.15", "typing_extensions"] [tool.setuptools.packages.find] include = ["onnxscript*"] -# Tests are by default excluded: https://setuptools.pypa.io/en/latest/userguide/package_discovery.html [tool.setuptools.package-data] onnxscript = ["py.typed"]