Commit dcca04f

Merge branch 'main' into torchscriptgraph_initializer
2 parents c78058b + 5ba4fc7 commit dcca04f

3 files changed: +139 -44 lines changed

onnxscript/function_libs/torch_aten/ops/core.py

Lines changed: 87 additions & 36 deletions
@@ -2039,6 +2039,8 @@ def aten_expand(self: TTensor, size: TInt) -> TTensor:
     """expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a)"""

     size = op.Cast(size, to=INT64.dtype)
+    # To support -1 dim.
+    size = op.Abs(size)
     return op.Expand(self, size)

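A quick illustration (not part of this commit, assuming PyTorch expand semantics): in torch.Tensor.expand a size of -1 means "keep this dimension", and ONNX Expand broadcasts multidirectionally, so a target dimension of 1 (= Abs(-1)) also preserves the input dimension.

import torch

x = torch.rand(3, 1, 5)
# -1 keeps the last dimension; dim 1 is broadcast from 1 to 4.
assert x.expand(3, 4, -1).shape == (3, 4, 5)
# The exported graph instead passes Abs(size) = (3, 4, 1) to ONNX Expand;
# broadcasting against the input shape (3, 1, 5) restores the 5.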
@@ -2893,17 +2895,27 @@ def aten_kthvalue(
     raise NotImplementedError()


+@torch_op("aten::layer_norm", trace_only=True)
 def aten_layer_norm(
-    input: TensorType,
+    input: TReal,
     normalized_shape: Sequence[int],
-    weight: Optional[TensorType] = None,
-    bias: Optional[TensorType] = None,
+    weight: Optional[TReal] = None,
+    bias: Optional[TReal] = None,
     eps: float = 1e-05,
-    cudnn_enable: bool = True,
-) -> TensorType:
+) -> TReal:
     """layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor"""

-    raise NotImplementedError()
+    axes_list = [-i for i in range(len(normalized_shape), 0, -1)]
+    start_axis = axes_list[0]
+    if not op.OptionalHasElement(weight):
+        one = op.Constant(value_float=1.0)
+        weight = op.Expand(one, op.Shape(input, start=start_axis))
+    if not op.OptionalHasElement(bias):
+        zero = op.Constant(value_float=0.0)
+        bias = op.Expand(zero, op.Shape(input, start=start_axis))
+
+    result, _, _ = op.LayerNormalization(input, weight, bias, axis=start_axis, epsilon=eps)
+    return result


 def aten_lcm(self: TensorType, other: TensorType) -> TensorType:
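Illustrative check (not from this commit): when weight or bias is omitted, the function above synthesizes ones/zeros over the normalized dimensions, which matches layer_norm's default affine behaviour.

import torch

x = torch.rand(2, 3, 5)
normalized_shape = (3, 5)

no_affine = torch.nn.functional.layer_norm(x, normalized_shape)
default_affine = torch.nn.functional.layer_norm(
    x,
    normalized_shape,
    weight=torch.ones(normalized_shape),
    bias=torch.zeros(normalized_shape),
)
torch.testing.assert_close(no_affine, default_affine)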
@@ -3257,10 +3269,58 @@ def aten_matrix_power(self: TensorType, n: int) -> TensorType:
     raise NotImplementedError()


-def aten_max(self: TensorType) -> TensorType:
+@torch_op("aten::max", trace_only=True)
+def aten_max(
+    self: TReal, dim_or_other: Union[TReal, INT64] = None, keepdim: BOOL = None
+) -> TReal:
     """max(Tensor self) -> Tensor"""

-    raise NotImplementedError()
+    self_rank = op.Size(op.Shape(self))
+    if self_rank == 0:
+        self = op.Reshape(self, op.Constant(value_ints=[-1]))
+
+    output = 1
+
+    if op.OptionalHasElement(dim_or_other):
+        if isinstance(dim_or_other, int):
+            if not op.OptionalHasElement(keepdim):
+                keepdim = False
+            result, indices = _aten_max_with_dim(self, dim_or_other, keepdim)
+            output = 2
+        else:  # dim_or_other is tensor
+            result = _aten_max_with_other(self, dim_or_other)
+    else:
+        result = _aten_max_with_no_dim(self)
+
+    if self_rank == 0:
+        result = op.Squeeze(result)
+
+    if output == 2:
+        if self_rank == 0:
+            indices = op.Squeeze(indices)  # type: ignore[has-type]
+        return result, indices
+    return result
+
+
+@torch_op("aten::max", overload=True)
+def _aten_max_with_no_dim(self: TReal) -> TReal:
+    result = op.ReduceMax(self, keepdims=0)
+    return result
+
+
+@torch_op("aten::max", overload=True)
+def _aten_max_with_other(self: TReal, other: TReal) -> TReal:
+    result = op.Max(self, other)
+    return result
+
+
+@torch_op("aten::max", overload=True)
+# def _aten_max_with_dim(self: TReal, dim: int, keepdim: bool) -> tuple[TReal, TInt]:
+def _aten_max_with_dim(self: TReal, dim: int, keepdim: bool):
+    dims = op.Reshape(dim, op.Constant(value_ints=[-1]))
+    result = op.ReduceMax(self, dims, keepdims=keepdim)
+    indices = op.ArgMax(self, axis=dim, keepdims=keepdim)
+    return result, indices


 def aten_max_pool1d(
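For reference (illustrative only), these are the three torch.max call patterns that the trace-only aten_max above dispatches between: full reduction, reduction along a dim with indices, and elementwise maximum.

import torch

x = torch.tensor([[1.0, 5.0], [3.0, 2.0]])

full = torch.max(x)                             # reduce over all elements -> tensor(5.)
values, indices = torch.max(x, dim=1)           # per-dim reduce -> (values, argmax indices)
elementwise = torch.max(x, torch.ones_like(x))  # elementwise maximum with another tensor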
@@ -3918,12 +3978,13 @@ def aten_native_layer_norm(
     # where D is the dimension of normalized_shape. For example, if normalized_shape is
     # (3, 5) (a 2-dimensional shape), the mean and standard-deviation are computed
     # over the last 2 dimensions of the input (i.e. input.mean((-2, -1))).
-    axes = [-i for i in range(len(normalized_shape), 0, -1)]
-    if weight is None:
+    axes_list = [-i for i in range(len(normalized_shape), 0, -1)]
+    axes = op.Constant(value_ints=axes_list)
+    if not op.OptionalHasElement(weight):
         weight = op.CastLike(1, input)
-    if bias is None:
+    if not op.OptionalHasElement(bias):
         bias = op.CastLike(0, input)
-    return _aten_native_layer_norm_onnx(input, weight, bias, axes=axes, eps=eps)
+    return _aten_native_layer_norm_onnx(input, weight, bias, axes, eps)


 @torch_op("aten::native_layer_norm", overload=True)
@@ -3936,18 +3997,18 @@ def _aten_native_layer_norm_onnx(
 ) -> Tuple[TReal, TReal, TReal]:

     # FIXME(justinchuby): Use opset18 when it is supported by onnxruntime
-    mean = opset17.ReduceMean(input, axes=axes)
-    numerator = opset17.Sub(input, mean)
-    power_num = opset17.Pow(numerator, 2.0)
-    variance = opset17.ReduceMean(power_num, axes=axes)
-    variance_eps = opset17.Add(variance, eps)
-    denominator = opset17.Sqrt(variance_eps)
-    result = opset17.Div(numerator, denominator)
-    weight = opset17.CastLike(weight, result)
-    result = opset17.Mul(result, weight)
-    bias = opset17.CastLike(bias, result)
-    result = opset17.Add(result, bias)
-    rdenominator = opset17.Reciprocal(denominator)
+    mean = op.ReduceMean(input, axes)
+    numerator = op.Sub(input, mean)
+    power_num = op.Pow(numerator, 2.0)
+    variance = op.ReduceMean(power_num, axes)
+    variance_eps = op.Add(variance, eps)
+    denominator = op.Sqrt(variance_eps)
+    result = op.Div(numerator, denominator)
+    weight = op.CastLike(weight, result)
+    result = op.Mul(result, weight)
+    bias = op.CastLike(bias, result)
+    result = op.Add(result, bias)
+    rdenominator = op.Reciprocal(denominator)
     return result, mean, rdenominator

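An illustrative numerical check (not part of the diff) that the ReduceMean/Sub/Pow/Sqrt/Div chain above is the standard layer-norm decomposition, with the reciprocal of the denominator returned as the rstd output.

import torch

x = torch.rand(2, 3, 5)
eps = 1e-5
dims = (-2, -1)  # normalized_shape == (3, 5)

mean = x.mean(dim=dims, keepdim=True)
variance = ((x - mean) ** 2).mean(dim=dims, keepdim=True)
denominator = torch.sqrt(variance + eps)
result = (x - mean) / denominator

torch.testing.assert_close(result, torch.nn.functional.layer_norm(x, (3, 5), eps=eps))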
@@ -5053,20 +5114,10 @@ def aten_square(self: TensorType) -> TensorType:
     raise NotImplementedError()


-@torch_op("aten::squeeze", trace_only=True)
-def aten_squeeze(self: TTensor, dim: Optional[int] = None) -> TTensor:
+def aten_squeeze(self: TensorType) -> TensorType:
     """squeeze(Tensor(a) self) -> Tensor(a)"""

-    if op.OptionalHasElement(dim):
-        rank = op.Size(op.Shape(self))
-        if rank == 0:
-            self = op.Reshape(self, op.Constant(value_ints=[-1]))
-        dims = op.Reshape(dim, op.Constant(value_ints=[-1]))
-        result = op.Squeeze(self, dims)
-    else:
-        result = op.Squeeze(self)
-
-    return result
+    raise NotImplementedError()


 def aten_squeeze_copy(self: TensorType) -> TensorType:

onnxscript/tests/function_libs/torch_aten/extra_opinfo.py

Lines changed: 50 additions & 0 deletions
@@ -159,6 +159,44 @@ def sample_inputs_convolution(op_info, device, dtype, requires_grad, **kwargs):
     )


+def sample_inputs_layer_norm(
+    op_info, device, dtype, requires_grad, **kwargs  # pylint: disable=unused-argument
+):
+    make_arg = functools.partial(
+        torch_testing.make_tensor, device=device, dtype=dtype, requires_grad=requires_grad
+    )
+
+    # Ordered as input shape, normalized_shape, eps
+    cases: tuple[tuple[int], tuple[int], float] = (  # type: ignore[assignment]
+        ((1, 2, 3), (1, 2, 3), 0.5),
+        ((2, 2, 3), (2, 3), -0.5),
+        ((1,), (1,), 1e-5),
+        ((1, 2), (2,), 1e-5),
+        ((0, 1), (1,), 1e-5),
+    )
+
+    for input_shape, normalized_shape, eps in cases:  # type: ignore[misc]
+        # Shape of weight and bias should be the same as normalized_shape
+        weight = make_arg(normalized_shape)  # type: ignore[has-type]
+        bias = make_arg(normalized_shape)  # type: ignore[has-type]
+        yield opinfo_core.SampleInput(
+            make_arg(input_shape),  # type: ignore[has-type]
+            args=(normalized_shape, weight, bias, eps),  # type: ignore[has-type]
+        )
+        yield opinfo_core.SampleInput(
+            make_arg(input_shape),  # type: ignore[has-type]
+            args=(normalized_shape, None, bias, eps),  # type: ignore[has-type]
+        )
+        yield opinfo_core.SampleInput(
+            make_arg(input_shape),  # type: ignore[has-type]
+            args=(normalized_shape, weight, None, eps),  # type: ignore[has-type]
+        )
+        yield opinfo_core.SampleInput(
+            make_arg(input_shape),  # type: ignore[has-type]
+            args=(normalized_shape, None, None, eps),  # type: ignore[has-type]
+        )
+
+
 OP_DB: List[opinfo_core.OpInfo] = [
     opinfo_core.OpInfo(
         "convolution",
@@ -184,4 +222,16 @@ def sample_inputs_convolution(op_info, device, dtype, requires_grad, **kwargs):
         skips=(),
         supports_out=False,
     ),
+    opinfo_core.OpInfo(
+        "layer_norm",
+        aliases=("layer_norm",),
+        aten_name="layer_norm",
+        dtypes=common_dtype.floating_and_complex_types_and(torch.int64, torch.bfloat16),
+        sample_inputs_func=sample_inputs_layer_norm,
+        supports_forward_ad=True,
+        supports_fwgrad_bwgrad=True,
+        gradcheck_nondet_tol=common_utils.GRADCHECK_NONDET_TOL,
+        skips=(),
+        supports_out=False,
+    ),
 ]
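Each generated SampleInput above corresponds to a call of the following shape (illustrative only; values taken from the first case), with weight and bias independently dropped so the default-initialization branches in aten_layer_norm are exercised.

import torch

input_shape, normalized_shape, eps = (1, 2, 3), (1, 2, 3), 0.5
x = torch.rand(input_shape)
weight = torch.rand(normalized_shape)
bias = torch.rand(normalized_shape)

for w, b in [(weight, bias), (None, bias), (weight, None), (None, None)]:
    torch.nn.functional.layer_norm(x, normalized_shape, w, b, eps)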

onnxscript/tests/function_libs/torch_aten/ops_correctness_test.py

Lines changed: 2 additions & 8 deletions
@@ -398,6 +398,8 @@ def _where_input_wrangler(
     "convolution": core_ops.aten_convolution,
     "empty_like": core_ops.aten_empty_like,
     "index_select": core_ops.aten_index_select,
+    "layer_norm": core_ops.aten_layer_norm,
+    "max": core_ops.aten_max,
     "native_layer_norm": core_ops.aten_native_layer_norm,
     "new_empty": core_ops.aten_new_empty,
     "new_empty_strided": core_ops.aten_new_empty_strided,
@@ -412,7 +414,6 @@ def _where_input_wrangler(
     ),
     "ones_like": core_ops.aten_ones_like,
     "slice": core_ops.aten_slice,
-    "squeeze": core_ops.aten_squeeze,
     "sum": (core_ops.aten_sum_dim_IntList, _sum_input_wrangler),
     "transpose": core_ops.aten_transpose,
     "zeros_like": core_ops.aten_zeros_like,
@@ -557,13 +558,6 @@ def _where_input_wrangler(
         matcher=lambda sample: len(sample.args[0]) == 0,
         reason="Empty perm is not supported",
     ),
-    skip(
-        "squeeze",
-        matcher=lambda sample: len(sample.args) > 0
-        and len(sample.input.shape) > 0
-        and sample.input.shape[sample.args[0]] != 1,
-        reason="Cannot select an axis to squeeze out which has size not equal to one",
-    ),
 )

 duplicate_opinfo(
