@@ -2893,17 +2893,27 @@ def aten_kthvalue(
28932893 raise NotImplementedError ()
28942894
28952895
@torch_op("aten::layer_norm", trace_only=True)
def aten_layer_norm(
    input: TReal,
    normalized_shape: Sequence[int],
    weight: Optional[TReal] = None,
    bias: Optional[TReal] = None,
    eps: float = 1e-05,
    cudnn_enable: bool = True,  # kept to match the ATen schema; has no ONNX equivalent
) -> TReal:
    """layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor"""
    # trace_only: normalized_shape is a Python list evaluated at trace time, so the
    # normalization starts at the axis counted from the end of the input shape.
    start_axis = -len(normalized_shape)

    # In trace-only mode an omitted optional input is a Python None, so test with
    # `is None` (op.OptionalHasElement would emit a graph node, not a Python bool).
    if weight is None:
        # Default scale: ones broadcast over the normalized dimensions.
        one = op.Constant(value_float=1.0)
        weight = op.Expand(one, op.Shape(input, start=start_axis))
    if bias is None:
        # Default shift: zeros broadcast over the normalized dimensions.
        zero = op.Constant(value_float=0.0)
        bias = op.Expand(zero, op.Shape(input, start=start_axis))

    # LayerNormalization also returns mean and inv-std-dev; aten::layer_norm
    # only exposes the normalized output.
    result, _, _ = op.LayerNormalization(input, weight, bias, axis=start_axis, epsilon=eps)
    return result
29072917
29082918
29092919def aten_lcm (self : TensorType , other : TensorType ) -> TensorType :
@@ -3966,12 +3976,13 @@ def aten_native_layer_norm(
39663976 # where D is the dimension of normalized_shape. For example, if normalized_shape is
39673977 # (3, 5) (a 2-dimensional shape), the mean and standard-deviation are computed
39683978 # over the last 2 dimensions of the input (i.e. input.mean((-2, -1))).
3969- axes = [- i for i in range (len (normalized_shape ), 0 , - 1 )]
3970- if weight is None :
3979+ axes_list = [- i for i in range (len (normalized_shape ), 0 , - 1 )]
3980+ axes = op .Constant (value_ints = axes_list )
3981+ if not op .OptionalHasElement (weight ):
39713982 weight = op .CastLike (1 , input )
3972- if bias is None :
3983+ if not op . OptionalHasElement ( bias ) :
39733984 bias = op .CastLike (0 , input )
3974- return _aten_native_layer_norm_onnx (input , weight , bias , axes = axes , eps = eps )
3985+ return _aten_native_layer_norm_onnx (input , weight , bias , axes , eps )
39753986
39763987
39773988@torch_op ("aten::native_layer_norm" , overload = True )
@@ -3984,18 +3995,18 @@ def _aten_native_layer_norm_onnx(
39843995) -> Tuple [TReal , TReal , TReal ]:
39853996
39863997 # FIXME(justinchuby): Use opset18 when it is supported by onnxruntime
3987- mean = opset17 .ReduceMean (input , axes = axes )
3988- numerator = opset17 .Sub (input , mean )
3989- power_num = opset17 .Pow (numerator , 2.0 )
3990- variance = opset17 .ReduceMean (power_num , axes = axes )
3991- variance_eps = opset17 .Add (variance , eps )
3992- denominator = opset17 .Sqrt (variance_eps )
3993- result = opset17 .Div (numerator , denominator )
3994- weight = opset17 .CastLike (weight , result )
3995- result = opset17 .Mul (result , weight )
3996- bias = opset17 .CastLike (bias , result )
3997- result = opset17 .Add (result , bias )
3998- rdenominator = opset17 .Reciprocal (denominator )
3998+ mean = op .ReduceMean (input , axes )
3999+ numerator = op .Sub (input , mean )
4000+ power_num = op .Pow (numerator , 2.0 )
4001+ variance = op .ReduceMean (power_num , axes )
4002+ variance_eps = op .Add (variance , eps )
4003+ denominator = op .Sqrt (variance_eps )
4004+ result = op .Div (numerator , denominator )
4005+ weight = op .CastLike (weight , result )
4006+ result = op .Mul (result , weight )
4007+ bias = op .CastLike (bias , result )
4008+ result = op .Add (result , bias )
4009+ rdenominator = op .Reciprocal (denominator )
39994010 return result , mean , rdenominator
40004011
40014012
0 commit comments