
Commit c29c0b8

dylanbespalko authored and facebook-github-bot committed
Improved speed of Frobenius norm for non-complex dtypes (pytorch#30871)
Summary:
In-tree changes to PyTorch to support complex numbers are being submitted here. Out-of-tree support for CUDA complex numbers is here: [pytorch-cuda-strided-complex extension](https://gitlab.com/pytorch-complex/pytorch-cuda-strided-complex)

Changes:
- [x] Fixed the performance issue raised in pytorch#30704 so that non-complex dtypes no longer call `conj()` and `real()`.
- [x] Fixed the tensor_to_numpy() conversion, likely broken by a `checkBackend()` call in pytorch#27064.
- [x] Fixed some ReduceOps and TensorCompare ops that recently added a `checkBackend()`:
  - `checkBackend()` is replaced with a device type check and a layout check.
  - This ensures the ComplexCPU Type ID is supported.
- [x] Added AVX support for complex `exp()`, as requested in pytorch#755.

Pull Request resolved: pytorch#30871
Differential Revision: D19200726
Pulled By: ezyang
fbshipit-source-id: d7e1be0b0a89c5d6e5f4a68ce5fcd2adc5b88277
1 parent 87ab293 commit c29c0b8

File tree: 8 files changed, +101 −52 lines

aten/src/ATen/cpu/vec256/vec256_complex_double.h

Lines changed: 8 additions & 1 deletion
@@ -202,7 +202,14 @@ template <> class Vec256<std::complex<double>> {
     AT_ERROR("not supported for complex numbers");
   }
   Vec256<std::complex<double>> exp() const {
-    return map(std::exp);
+    //exp(a + bi)
+    // = exp(a)*(cos(b) + sin(b)i)
+    auto exp = Sleef_expd4_u10(values);                              //exp(a)           exp(b)
+    exp = _mm256_blend_pd(exp, _mm256_permute_pd(exp, 0x05), 0x0A);  //exp(a)           exp(a)
+
+    auto sin_cos = Sleef_sincosd4_u10(values);                       //[sin(a), cos(a)] [sin(b), cos(b)]
+    auto cos_sin = _mm256_blend_pd(sin_cos.y, sin_cos.x, 0x0A);      //cos(b)           sin(b)
+    return _mm256_mul_pd(exp, cos_sin);
   }
   Vec256<std::complex<double>> expm1() const {
     AT_ERROR("not supported for complex numbers");

aten/src/ATen/cpu/vec256/vec256_complex_float.h

Lines changed: 8 additions & 1 deletion
@@ -240,7 +240,14 @@ template <> class Vec256<std::complex<float>> {
     AT_ERROR("not supported for complex numbers");
   }
   Vec256<std::complex<float>> exp() const {
-    return map(std::exp);
+    //exp(a + bi)
+    // = exp(a)*(cos(b) + sin(b)i)
+    auto exp = Sleef_expf8_u10(values);                              //exp(a)           exp(b)
+    exp = _mm256_blend_ps(exp, _mm256_permute_ps(exp, 0xB1), 0xAA);  //exp(a)           exp(a)
+
+    auto sin_cos = Sleef_sincosf8_u10(values);                       //[sin(a), cos(a)] [sin(b), cos(b)]
+    auto cos_sin = _mm256_blend_ps(sin_cos.y, sin_cos.x, 0xAA);      //cos(b)           sin(b)
+    return _mm256_mul_ps(exp, cos_sin);
   }
   Vec256<std::complex<float>> expm1() const {
     AT_ERROR("not supported for complex numbers");

aten/src/ATen/native/Fill.cpp

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ Tensor& fill_out(Tensor& self, Scalar value) {
   // Ideally this fast pass should be implemented in TensorIterator,
   // but we also want to skip compute_types which in not avoidable
   // in TensorIterator for now.
-  if (self.device() == at::kCPU && self.numel() == 1 && !value.isComplex()) {
+  if (self.device() == at::kCPU && self.numel() == 1 && !self.is_complex() && !value.isComplex()) {
     AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, kBFloat16, self.scalar_type(), "fill_out", [&]() {
       fill_fast<scalar_t>(self, value);});
     return self;

aten/src/ATen/native/LinearAlgebra.cpp

Lines changed: 10 additions & 2 deletions
@@ -513,7 +513,11 @@ Tensor frobenius_norm(const Tensor& self, IntArrayRef dim, bool keepdim) {
   if (dim.size() == 1) {
     return at::norm(self, 2, dim, keepdim, self.scalar_type());
   }
-  return at::sqrt(at::sum((self.conj() * self).real(), dim, keepdim));
+  if (self.is_complex()){
+    return at::sqrt(at::sum((self.conj() * self).real(), dim, keepdim));
+  } else {
+    return at::sqrt(at::sum((self * self), dim, keepdim));
+  }
 }

 Tensor &frobenius_norm_out(
@@ -529,7 +533,11 @@ Tensor &frobenius_norm_out(
   if (dim.size() == 1) {
     return at::norm_out(result, self, 2, dim, keepdim, self.scalar_type());
   }
-  return at::sqrt_out(result, at::sum((self.conj() * self).real(), dim, keepdim));
+  if (self.is_complex()){
+    return at::sqrt_out(result, at::sum((self.conj() * self).real(), dim, keepdim));
+  } else {
+    return at::sqrt_out(result, at::sum((self * self), dim, keepdim));
+  }
 }

 Tensor nuclear_norm(const Tensor& self, bool keepdim) {
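The speedup comes from skipping conj() and real() when the input is real: for real x, conj(x) * x is just x * x and the product is already real, so the Frobenius norm reduces to sqrt(sum(x^2)). A small standalone sketch of the real-dtype path (illustration only, not ATen code):

#include <cmath>
#include <numeric>
#include <vector>

// Frobenius norm of a real tensor, flattened: the square root of the sum of
// squares. No conj()/real() round trips are needed for real inputs.
double frobenius_norm_real(const std::vector<double>& x) {
  const double sum_sq = std::accumulate(
      x.begin(), x.end(), 0.0,
      [](double acc, double v) { return acc + v * v; });
  return std::sqrt(sum_sq);
}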

aten/src/ATen/native/ReduceOps.cpp

Lines changed: 42 additions & 27 deletions
@@ -488,9 +488,10 @@ Tensor& logsumexp_out(Tensor& result, const Tensor& self, DimnameList dims, bool
 static Tensor& norm_out(Tensor &result, const Tensor &self, optional<Scalar> opt_p,
                         IntArrayRef dim, bool keepdim, optional<ScalarType> opt_dtype) {
   auto p = opt_p.value_or(2.0);
-  TORCH_CHECK(self.options().backend() == Backend::CPU || self.options().backend() == Backend::CUDA,
-              "norm only supports CPU AND CUDA backend, got: ", toString(self.options().backend()));
-
+  TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA,
+              "norm only supports CPU AND CUDA device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided,
+              "norm only supports strided layout, got: ", self.layout());

   ScalarType scalarType = opt_dtype.has_value() ? opt_dtype.value() : self.scalar_type();
   TORCH_CHECK(
513514
if (self.is_sparse()) {
514515
return at::native_norm(self, p);
515516
} else {
516-
TORCH_CHECK(self.options().backend() == Backend::CPU || self.options().backend() == Backend::CUDA,
517-
"norm only supports CPU AND CUDA backend, got: ", toString(self.options().backend()));
517+
TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA,
518+
"norm only supports CPU AND CUDA device type, got: ", self.device().type());
519+
TORCH_CHECK(self.layout() == Layout::Strided,
520+
"norm only supports strided layout, got: ", self.layout());
518521
TORCH_CHECK(at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()),
519522
"norm only supports floating-point dtypes");
520523

@@ -565,9 +568,10 @@ inline Tensor & _all(Tensor & result, TensorIterator & iter) {
 }

 Tensor all(const Tensor& self) {
-  TORCH_CHECK(self.options().backend() == Backend::CPU ||
-              self.options().backend() == Backend::CUDA, "all only supports CPU AND CUDA "
-              "backend, got: ", toString(self.options().backend()));
+  TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA,
+              "all only supports CPU AND CUDA device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided,
+              "all only supports strided layout, got: ", self.layout());
   TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte || self.scalar_type() == at::ScalarType::Bool,
               "all only supports torch.uint8 and torch.bool dtypes");

@@ -583,9 +587,10 @@ Tensor all(const Tensor& self, int64_t dim, bool keepdim) {
 }

 Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) {
-  TORCH_CHECK(self.options().backend() == Backend::CPU ||
-              self.options().backend() == Backend::CUDA, "all only supports CPU AND CUDA "
-              "backend, got: ", toString(self.options().backend()));
+  TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA,
+              "all only supports CPU AND CUDA device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided,
+              "all only supports strided layout, got: ", self.layout());
   TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte || self.scalar_type() == at::ScalarType::Bool,
               "all only supports torch.uint8 and torch.bool dtypes");
   dim = maybe_wrap_dim(dim, self.dim());
@@ -609,11 +614,10 @@ inline Tensor & _any(Tensor & result, TensorIterator & iter) {
 }

 Tensor any(const Tensor& self) {
-  TORCH_CHECK(self.options().backend() == Backend::CPU ||
-              self.options().backend() == Backend::CUDA ||
-              self.options().backend() == Backend::SparseCPU ||
-              self.options().backend() == Backend::SparseCUDA, "any only supports CPU, CUDA, "
-              "SparseCPU and SparseCUDA backend, got: ", toString(self.options().backend()));
+  TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA,
+              "any only supports CPU AND CUDA device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided || self.layout() == Layout::Sparse,
+              "any only supports strided AND sparse layout, got: ", self.layout());
   TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte || self.scalar_type() == at::ScalarType::Bool,
               "all only supports torch.uint8 and torch.bool dtypes");

@@ -629,9 +633,10 @@ Tensor any(const Tensor& self, int64_t dim, bool keepdim) {
 }

 Tensor &any_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) {
-  TORCH_CHECK(self.options().backend() == Backend::CPU ||
-              self.options().backend() == Backend::CUDA, "any only supports CPU AND CUDA "
-              "backend, got: ", toString(self.options().backend()));
+  TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA,
+              "any only supports CPU AND CUDA device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided,
+              "any only supports strided layout, got: ", self.layout());
   TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte || self.scalar_type() == at::ScalarType::Bool,
               "all only supports torch.uint8 and torch.bool dtypes");
   dim = maybe_wrap_dim(dim, self.dim());
@@ -730,8 +735,10 @@ Tensor argmin(const Tensor& self, c10::optional<int64_t> dim, bool keepdims) {
 }

 static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim, bool take_sqrt) {
-  TORCH_CHECK(self.options().backend() == Backend::CPU || self.options().backend() == Backend::CUDA,
-              "std and var only support CPU AND CUDA backend, got: ", toString(self.options().backend()));
+  TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA,
+              "std and var only supports CPU AND CUDA device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided,
+              "std and var only supports strided layout, got: ", self.layout());
   TORCH_CHECK(at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()),
               "std and var only support floating-point dtypes");

@@ -769,8 +776,12 @@ static Tensor &std_var_out(Tensor &result, const Tensor &self, IntArrayRef dim,

 static std::tuple<Tensor&,Tensor&> std_var_mean_out(const char* fname, Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, bool unbiased, bool keepdim, bool take_sqrt) {
   AT_ASSERT(result1.defined() && result2.defined());
-  TORCH_CHECK(self.options().backend() == Backend::CPU || self.options().backend() == Backend::CUDA, fname, " only support CPU AND CUDA backend, got: ", toString(self.options().backend()));
-  TORCH_CHECK(at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()), fname, " only support floating-point dtypes");
+  TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA,
+              fname, " only supports CPU AND CUDA device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided,
+              fname, " only supports strided layout, got: ", self.layout());
+  TORCH_CHECK(at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()),
+              fname, " only support floating-point dtypes");
   TORCH_CHECK(result1.scalar_type() == result2.scalar_type(),
               "provided by result1 dtype must match dtype of result2. Got ",
               toString(result1.scalar_type()),
@@ -856,8 +867,10 @@ std::tuple<Tensor,Tensor> var_mean(const Tensor& self, bool unbiased) {
 }

 Tensor var(const Tensor& self, bool unbiased) {
-  TORCH_CHECK(self.options().backend() == Backend::CPU || self.options().backend() == Backend::CUDA,
-              "var only supports CPU AND CUDA backend, got: ", toString(self.options().backend()));
+  TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA,
+              "var only supports CPU AND CUDA device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided,
+              "var only supports strided layout, got: ", self.layout());
   TORCH_CHECK(at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()),
               "var only supports floating-point dtypes");
   auto trivial_return = _allreduce_return_trivial(self, std::numeric_limits<double>::quiet_NaN());
@@ -874,8 +887,10 @@ Tensor &var_out(Tensor &result, const Tensor &self, IntArrayRef dim, bool unbias
 }

 Tensor std(const Tensor& self, bool unbiased) {
-  TORCH_CHECK(self.options().backend() == Backend::CPU || self.options().backend() == Backend::CUDA,
-              "std only supports CPU AND CUDA backend, got: ", toString(self.options().backend()));
+  TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA,
+              "std only supports CPU AND CUDA device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided,
+              "std only supports strided layout, got: ", self.layout());
   TORCH_CHECK(at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()),
               "std only supports floating-point dtypes");
   auto trivial_return = _allreduce_return_trivial(self, std::numeric_limits<double>::quiet_NaN());
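The same replacement recurs throughout this file (and in TensorCompare.cpp and UnaryOps.cpp below): comparing Backend enums would reject the out-of-tree ComplexCPU Type ID, whereas checking device type and layout separately admits it while still excluding unsupported devices and sparse layouts. A hypothetical helper capturing the pattern (the commit inlines the checks instead; the helper name is invented for illustration):

// Accept any tensor on a CPU/CUDA device with strided layout, regardless of
// which backend Type ID it carries (e.g. the out-of-tree ComplexCPU).
static void check_cpu_cuda_strided(const char* name, const at::Tensor& t) {
  TORCH_CHECK(t.device().type() == at::DeviceType::CPU ||
                  t.device().type() == at::DeviceType::CUDA,
              name, " only supports CPU AND CUDA device type, got: ",
              t.device().type());
  TORCH_CHECK(t.layout() == at::Layout::Strided,
              name, " only supports strided layout, got: ", t.layout());
}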

aten/src/ATen/native/TensorCompare.cpp

Lines changed: 12 additions & 6 deletions
@@ -159,8 +159,10 @@ std::tuple<Tensor, Tensor> mode(const Tensor& self, int64_t dim, bool keepdim) {

 std::tuple<Tensor &,Tensor &> mode_out(Tensor& values, Tensor& indices,
                                        const Tensor& self, int64_t dim, bool keepdim) {
-  TORCH_CHECK(self.options().backend() == Backend::CPU || self.options().backend() == Backend::CUDA,
-              "mode only supports CPU AND CUDA backend, got: ", toString(self.options().backend()));
+  TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA,
+              "mode only supports CPU AND CUDA device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided,
+              "mode only supports strided layout, got: ", self.layout());
   dim = maybe_wrap_dim(dim, self.dim());
   if (_dimreduce_return_trivial_no_ident(values, self, dim, keepdim, "mode")) {
     AT_ASSERT(values.dim() == 0);
@@ -207,8 +209,10 @@ std::tuple<Tensor, Tensor> max(const Tensor& self, int64_t dim, bool keepdim) {

 static std::tuple<Tensor &,Tensor &> max_out_impl(Tensor& max, Tensor& max_indices,
                                                   const Tensor& self, int64_t dim, bool keepdim) {
-  TORCH_CHECK(self.options().backend() == Backend::CPU || self.options().backend() == Backend::CUDA,
-              "max only supports CPU AND CUDA backend, got: ", toString(self.options().backend()));
+  TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA,
+              "max only supports CPU AND CUDA device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided,
+              "max only supports strided layout, got: ", self.layout());
   dim = maybe_wrap_dim(dim, self.dim());
   if (_dimreduce_return_trivial_no_ident(max, self, dim, keepdim, "max")) {
     AT_ASSERT(max.dim() == 0);
@@ -263,8 +267,10 @@ std::tuple<Tensor, Tensor> min(const Tensor& self, int64_t dim, bool keepdim) {

 static std::tuple<Tensor &,Tensor &> min_out_impl(Tensor& min, Tensor& min_indices,
                                                   const Tensor& self, int64_t dim, bool keepdim) {
-  TORCH_CHECK(self.options().backend() == Backend::CPU || self.options().backend() == Backend::CUDA,
-              "min only supports CPU AND CUDA backend, got: ", toString(self.options().backend()));
+  TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA,
+              "min only supports CPU AND CUDA device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided,
+              "min only supports strided layout, got: ", self.layout());
   dim = maybe_wrap_dim(dim, self.dim());
   if (_dimreduce_return_trivial_no_ident(min, self, dim, keepdim, "min")) {
     AT_ASSERT(min.dim() == 0);

aten/src/ATen/native/UnaryOps.cpp

Lines changed: 12 additions & 3 deletions
@@ -229,7 +229,10 @@ Tensor& _clamp_out_cpu(
     optional<Scalar> min,
     optional<Scalar> max) {
   if (min && max) {
-    checkBackend("clamp", result, Backend::CPU);
+    TORCH_CHECK(self.device().type() == DeviceType::CPU,
+                "clamp only supports CPU device type, got: ", self.device().type());
+    TORCH_CHECK(self.layout() == Layout::Strided,
+                "clamp only supports strided layout, got: ", self.layout());
     auto iter = TensorIterator::unary_op(result, self,
                                          /*check_mem_overlap=*/true);
     clamp_stub(iter.device_type(), iter, *min, *max);
@@ -248,7 +251,10 @@ Tensor& _clamp_max__cpu(Tensor& self, Scalar max) {
 }

 Tensor& _clamp_max_out_cpu(Tensor& result, const Tensor& self, Scalar max) {
-  checkBackend("clamp_max", result, Backend::CPU);
+  TORCH_CHECK(self.device().type() == DeviceType::CPU,
+              "clamp_max only supports CPU device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided,
+              "clamp_max only supports strided layout, got: ", self.layout());
   auto iter = TensorIterator::unary_op(result, self,
                                        /*check_mem_overlap=*/true);
   clamp_max_stub(iter.device_type(), iter, max);
@@ -260,7 +266,10 @@ Tensor& _clamp_min__cpu(Tensor& self, Scalar min) {
 }

 Tensor& _clamp_min_out_cpu(Tensor& result, const Tensor& self, Scalar min) {
-  checkBackend("clamp_min", result, Backend::CPU);
+  TORCH_CHECK(self.device().type() == DeviceType::CPU,
+              "clamp_min only supports CPU device type, got: ", self.device().type());
+  TORCH_CHECK(self.layout() == Layout::Strided,
+              "clamp_min only supports strided layout, got: ", self.layout());
   auto iter = TensorIterator::unary_op(result, self,
                                        /*check_mem_overlap=*/true);
   clamp_min_stub(iter.device_type(), iter, min);

torch/csrc/utils/tensor_numpy.cpp

Lines changed: 8 additions & 11 deletions
@@ -74,18 +74,15 @@ static std::vector<int64_t> seq_to_aten_shape(PyObject *py_seq) {
 }

 PyObject* tensor_to_numpy(const at::Tensor& tensor) {
-  if (tensor.is_cuda()) {
-    throw TypeError(
-        "can't convert CUDA tensor to numpy. Use Tensor.cpu() to "
-        "copy the tensor to host memory first.");
+  if (tensor.device().type() != DeviceType::CPU) {
+    throw TypeError(
+        "can't convert %s device type tensor to numpy. Use Tensor.cpu() to "
+        "copy the tensor to host memory first.", tensor.device().type());
   }
-  if (tensor.is_sparse()) {
-    throw TypeError(
-        "can't convert sparse tensor to numpy. Use Tensor.to_dense() to "
-        "convert to a dense tensor first.");
-  }
-  if (tensor.options().backend() != Backend::CPU) {
-    throw TypeError("NumPy conversion for %s is not supported", tensor.toString().c_str());
+  if (tensor.layout() != Layout::Strided) {
+    throw TypeError(
+        "can't convert %s layout tensor to numpy."
+        "convert the tensor to a strided layout first.", tensor.layout());
   }
   if (tensor.requires_grad()) {
     throw std::runtime_error(
