Skip to content

Commit c62bb39

Browse files
committed
perf(dipu): faster aten::mul in cuda & muxi (DeepLink-org#855)
* Faster aten::mul in CUDA and MUXI * Improve the code format * Loosen the device check for scalar tensors * Improve the code * Let the logic of mul differ across devices * Update autogen_diopi_wrapper.py * Update diopi_functions.yaml * Update OpUtils.hpp
1 parent 8fd3ec8 commit c62bb39

File tree

5 files changed

+27
-8
lines changed

5 files changed

+27
-8
lines changed

dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -650,10 +650,10 @@ def create_device_check_code(fun_config):
650650

651651
for args in set(tensors):
652652
if not args.endswith("?"):
653-
code += f' TORCH_CHECK(({args}.defined() == false) || ({args}.device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, ": {op_name}: {args} should be on dipu");\n'
653+
code += f' TORCH_CHECK(({args}.defined() == false) || ({args}.device().type() == dipu::DIPU_DEVICE_TYPE || ignore_device_check({args})), __FILE__, ":", __LINE__, ": {op_name}: {args} should be on dipu");\n'
654654
else:
655655
args = args[0:-1]
656-
code += f' TORCH_CHECK(({args}.has_value() == false) || ({args}.value().defined() == false) || ({args}.value().device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, "{op_name}: {args} should be on dipu");\n'
656+
code += f' TORCH_CHECK(({args}.has_value() == false) || ({args}.value().defined() == false) || ({args}.value().device().type() == dipu::DIPU_DEVICE_TYPE || ignore_device_check({args})), __FILE__, ":", __LINE__, "{op_name}: {args} should be on dipu");\n'
657657

658658
if len(tensors) > 0:
659659
code += "}"

dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,13 +159,23 @@
159159
interface: diopiMulScalar(ctx, out, self, other)
160160

161161
- schema: "mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)"
162+
device: [cuda, muxi]
163+
interface: diopiMulInp(ctx, self, other)
164+
165+
- schema: "mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)"
166+
device: [-cuda, -muxi, all]
162167
custom_code_at_the_beginning: |
163168
if (is_scalar_on_cpu(other)) {
164169
return dipu_mul__scalar(self, other.item());
165170
}
166171
interface: diopiMulInp(ctx, self, other)
167172

168173
- schema: "mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"
174+
device: [cuda, muxi]
175+
interface: diopiMul(ctx, out, self, other)
176+
177+
- schema: "mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"
178+
device: [-cuda, -muxi, all]
169179
custom_code_at_the_beginning: |
170180
// if (is_scalar_on_cpu(other)) {
171181
// Pytorch 2.0 has a bug, causing for_each mul passing a cpu scalar tensor. Fixed in PyTorch 2.1

dipu/torch_dipu/csrc_dipu/aten/ops/DIPUOpInferrer.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,14 +79,12 @@ void OpInferrerMeta::add_input(const at::Tensor& tensor) {
7979
inputs_.push_back(c10::MaybeOwned<at::Tensor>::borrowed(tensor));
8080
}
8181

82-
at::Tensor OpInferrerMeta::malloc_output() {
82+
inline at::Tensor OpInferrerMeta::malloc_output() {
8383
at::TensorOptions options = at::TensorOptions().dtype(dtype_).device(device_);
84-
auto out = native::nodispatch::empty(shape_, options, memory_format_);
85-
8684
if (!strides_.empty()) {
87-
out.as_strided_(shape_, strides_);
85+
return native::nodispatch::empty_strided(shape_, strides_, options);
8886
}
89-
return out;
87+
return native::nodispatch::empty(shape_, options, memory_format_);
9088
}
9189

9290
void OpInferrer::compute_dtype() {

dipu/torch_dipu/csrc_dipu/aten/ops/DIPUOpInferrer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class OpInferrerMeta {
4040
size_t ntensors() const { return inputs_.size(); }
4141

4242
// Allocates the output based on the inferred attributes, use strides_ if set
43-
at::Tensor malloc_output();
43+
inline at::Tensor malloc_output();
4444

4545
c10::SmallVector<c10::MaybeOwned<at::Tensor>, 4> inputs_;
4646
c10::DimVector shape_;

dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,5 +254,16 @@ inline bool is_scalar_on_cpu(const at::Tensor& t) {
254254
return t.defined() && t.is_cpu() && t.numel() == 1;
255255
}
256256

257+
// This function is used to check if tensor is a scalar tensor by any means.
258+
inline bool is_scalar_tensor(const c10::optional<at::Tensor>& t) {
259+
return t.has_value() && ((*t).unsafeGetTensorImpl()->is_wrapped_number() ||
260+
((*t).is_cpu() && (*t).numel() == 1));
261+
}
262+
263+
inline bool ignore_device_check(const c10::optional<at::Tensor>& t) {
264+
return (kDipuVendorDeviceType == devapis::VendorDeviceType::CUDA ||
265+
kDipuVendorDeviceType == devapis::VendorDeviceType::MUXI) &&
266+
is_scalar_tensor(t);
267+
}
257268
} // namespace native
258269
} // namespace dipu

0 commit comments

Comments
 (0)