Skip to content

Commit c62bb39

Browse files
committed
perf(dipu): faster aten::mul in cuda & muxi (DeepLink-org#855)
* Faster aten::mul in CUDA and MUXI * Improve the code format * Loosen the device check for scalar tensors * Improve the code * Let the logic of mul differ across devices * Update autogen_diopi_wrapper.py * Update diopi_functions.yaml * Update OpUtils.hpp
1 parent 8fd3ec8 commit c62bb39

File tree

5 files changed

+27
-8
lines changed

5 files changed

+27
-8
lines changed

dipu/scripts/autogen_diopi_wrapper/autogen_diopi_wrapper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -650,10 +650,10 @@ def create_device_check_code(fun_config):
650650

651651
for args in set(tensors):
652652
if not args.endswith("?"):
653-
code += f' TORCH_CHECK(({args}.defined() == false) || ({args}.device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, ": {op_name}: {args} should be on dipu");\n'
653+
code += f' TORCH_CHECK(({args}.defined() == false) || ({args}.device().type() == dipu::DIPU_DEVICE_TYPE || ignore_device_check({args})), __FILE__, ":", __LINE__, ": {op_name}: {args} should be on dipu");\n'
654654
else:
655655
args = args[0:-1]
656-
code += f' TORCH_CHECK(({args}.has_value() == false) || ({args}.value().defined() == false) || ({args}.value().device().type() == dipu::DIPU_DEVICE_TYPE), __FILE__, ":", __LINE__, "{op_name}: {args} should be on dipu");\n'
656+
code += f' TORCH_CHECK(({args}.has_value() == false) || ({args}.value().defined() == false) || ({args}.value().device().type() == dipu::DIPU_DEVICE_TYPE || ignore_device_check({args})), __FILE__, ":", __LINE__, "{op_name}: {args} should be on dipu");\n'
657657

658658
if len(tensors) > 0:
659659
code += "}"

dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,13 +159,23 @@
159159
interface: diopiMulScalar(ctx, out, self, other)
160160

161161
- schema: "mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)"
162+
device: [cuda, muxi]
163+
interface: diopiMulInp(ctx, self, other)
164+
165+
- schema: "mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)"
166+
device: [-cuda, -muxi, all]
162167
custom_code_at_the_beginning: |
163168
if (is_scalar_on_cpu(other)) {
164169
return dipu_mul__scalar(self, other.item());
165170
}
166171
interface: diopiMulInp(ctx, self, other)
167172

168173
- schema: "mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"
174+
device: [cuda, muxi]
175+
interface: diopiMul(ctx, out, self, other)
176+
177+
- schema: "mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"
178+
device: [-cuda, -muxi, all]
169179
custom_code_at_the_beginning: |
170180
// if (is_scalar_on_cpu(other)) {
171181
// Pytorch 2.0 has a bug, causing for_each mul passing a cpu scalar tensor. Fixed in PyTorch 2.1

dipu/torch_dipu/csrc_dipu/aten/ops/DIPUOpInferrer.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,14 +79,12 @@ void OpInferrerMeta::add_input(const at::Tensor& tensor) {
7979
inputs_.push_back(c10::MaybeOwned<at::Tensor>::borrowed(tensor));
8080
}
8181

82-
at::Tensor OpInferrerMeta::malloc_output() {
82+
inline at::Tensor OpInferrerMeta::malloc_output() {
8383
at::TensorOptions options = at::TensorOptions().dtype(dtype_).device(device_);
84-
auto out = native::nodispatch::empty(shape_, options, memory_format_);
85-
8684
if (!strides_.empty()) {
87-
out.as_strided_(shape_, strides_);
85+
return native::nodispatch::empty_strided(shape_, strides_, options);
8886
}
89-
return out;
87+
return native::nodispatch::empty(shape_, options, memory_format_);
9088
}
9189

9290
void OpInferrer::compute_dtype() {

dipu/torch_dipu/csrc_dipu/aten/ops/DIPUOpInferrer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class OpInferrerMeta {
4040
size_t ntensors() const { return inputs_.size(); }
4141

4242
// Allocates the output based on the inferred attributes, use strides_ if set
43-
at::Tensor malloc_output();
43+
inline at::Tensor malloc_output();
4444

4545
c10::SmallVector<c10::MaybeOwned<at::Tensor>, 4> inputs_;
4646
c10::DimVector shape_;

dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,5 +254,16 @@ inline bool is_scalar_on_cpu(const at::Tensor& t) {
254254
return t.defined() && t.is_cpu() && t.numel() == 1;
255255
}
256256

257+
// This function is used to check if tensor is a scalar tensor by any means.
258+
inline bool is_scalar_tensor(const c10::optional<at::Tensor>& t) {
259+
return t.has_value() && ((*t).unsafeGetTensorImpl()->is_wrapped_number() ||
260+
((*t).is_cpu() && (*t).numel() == 1));
261+
}
262+
263+
inline bool ignore_device_check(const c10::optional<at::Tensor>& t) {
264+
return (kDipuVendorDeviceType == devapis::VendorDeviceType::CUDA ||
265+
kDipuVendorDeviceType == devapis::VendorDeviceType::MUXI) &&
266+
is_scalar_tensor(t);
267+
}
257268
} // namespace native
258269
} // namespace dipu

0 commit comments

Comments
 (0)