From b889dc4af8107bc45cd6dae2234a5397bf830977 Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Wed, 17 Jan 2024 16:22:01 -0600 Subject: [PATCH 01/11] dot_func --- dpnp/backend/extensions/blas/CMakeLists.txt | 4 +- dpnp/backend/extensions/blas/blas_py.cpp | 21 +- dpnp/backend/extensions/blas/dot.cpp | 229 +++++++++++ dpnp/backend/extensions/blas/dot.hpp | 60 +++ dpnp/backend/extensions/blas/dotc.cpp | 232 +++++++++++ dpnp/backend/extensions/blas/gemm.cpp | 2 +- dpnp/backend/extensions/blas/gemm.hpp | 2 +- dpnp/backend/extensions/blas/gemm_batch.cpp | 2 +- dpnp/backend/extensions/blas/types_matrix.hpp | 45 ++- dpnp/backend/kernels/dpnp_krnl_common.cpp | 1 + dpnp/dpnp_algo/dpnp_algo.pxd | 7 - dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi | 100 ----- dpnp/dpnp_array.py | 25 +- dpnp/dpnp_iface_linearalgebra.py | 122 +++--- dpnp/dpnp_iface_manipulation.py | 2 +- dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 166 ++++++-- tests/skipped_tests.tbl | 3 +- tests/skipped_tests_gpu.tbl | 6 +- tests/skipped_tests_gpu_no_fp64.tbl | 85 ---- tests/test_dot.py | 363 +++++++++++++++++- tests/test_mathematical.py | 7 +- tests/test_sycl_queue.py | 8 +- tests/test_usm_type.py | 4 +- .../cupy/linalg_tests/test_product.py | 28 +- 24 files changed, 1182 insertions(+), 342 deletions(-) create mode 100644 dpnp/backend/extensions/blas/dot.cpp create mode 100644 dpnp/backend/extensions/blas/dot.hpp create mode 100644 dpnp/backend/extensions/blas/dotc.cpp diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt index d19f60c9792b..e85c8fd0aca5 100644 --- a/dpnp/backend/extensions/blas/CMakeLists.txt +++ b/dpnp/backend/extensions/blas/CMakeLists.txt @@ -1,5 +1,5 @@ # ***************************************************************************** -# Copyright (c) 2016-2023, Intel Corporation +# Copyright (c) 2024, Intel Corporation # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,6 +27,8 @@ set(python_module_name _blas_impl) set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/blas_py.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dotc.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gemm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gemm_batch.cpp ) diff --git a/dpnp/backend/extensions/blas/blas_py.cpp b/dpnp/backend/extensions/blas/blas_py.cpp index 524f16fcc7df..cb7caa9e2d22 100644 --- a/dpnp/backend/extensions/blas/blas_py.cpp +++ b/dpnp/backend/extensions/blas/blas_py.cpp @@ -1,5 +1,5 @@ //***************************************************************************** -// Copyright (c) 2023, Intel Corporation +// Copyright (c) 2024, Intel Corporation // All rights reserved. 
//
// Redistribution and use in source and binary forms, with or without
@@ -30,6 +30,7 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include "dot.hpp"
 #include "gemm.hpp"
 
 namespace blas_ext = dpnp::backend::ext::blas;
@@ -38,6 +39,8 @@ namespace py = pybind11;
 // populate dispatch tables
 void init_dispatch_tables(void)
 {
+    blas_ext::init_dot_dispatch_table();
+    blas_ext::init_dotc_dispatch_table();
     blas_ext::init_gemm_batch_dispatch_table();
     blas_ext::init_gemm_dispatch_table();
 }
@@ -46,6 +49,22 @@ PYBIND11_MODULE(_blas_impl, m)
 {
     init_dispatch_tables();
 
+    {
+        m.def("_dotc", &blas_ext::dotc,
+              "Call `dotc` from OneMKL BLAS library to return "
+              "the dot product of two complex-valued vectors, "
+              "conjugating the first vector.",
+              py::arg("sycl_queue"), py::arg("vectorA"), py::arg("vectorB"),
+              py::arg("result"), py::arg("depends") = py::list());
+    }
+
+    {
+        m.def("_dot", &blas_ext::dot,
+              "Call `dot` from OneMKL BLAS library to return "
+              "the dot product of two real-valued vectors.",
+              py::arg("sycl_queue"), py::arg("vectorA"), py::arg("vectorB"),
+              py::arg("result"), py::arg("depends") = py::list());
+    }
+
     {
         m.def("_gemm", &blas_ext::gemm,
               "Call `gemm` from OneMKL BLAS library to return "
diff --git a/dpnp/backend/extensions/blas/dot.cpp b/dpnp/backend/extensions/blas/dot.cpp
new file mode 100644
index 000000000000..44dbfe949017
--- /dev/null
+++ b/dpnp/backend/extensions/blas/dot.cpp
@@ -0,0 +1,229 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "dot.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace blas
+{
+namespace mkl_blas = oneapi::mkl::blas;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*dot_impl_fn_ptr_t)(sycl::queue,
+                                         const std::int64_t,
+                                         char *,
+                                         const std::int64_t,
+                                         char *,
+                                         const std::int64_t,
+                                         char *,
+                                         const std::vector<sycl::event> &);
+
+static dot_impl_fn_ptr_t dot_dispatch_table[dpctl_td_ns::num_types]
+                                           [dpctl_td_ns::num_types];
+
+template <typename Tab, typename Tc>
+static sycl::event dot_impl(sycl::queue exec_q,
+                            const std::int64_t n,
+                            char *vectorA,
+                            const std::int64_t stride_a,
+                            char *vectorB,
+                            const std::int64_t stride_b,
+                            char *result,
+                            const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<Tab>(exec_q);
+    type_utils::validate_type_for_device<Tc>(exec_q);
+
+    Tab *a = reinterpret_cast<Tab *>(vectorA);
+    Tab *b = reinterpret_cast<Tab *>(vectorB);
+    Tc *res = reinterpret_cast<Tc *>(result);
+
+    std::stringstream error_msg;
+    bool is_exception_caught = false;
+
+    sycl::event dot_event;
+    try {
+        dot_event = mkl_blas::row_major::dot(exec_q,
+                                             n, // size of the input vectors
+                                             a, // pointer to vector a
+                                             stride_a, // stride of vector a
+                                             b, // pointer to vector b
+                                             stride_b, // stride of vector b
+                                             res, // pointer to result
+                                             depends);
+    } catch (oneapi::mkl::exception const &e) {
+        error_msg
+            << "Unexpected MKL exception caught during dot() call:\nreason: "
+            << e.what();
+        is_exception_caught = true;
+    } catch (sycl::exception const &e) {
+        error_msg << "Unexpected SYCL exception caught during dot() call:\n"
+                  << e.what();
+        is_exception_caught = true;
+    }
+
+    if (is_exception_caught) // an unexpected error occurred
+    {
+        throw std::runtime_error(error_msg.str());
+    }
+
+    return dot_event;
+}
+
+std::pair<sycl::event, sycl::event>
+    dot(sycl::queue exec_q,
+        dpctl::tensor::usm_ndarray vectorA,
+        dpctl::tensor::usm_ndarray vectorB,
+        dpctl::tensor::usm_ndarray result,
+        const std::vector<sycl::event> &depends)
+{
+    const int vectorA_nd = vectorA.get_ndim();
+    const int vectorB_nd = vectorB.get_ndim();
+    const int result_nd = result.get_ndim();
+
+    if ((vectorA_nd != 1)) {
+        throw py::value_error(
+            "The first input array has ndim=" + std::to_string(vectorA_nd) +
+            ", but a 1-dimensional array is expected.");
+    }
+
+    if ((vectorB_nd != 1)) {
+        throw py::value_error(
+            "The second input array has ndim=" + std::to_string(vectorB_nd) +
+            ", but a 1-dimensional array is expected.");
+    }
+
+    if ((result_nd != 0)) {
+        throw py::value_error(
+            "The output array has ndim=" + std::to_string(result_nd) +
+            ", but a 0-dimensional array is expected.");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(vectorA, result)) {
+        throw py::value_error(
+            "The first input array and output array are overlapping "
+            "segments of memory");
+    }
+    if (overlap(vectorB, result)) {
+        throw py::value_error(
+            "The second input array and output array are overlapping "
+            "segments of memory");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(
+            exec_q,
+            {vectorA.get_queue(), vectorB.get_queue(), result.get_queue()}))
+    {
+        throw py::value_error(
+            "USM allocations are not compatible with the execution queue.");
+    }
+
+    py::ssize_t a_size = vectorA.get_size();
+    py::ssize_t b_size = vectorB.get_size();
+    if (a_size != b_size) {
+        throw py::value_error("The size of the first input array must be "
+                              "equal to the size of the second input array.");
+    }
+
+    std::vector<py::ssize_t> a_stride = vectorA.get_strides_vector();
+    std::vector<py::ssize_t> b_stride = vectorB.get_strides_vector();
+
+    const std::int64_t n = a_size;
+    const std::int64_t str_a = a_stride[0];
+    const std::int64_t str_b = b_stride[0];
+
+    int vectorA_typenum = vectorA.get_typenum();
+    int vectorB_typenum = vectorB.get_typenum();
+    int result_typenum = result.get_typenum();
+
+    if (vectorA_typenum != vectorB_typenum) {
+        throw py::value_error("vectorA and vectorB must be of the same type.");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int vectorAB_type_id = array_types.typenum_to_lookup_id(vectorA_typenum);
+    int result_type_id = array_types.typenum_to_lookup_id(result_typenum);
+
+    dot_impl_fn_ptr_t dot_fn =
+        dot_dispatch_table[vectorAB_type_id][result_type_id];
+    if (dot_fn == nullptr) {
+        throw py::value_error(
+            "Types of input vectors and result array are mismatched.");
+    }
+
+    char *a_typeless_ptr = vectorA.get_data();
+    char *b_typeless_ptr = vectorB.get_data();
+    char *r_typeless_ptr = result.get_data();
+
+    sycl::event dot_ev = dot_fn(exec_q, n, a_typeless_ptr, str_a,
+                                b_typeless_ptr, str_b, r_typeless_ptr, depends);
+
+    sycl::event args_ev = dpctl::utils::keep_args_alive(
+        exec_q, {vectorA, vectorB, result}, {dot_ev});
+
+    return std::make_pair(args_ev, dot_ev);
+}
+
+template <typename fnT, typename Tab, typename Tc>
+struct DotContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::DotTypePairSupportFactory<Tab, Tc>::is_defined) {
+            return dot_impl<Tab, Tc>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_dot_dispatch_table(void)
+{
+    dpctl_td_ns::DispatchTableBuilder<dot_impl_fn_ptr_t, DotContigFactory,
+                                      dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_table(dot_dispatch_table);
+}
+} // namespace blas
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/blas/dot.hpp b/dpnp/backend/extensions/blas/dot.hpp
new file mode 100644
index 000000000000..806de3997b1f
--- /dev/null
+++ b/dpnp/backend/extensions/blas/dot.hpp
@@ -0,0 +1,60 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+
+#include <dpctl4pybind11.hpp>
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace blas
+{
+extern std::pair<sycl::event, sycl::event>
+    dotc(sycl::queue exec_q,
+         dpctl::tensor::usm_ndarray vectorA,
+         dpctl::tensor::usm_ndarray vectorB,
+         dpctl::tensor::usm_ndarray result,
+         const std::vector<sycl::event> &depends);
+
+extern std::pair<sycl::event, sycl::event>
+    dot(sycl::queue exec_q,
+        dpctl::tensor::usm_ndarray vectorA,
+        dpctl::tensor::usm_ndarray vectorB,
+        dpctl::tensor::usm_ndarray result,
+        const std::vector<sycl::event> &depends);
+
+extern void init_dot_dispatch_table(void);
+extern void init_dotc_dispatch_table(void);
+} // namespace blas
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/blas/dotc.cpp b/dpnp/backend/extensions/blas/dotc.cpp
new file mode 100644
index 000000000000..0f8fd34f171a
--- /dev/null
+++ b/dpnp/backend/extensions/blas/dotc.cpp
@@ -0,0 +1,232 @@
+//*****************************************************************************
+// Copyright (c) 2024, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "dot.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace blas
+{
+namespace mkl_blas = oneapi::mkl::blas;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*dotc_impl_fn_ptr_t)(sycl::queue,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::int64_t,
+                                          char *,
+                                          const std::vector<sycl::event> &);
+
+static dotc_impl_fn_ptr_t dotc_dispatch_table[dpctl_td_ns::num_types]
+                                             [dpctl_td_ns::num_types];
+
+template <typename Tab, typename Tc>
+static sycl::event dotc_impl(sycl::queue exec_q,
+                             const std::int64_t n,
+                             char *vectorA,
+                             const std::int64_t stride_a,
+                             char *vectorB,
+                             const std::int64_t stride_b,
+                             char *result,
+                             const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<Tab>(exec_q);
+    type_utils::validate_type_for_device<Tc>(exec_q);
+
+    Tab *a = reinterpret_cast<Tab *>(vectorA);
+    Tab *b = reinterpret_cast<Tab *>(vectorB);
+    Tc *res = reinterpret_cast<Tc *>(result);
+
+    std::stringstream error_msg;
+    bool is_exception_caught = false;
+
+    sycl::event dotc_event;
+    try {
+        dotc_event = mkl_blas::row_major::dotc(exec_q,
+                                               n, // size of the input vectors
+                                               a, // pointer to vector a
+                                               stride_a, // stride of vector a
+                                               b, // pointer to vector b
+                                               stride_b, // stride of vector b
+                                               res, // pointer to result
+                                               depends);
+    } catch (oneapi::mkl::exception const &e) {
+        error_msg
+            << "Unexpected MKL exception caught during dotc() call:\nreason: "
+            << e.what();
+        is_exception_caught = true;
+    } catch (sycl::exception const &e) {
+        error_msg << "Unexpected SYCL exception caught during dotc() call:\n"
+                  << e.what();
+        is_exception_caught = true;
+    }
+
+    if (is_exception_caught) // an unexpected error occurred
+    {
+        throw std::runtime_error(error_msg.str());
+    }
+
+    return dotc_event;
+}
+
+std::pair<sycl::event, sycl::event>
+    dotc(sycl::queue exec_q,
+         dpctl::tensor::usm_ndarray vectorA,
+         dpctl::tensor::usm_ndarray vectorB,
+         dpctl::tensor::usm_ndarray result,
+         const std::vector<sycl::event> &depends)
+{
+    const int vectorA_nd = vectorA.get_ndim();
+    const int vectorB_nd = vectorB.get_ndim();
+    const int result_nd = result.get_ndim();
+
+    if ((vectorA_nd != 1)) {
+        throw py::value_error(
+            "The first input array has ndim=" + std::to_string(vectorA_nd) +
+            ", but a 1-dimensional array is expected.");
+    }
+
+    if ((vectorB_nd != 1)) {
+        throw py::value_error(
+            "The second input array has ndim=" + std::to_string(vectorB_nd) +
+            ", but a 1-dimensional array is expected.");
+    }
+
+    if ((result_nd != 0)) {
+        throw py::value_error(
+            "The output array has ndim=" + std::to_string(result_nd) +
+            ", but a 0-dimensional array is expected.");
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(vectorA, result)) {
+        throw py::value_error(
+            "The first input array and output array are overlapping "
+            "segments of memory");
+    }
+    if (overlap(vectorB, result)) {
+        throw py::value_error(
+            "The second input array and output array are overlapping "
+            "segments of memory");
+    }
+
+    // check compatibility of execution queue and allocation queue
+    if (!dpctl::utils::queues_are_compatible(
+            exec_q,
+            {vectorA.get_queue(), vectorB.get_queue(), result.get_queue()}))
+    {
+        throw py::value_error(
+            "USM allocations are not compatible with the execution queue.");
+    }
+
+    py::ssize_t a_size = vectorA.get_size();
+    py::ssize_t b_size = vectorB.get_size();
+    if (a_size != b_size) {
+        throw py::value_error("The size of the first input array must be "
+                              "equal to the size of the second input array.");
+    }
+
+    std::vector<py::ssize_t> a_stride = vectorA.get_strides_vector();
+    std::vector<py::ssize_t> b_stride = vectorB.get_strides_vector();
+
+    const std::int64_t n = a_size;
+    const std::int64_t str_a = a_stride[0];
+    const std::int64_t str_b = b_stride[0];
+
+    int vectorA_typenum = vectorA.get_typenum();
+    int vectorB_typenum = vectorB.get_typenum();
+    int result_typenum = result.get_typenum();
+
+    if (vectorA_typenum != vectorB_typenum) {
+        throw py::value_error("Input arrays must be of the same type.");
+    }
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int vectorAB_type_id = array_types.typenum_to_lookup_id(vectorA_typenum);
+    int result_type_id = array_types.typenum_to_lookup_id(result_typenum);
+
+    dotc_impl_fn_ptr_t dotc_fn =
+        dotc_dispatch_table[vectorAB_type_id][result_type_id];
+    if (dotc_fn == nullptr) {
+        throw py::value_error(
+            "Types of input vectors and result array are mismatched.");
+    }
+
+    char *a_typeless_ptr = vectorA.get_data();
+    char *b_typeless_ptr = vectorB.get_data();
+    char *r_typeless_ptr = result.get_data();
+
+    sycl::event dotc_ev =
+        dotc_fn(exec_q, n, a_typeless_ptr, str_a, b_typeless_ptr, str_b,
+                r_typeless_ptr, depends);
+
+    sycl::event args_ev = dpctl::utils::keep_args_alive(
+        exec_q, {vectorA, vectorB, result}, {dotc_ev});
+
+    return std::make_pair(args_ev, dotc_ev);
+}
+
+template <typename fnT, typename Tab, typename Tc>
+struct DotcContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::DotcTypePairSupportFactory<Tab, Tc>::is_defined) {
+            return dotc_impl<Tab, Tc>;
+        }
+        else {
+            return nullptr;
+        }
+    }
+};
+
+void init_dotc_dispatch_table(void)
+{
+    dpctl_td_ns::DispatchTableBuilder<dotc_impl_fn_ptr_t, DotcContigFactory,
+                                      dpctl_td_ns::num_types>
+        contig;
+    contig.populate_dispatch_table(dotc_dispatch_table);
+}
+} // namespace blas
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/blas/gemm.cpp b/dpnp/backend/extensions/blas/gemm.cpp
index 5526ecd3c1b4..a26420f49b3d 100644
--- a/dpnp/backend/extensions/blas/gemm.cpp
+++ b/dpnp/backend/extensions/blas/gemm.cpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2023, Intel Corporation
+// Copyright (c) 2024, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
diff --git a/dpnp/backend/extensions/blas/gemm.hpp b/dpnp/backend/extensions/blas/gemm.hpp
index 25f78b5b8501..3f1ec6e745a3 100644
--- a/dpnp/backend/extensions/blas/gemm.hpp
+++ b/dpnp/backend/extensions/blas/gemm.hpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2023, Intel Corporation
+// Copyright (c) 2024, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
diff --git a/dpnp/backend/extensions/blas/gemm_batch.cpp b/dpnp/backend/extensions/blas/gemm_batch.cpp
index 32f592f6b8a9..9359901edd88 100644
--- a/dpnp/backend/extensions/blas/gemm_batch.cpp
+++ b/dpnp/backend/extensions/blas/gemm_batch.cpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2023, Intel Corporation
+// Copyright (c) 2024, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
diff --git a/dpnp/backend/extensions/blas/types_matrix.hpp b/dpnp/backend/extensions/blas/types_matrix.hpp
index 49154df03c4c..b6c690405c4e 100644
--- a/dpnp/backend/extensions/blas/types_matrix.hpp
+++ b/dpnp/backend/extensions/blas/types_matrix.hpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2023, Intel Corporation
+// Copyright (c) 2024, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -43,6 +43,49 @@ namespace blas
 {
 namespace types
 {
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL BLAS library provides support in oneapi::mkl::blas::dotc
+ * function.
+ *
+ * @tparam Tab Type of arrays containing input vectors A and B.
+ * @tparam Tc Type of array containing output.
+ */
+template <typename Tab, typename Tc>
+struct DotcTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<Tab,
+                                          std::complex<float>,
+                                          Tc,
+                                          std::complex<float>>,
+        dpctl_td_ns::TypePairDefinedEntry<Tab,
+                                          std::complex<double>,
+                                          Tc,
+                                          std::complex<double>>,
+        // fall-through
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
+
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL BLAS library provides support in oneapi::mkl::blas::dot
+ * function.
+ *
+ * @tparam Tab Type of arrays containing input vectors A and B.
+ * @tparam Tc Type of array containing output.
+ */
+template <typename Tab, typename Tc>
+struct DotTypePairSupportFactory
+{
+    static constexpr bool is_defined = std::disjunction<
+        dpctl_td_ns::TypePairDefinedEntry<Tab, float, Tc, float>,
+        dpctl_td_ns::TypePairDefinedEntry<Tab, double, Tc, double>,
+        dpctl_td_ns::TypePairDefinedEntry<Tab, float, Tc, double>,
+        // fall-through
+        dpctl_td_ns::NotDefinedEntry>::is_defined;
+};
+
 /**
  * @brief A factory to define pairs of supported types for which
  * MKL BLAS library provides support in oneapi::mkl::blas::gemm
diff --git a/dpnp/backend/kernels/dpnp_krnl_common.cpp b/dpnp/backend/kernels/dpnp_krnl_common.cpp
index e664c30b848f..04eac54310db 100644
--- a/dpnp/backend/kernels/dpnp_krnl_common.cpp
+++ b/dpnp/backend/kernels/dpnp_krnl_common.cpp
@@ -1040,6 +1040,7 @@ void func_map_init_linalg(func_map_t &fmap)
     fmap[DPNPFuncName::DPNP_FN_DOT][eft_DBL][eft_DBL] = {
         eft_DBL, (void *)dpnp_dot_default_c<double, double, double>};
 
+    // needed for "dpnp_correlate_c" function in dpnp_krnl_statistics.cpp
     fmap[DPNPFuncName::DPNP_FN_DOT_EXT][eft_INT][eft_INT] = {
         eft_INT, (void *)dpnp_dot_ext_c<int32_t, int32_t, int32_t>};
     fmap[DPNPFuncName::DPNP_FN_DOT_EXT][eft_INT][eft_LNG] = {
diff --git a/dpnp/dpnp_algo/dpnp_algo.pxd b/dpnp/dpnp_algo/dpnp_algo.pxd
index 895b393aeffb..8f8fb37fa191 100644
--- a/dpnp/dpnp_algo/dpnp_algo.pxd
+++ b/dpnp/dpnp_algo/dpnp_algo.pxd
@@ -54,8 +54,6 @@ cdef extern from "dpnp_iface_fptr.hpp" namespace "DPNPFuncName":  # need this na
     DPNP_FN_DIAG_INDICES_EXT
     DPNP_FN_DIAGONAL
     DPNP_FN_DIAGONAL_EXT
-    DPNP_FN_DOT
-    DPNP_FN_DOT_EXT
     DPNP_FN_EDIFF1D
     DPNP_FN_EDIFF1D_EXT
     DPNP_FN_EIG
@@ -284,11 +282,6 @@ cpdef dpnp_descriptor dpnp_isclose(dpnp_descriptor input1, dpnp_descriptor input
                                    double rtol=*, double atol=*, cpp_bool equal_nan=*)
 
-"""
-Linear algebra
-"""
-cpdef dpnp_descriptor dpnp_dot(dpnp_descriptor in_array1, dpnp_descriptor in_array2)
-
 """
 Array creation routines
 """
diff --git a/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi b/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi
index 9b4faf2a1b57..09336b5aaa3d 100644
--- a/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi
+++ b/dpnp/dpnp_algo/dpnp_algo_linearalgebra.pxi
@@ -36,7 +36,6 @@ and the rest of the library
 # NO IMPORTs here.
All imports must be placed into main "dpnp_algo.pyx" file __all__ += [ - "dpnp_dot", "dpnp_inner", "dpnp_kron", ] @@ -47,105 +46,6 @@ ctypedef c_dpctl.DPCTLSyclEventRef(*fptr_2in_1out_shapes_t)(c_dpctl.DPCTLSyclQue void *, void * , void * , shape_elem_type * , shape_elem_type *, shape_elem_type * , size_t, const c_dpctl.DPCTLEventVectorRef) -ctypedef c_dpctl.DPCTLSyclEventRef(*fptr_2in_1out_dot_t)(c_dpctl.DPCTLSyclQueueRef, - void * , const size_t, const size_t, - const shape_elem_type *, const shape_elem_type * , - void * , const size_t, const size_t, - const shape_elem_type *, const shape_elem_type * , - void * , const size_t, const size_t, - const shape_elem_type *, const shape_elem_type * , - const c_dpctl.DPCTLEventVectorRef) except + - -cpdef utils.dpnp_descriptor dpnp_dot(utils.dpnp_descriptor in_array1, - utils.dpnp_descriptor in_array2, - utils.dpnp_descriptor out=None): - cdef shape_type_c shape1, shape2 - - shape1 = in_array1.shape - shape2 = in_array2.shape - - # convert string type names (array.dtype) to C enum DPNPFuncType - cdef DPNPFuncType param1_type = dpnp_dtype_to_DPNPFuncType(in_array1.dtype) - cdef DPNPFuncType param2_type = dpnp_dtype_to_DPNPFuncType(in_array2.dtype) - - # get the FPTR data structure - cdef DPNPFuncData kernel_data = get_dpnp_function_ptr(DPNP_FN_DOT_EXT, param1_type, param2_type) - cdef utils.dpnp_descriptor result - - ndim1 = in_array1.ndim - ndim2 = in_array2.ndim - cdef shape_type_c result_shape - if ndim1 == 0: - result_shape = shape2 - elif ndim2 == 0: - result_shape = shape1 - elif ndim1 == 1 and ndim2 == 1: - result_shape = () - elif ndim1 == 1: # ndim2 > 1 - result_shape = shape2[::-2] if ndim2 == 2 else shape2[::2] - elif ndim2 == 1: # ndim1 > 1 - result_shape = shape1[:-1] - else: - if ndim1 == 1: - shape1 = (1, shape1[0]) - if ndim2 == 1: - shape2 = (shape1[0], 1) - result_shape = shape1[:-1] + shape2[:-2] + shape2[-1:] - - result_sycl_device, result_usm_type, result_sycl_queue = utils.get_common_usm_allocation(in_array1, in_array2) - - if out is None: - # create result array with type given by FPTR data - result = utils.create_output_descriptor(result_shape, - kernel_data.return_type, - None, - device=result_sycl_device, - usm_type=result_usm_type, - sycl_queue=result_sycl_queue) - else: - result_type = dpnp_DPNPFuncType_to_dtype(< size_t > kernel_data.return_type) - if out.dtype != result_type: - utils.checker_throw_value_error('dot', 'out.dtype', out.dtype, result_type) - if out.shape != result_shape: - utils.checker_throw_value_error('dot', 'out.shape', out.shape, result_shape) - - result = out - - utils.get_common_usm_allocation(in_array1, result) # check USM allocation is common - - cdef shape_type_c result_strides = utils.strides_to_vector(result.strides, result.shape) - cdef shape_type_c in_array1_shape = in_array1.shape - cdef shape_type_c in_array1_strides = utils.strides_to_vector(in_array1.strides, in_array1.shape) - cdef shape_type_c in_array2_shape = in_array2.shape - cdef shape_type_c in_array2_strides = utils.strides_to_vector(in_array2.strides, in_array2.shape) - - cdef c_dpctl.SyclQueue q = result_sycl_queue - cdef c_dpctl.DPCTLSyclQueueRef q_ref = q.get_queue_ref() - - cdef fptr_2in_1out_dot_t func = kernel_data.ptr - # call FPTR function - cdef c_dpctl.DPCTLSyclEventRef event_ref = func(q_ref, - result.get_data(), - result.size, - result.ndim, - result_shape.data(), - result_strides.data(), - in_array1.get_data(), - in_array1.size, - in_array1.ndim, - in_array1_shape.data(), - in_array1_strides.data(), - 
in_array2.get_data(), - in_array2.size, - in_array2.ndim, - in_array2_shape.data(), - in_array2_strides.data(), - NULL) # dep_events_ref - - with nogil: c_dpctl.DPCTLEvent_WaitAndThrow(event_ref) - c_dpctl.DPCTLEvent_Delete(event_ref) - - return result cpdef utils.dpnp_descriptor dpnp_inner(dpnp_descriptor array1, dpnp_descriptor array2): diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py index cf848b506905..b5e75dde07c2 100644 --- a/dpnp/dpnp_array.py +++ b/dpnp/dpnp_array.py @@ -704,8 +704,29 @@ def diagonal(input, offset=0, axis1=0, axis2=1): return dpnp.diagonal(input, offset, axis1, axis2) - def dot(self, other, out=None): - return dpnp.dot(self, other, out) + def dot(self, b, out=None): + """ + Dot product of two arrays. + + For full documentation refer to :obj:`dpnp.dot`. + + Examples + -------- + >>> import dpnp as np + >>> a = np.eye(2) + >>> b = np.ones((2, 2)) * 2 + >>> a.dot(b) + array([[2., 2.], + [2., 2.]]) + + This array method can be conveniently chained: + + >>> a.dot(b).dot(b) + array([[8., 8.], + [8., 8.]]) + """ + + return dpnp.dot(self, b, out) @property def dtype(self): diff --git a/dpnp/dpnp_iface_linearalgebra.py b/dpnp/dpnp_iface_linearalgebra.py index d39b84a50ec7..0213a23e265b 100644 --- a/dpnp/dpnp_iface_linearalgebra.py +++ b/dpnp/dpnp_iface_linearalgebra.py @@ -38,13 +38,12 @@ """ -import dpctl.tensor as dpt import numpy import dpnp from dpnp.dpnp_algo import * from dpnp.dpnp_utils import * -from dpnp.dpnp_utils.dpnp_utils_linearalgebra import dpnp_matmul +from dpnp.dpnp_utils.dpnp_utils_linearalgebra import dpnp_dot, dpnp_matmul __all__ = [ "dot", @@ -59,16 +58,28 @@ ] -def dot(x1, x2, out=None, **kwargs): +def dot(a, b, out=None): """ - Dot product of `x1` and `x2`. + Dot product of `a` and `b`. For full documentation refer to :obj:`numpy.dot`. + Parameters + ---------- + a : {dpnp_array, usm_ndarray, scalar} + First input array. Both inputs `a` and `b` can not be scalars at the same time. + b : {dpnp_array, usm_ndarray, scalar} + Second input array. Both inputs `a` and `b` can not be scalars at the same time. + out : {dpnp.ndarray, usm_ndarray}, optional + Alternative output array in which to place the result. It must have + the same shape and data type as the expected output and should be + C-contiguous. If these conditions are not met, an exception is + raised, instead of attempting to be flexible. + Returns ------- - y : dpnp.ndarray - Returns the dot product of `x1` and `x2`. + out : dpnp.ndarray + Returns the dot product of `a` and `b`. If `out` is given, then it is returned. Limitations @@ -81,65 +92,64 @@ def dot(x1, x2, out=None, **kwargs): See Also -------- + :obj:`dpnp.ndarray.dot` : Equivalent method. :obj:`dpnp.tensordot` : Sum products over arbitrary axes. :obj:`dpnp.vdot` : Complex-conjugating dot product. 
Examples -------- - >>> import dpnp as dp - >>> a = dp.array([1, 2, 3]) - >>> b = dp.array([1, 2, 3]) - >>> dp.dot(a, b) - 14 + >>> import dpnp as np + >>> a = np.array([1, 2, 3]) + >>> b = np.array([1, 2, 3]) + >>> np.dot(a, b) + array(14) + + Neither argument is complex-conjugated: + + >>> np.dot(np.array([2j, 3j]), np.array([2j, 3j])) + array(-13+0j) + + For 2-D arrays it is the matrix product: + + >>> a = np.array([[1, 0], [0, 1]]) + >>> b = np.array([[4, 1], [2, 2]]) + >>> np.dot(a, b) + array([[4, 1], + [2, 2]]) + + >>> a = np.arange(3*4*5*6).reshape((3,4,5,6)) + >>> b = np.arange(3*4*5*6)[::-1].reshape((5,4,6,3)) + >>> np.dot(a, b)[2,3,2,1,2,2] + array(499128) + >>> sum(a[2,3,2,:] * b[1,2,:,2]) + array(499128) """ - if kwargs: - pass - elif dpnp.isscalar(x1) and dpnp.isscalar(x2): - # at least either x1 or x2 has to be an array - pass + dpnp.check_supported_arrays_type(a, scalar_type=True) + dpnp.check_supported_arrays_type(b, scalar_type=True) + + if out is not None: + dpnp.check_supported_arrays_type(out) + if not out.flags.c_contiguous: + raise ValueError("Only C-contiguous array is acceptable.") + + if dpnp.isscalar(a) or dpnp.isscalar(b): + return dpnp.multiply(a, b, out=out) + elif a.ndim == 0 or b.ndim == 0: + return dpnp.multiply(a, b, out=out) + elif a.ndim == 1 and b.ndim == 1: + return dpnp_dot(a, b, out=out) + elif a.ndim == 2 and b.ndim == 2: + # NumPy does not allow casting even if it is safe + return dpnp.matmul(a, b, out=out, casting="no") + elif a.ndim > 1 and b.ndim == 1: + # NumPy does not allow casting even if it is safe + return dpnp.matmul(a, b, out=out, casting="no") else: - # get USM type and queue to copy scalar from the host memory into a USM allocation - usm_type, queue = ( - get_usm_allocations([x1, x2]) - if dpnp.isscalar(x1) or dpnp.isscalar(x2) - else (None, None) - ) - - x1_desc = dpnp.get_dpnp_descriptor( - x1, - copy_when_strides=False, - copy_when_nondefault_queue=False, - alloc_usm_type=usm_type, - alloc_queue=queue, - ) - x2_desc = dpnp.get_dpnp_descriptor( - x2, - copy_when_strides=False, - copy_when_nondefault_queue=False, - alloc_usm_type=usm_type, - alloc_queue=queue, - ) - if x1_desc and x2_desc: - if out is not None: - if not isinstance(out, (dpnp.ndarray, dpt.usm_ndarray)): - raise TypeError( - "return array must be of supported array type" - ) - out_desc = ( - dpnp.get_dpnp_descriptor( - out, - copy_when_strides=False, - copy_when_nondefault_queue=False, - ) - or None - ) - else: - out_desc = None - return dpnp_dot(x1_desc, x2_desc, out=out_desc).get_pyobj() - - return call_origin(numpy.dot, x1, x2, out=out, **kwargs) + result = dpnp.tensordot(a, b, axes=(-1, -2)) + # NumPy does not allow casting even if it is safe + return dpnp.get_result_array(result, out, casting="no") def einsum(*args, **kwargs): diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 6a48e4d1e105..b64cb76d1322 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -709,7 +709,7 @@ def copyto(dst, src, casting="same_kind", where=True): src = dpnp.array(src, sycl_queue=dst.sycl_queue) if not dpnp.can_cast(src.dtype, dst.dtype, casting=casting): - raise TypeError( + raise ValueError( f"Cannot cast from {src.dtype} to {dst.dtype} " f"according to the rule {casting}." 
        )
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index d0add55eee39..fb8faa642e12 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -31,20 +31,22 @@
 import dpnp.backend.extensions.blas._blas_impl as bi
 from dpnp.dpnp_utils import get_usm_allocations
 
-__all__ = ["dpnp_matmul"]
+__all__ = ["dpnp_dot", "dpnp_matmul"]
 
 
-def _gemm_res_dtype(*arrays, dtype, casting, sycl_queue):
+def _op_res_dtype(*arrays, dtype, casting, sycl_queue):
     """
-    Determines the output array data type and the intermediate data type.
-
-    If dtype is ``None``, the output array data type is determined based on
-    the Promotion Type Rule and device capabilities. Otherwise, `dtype` is
-    used as output array dtype if input arrays can cast to it according to
-    the casting rule determined. If casting cannot be done, a ``TypeError``
-    is raised.
-    The intermediate data type is the data type used for performing matmul
-    operation calculations. If output array dtype is a floating-point data type,
+    _op_res_dtype(*arrays, dtype, casting, sycl_queue)
+
+    Determines the output array data type and an intermediate data type
+    used in performing calculations related to a specific math function.
+    If dtype is ``None``, the output array data type of the operation is
+    determined based on the Promotion Type Rule and device capabilities.
+    Otherwise, `dtype` is used as the output array dtype if the input arrays
+    can be cast to it according to the specified casting rule. If casting
+    cannot be done, a ``TypeError`` is raised.
+    The intermediate data type is the data type used for performing the math
+    function calculations. If the output array dtype is a floating-point data type,
     it is also used for the intermediate data type. If the output array dtype is
     an integral data type, the default floating-point data type of the device
     where the input arrays are allocated is used as the intermediate data type.
@@ -62,9 +64,9 @@ def _op_res_dtype(*arrays, dtype, casting, sycl_queue):
 
     Returns
     -------
-    gemm_dtype, res_dtype :
-        `gemm_dtype` is the data type used in performing matmul calculations.
-        The input arrays of matmul function are cast to `gemm_dtype` and then
+    op_dtype, res_dtype :
+        `op_dtype` is the data type used in performing math function calculations.
+        The input arrays of the math function are cast to `op_dtype` and then
         the calculations are performed. `res_dtype` is the output data type.
         When the result is obtained, it is cast to `res_dtype`.
 
@@ -82,11 +84,11 @@ def _op_res_dtype(*arrays, dtype, casting, sycl_queue):
             f"Cannot cast ufunc 'matmul' output from dtype({res_dtype}) to dtype({dtype}) with casting rule {casting}"
         )
 
-    gemm_dtype = (
+    op_dtype = (
         res_dtype if dpnp.issubdtype(res_dtype, dpnp.inexact) else default_dtype
     )
 
-    return gemm_dtype, res_dtype
+    return op_dtype, res_dtype
 
 
 def _gemm_batch_matmul(exec_q, x1, x2, res, x1_is_2D, x2_is_2D, dev_tasks_list):
@@ -95,8 +97,10 @@ def _gemm_batch_matmul(exec_q, x1, x2, res, x1_is_2D, x2_is_2D, dev_tasks_list):
     # when the input array is F-contiguous, the data of the 2D array
     # that needs to be passed to mkl::gemm_batch is not contiguous.
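    # An illustrative sketch (not part of the patch) of why these copies are
    # made: an F-ordered dpnp array is not C-contiguous, while the gemm_batch
    # implementation only works with C-contiguous data, e.g.
    #
    #   >>> import dpnp
    #   >>> x = dpnp.ones((2, 3), order="F")
    #   >>> x.flags.c_contiguous
    #   False
    #
    # `_copy_array` below materializes a C-contiguous copy in that case.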
     ht_tasks_list = []
-    x1 = _get_gemm_contig_array(x1, dev_tasks_list, ht_tasks_list)
-    x2 = _get_gemm_contig_array(x2, dev_tasks_list, ht_tasks_list)
+    contig_copy = not x1.flags.c_contiguous
+    x1 = _copy_array(x1, dev_tasks_list, ht_tasks_list, contig_copy=contig_copy)
+    contig_copy = not x2.flags.c_contiguous
+    x2 = _copy_array(x2, dev_tasks_list, ht_tasks_list, contig_copy=contig_copy)
 
     x1_strides = x1.strides
     x2_strides = x2.strides
@@ -149,29 +153,22 @@ def _gemm_batch_matmul(exec_q, x1, x2, res, x1_is_2D, x2_is_2D, dev_tasks_list):
     return ht_blas_ev, ht_tasks_list, res
 
 
-def _get_gemm_contig_array(x, dep_events, host_events, dtype=None):
+def _copy_array(x, dep_events, host_events, contig_copy=False, dtype=None):
     """
     Creating a copy of input array if needed.
 
-    This function has two use cases. In the first use case, which is more general,
-    if the input array is not c-contiguous or f-contiguous, we ensure it becomes
-    c-contiguous. Additionally, if the input array has an integral dtype, we
-    convert it to an appropriate floating-point data type specified by `dtype`.
-    In the second use case, which is for N-dimensional arrays with N>2, we need
-    to ensure c-contiguity. This is crucial because the implementation of the
-    `gemm_batch` function in dpnp only works for C-contiguous arrays. This use case
-    is essential when the input array is f-contiguous with floating point dtype for
-    which the array is not modified in the first use case.
+    If `contig_copy` is ``True``, a C-contiguous copy of the input array is
+    returned. In this case, the copy has the input array's data type unless
+    `dtype` is explicitly specified.
+    If `contig_copy` is ``False`` and the input array's data type is different
+    from `dtype`, a C-contiguous copy of the input array with the specified
+    `dtype` is returned.
 
     """
 
-    if dtype is None:
-        copy = not x.flags.c_contiguous
+    if contig_copy:
+        copy = contig_copy
     else:
-        copy = (
-            not (x.flags.c_contiguous or x.flags.f_contiguous)
-            or x.dtype != dtype
-        )
+        copy = x.dtype != dtype if dtype is not None else False
 
     if copy:
         x_copy = dpnp.empty_like(x, dtype=dtype, order="C")
@@ -186,6 +183,79 @@
     return x
 
 
+def dpnp_dot(
+    a,
+    b,
+    /,
+    out=None,
+):
+    """
+    dpnp_dot(a, b, out=None)
+
+    Return the dot product of two arrays.
+
+    The main calculation is done by calling an extension function
+    for the BLAS library of OneMKL. Depending on the data type of the `a` and
+    `b` arrays, it will be either ``dot`` (for real-valued data types) or
+    ``dotc`` (for complex-valued data types).
+
+    """
+
+    if a.size != b.size:
+        raise ValueError(
+            "Input arrays have a mismatch in their size. "
+            f"(size {a.size} is different from {b.size})"
+        )
+
+    res_usm_type, exec_q = get_usm_allocations([a, b])
+
+    # Determine the appropriate data types
+    # casting is irrelevant here since dtype is ``None``
+    dot_dtype, res_dtype = _op_res_dtype(
+        a, b, dtype=None, casting="no", sycl_queue=exec_q
+    )
+
+    # create result array
+    result = dpnp.empty(
+        (),
+        dtype=dot_dtype,
+        usm_type=res_usm_type,
+        sycl_queue=exec_q,
+    )
+
+    # input arrays should have the proper data type
+    dep_events_list = []
+    host_tasks_list = []
+    a = _copy_array(a, dep_events_list, host_tasks_list, dtype=dot_dtype)
+    b = _copy_array(b, dep_events_list, host_tasks_list, dtype=dot_dtype)
+
+    if dpnp.issubdtype(dot_dtype, dpnp.complexfloating):
+        ht_blas_ev, _ = bi._dotc(
+            exec_q,
+            # OneMKL's dotc conjugates the first input vector, while dpnp.dot
+            # must not conjugate either argument; conjugating `a` up front
+            # cancels the conjugation performed by dotc
+            dpnp.get_usm_ndarray(a.conj()),
+            dpnp.get_usm_ndarray(b),
+            dpnp.get_usm_ndarray(result),
+            dep_events_list,
+        )
+    else:
+        ht_blas_ev, _ = bi._dot(
+            exec_q,
+            dpnp.get_usm_ndarray(a),
+            dpnp.get_usm_ndarray(b),
+            dpnp.get_usm_ndarray(result),
+            dep_events_list,
+        )
+    host_tasks_list.append(ht_blas_ev)
+    dpctl.SyclEvent.wait_for(host_tasks_list)
+
+    if dot_dtype != res_dtype:
+        result = result.astype(res_dtype, copy=False)
+
+    # NumPy does not allow casting even if it is safe
+    return dpnp.get_result_array(result, out, casting="no")
+
+
 def dpnp_matmul(
     x1,
     x2,
@@ -222,14 +292,16 @@ def dpnp_matmul(
 
     res_usm_type, exec_q = get_usm_allocations([x1, x2])
 
-    squeeze_flag = x1_ndim == 1 or x2_ndim == 1
+    appended_axes = []
     if x1_ndim == 1:
         x1 = x1[dpnp.newaxis, :]
         x1_ndim = x1.ndim
+        appended_axes.append(-2)
 
     if x2_ndim == 1:
         x2 = x2[:, dpnp.newaxis]
         x2_ndim = x2.ndim
+        appended_axes.append(-1)
 
     x1_shape = x1.shape
     x2_shape = x2.shape
@@ -241,7 +313,7 @@ def dpnp_matmul(
     )
 
     # Determine the appropriate data types
-    gemm_dtype, res_dtype = _gemm_res_dtype(
+    gemm_dtype, res_dtype = _op_res_dtype(
         x1, x2, dtype=dtype, casting=casting, sycl_queue=exec_q
     )
 
@@ -306,11 +378,21 @@ def dpnp_matmul(
     # and be C_CONTIGUOUS or F_CONTIGUOUS
     dep_events_list = []
     host_tasks_list = []
-    x1 = _get_gemm_contig_array(
-        x1, dep_events_list, host_tasks_list, gemm_dtype
+    contig_copy = not (x1.flags.c_contiguous or x1.flags.f_contiguous)
+    x1 = _copy_array(
+        x1,
+        dep_events_list,
+        host_tasks_list,
+        contig_copy=contig_copy,
+        dtype=gemm_dtype,
     )
-    x2 = _get_gemm_contig_array(
-        x2, dep_events_list, host_tasks_list, gemm_dtype
+    contig_copy = not (x2.flags.c_contiguous or x2.flags.f_contiguous)
+    x2 = _copy_array(
+        x2,
+        dep_events_list,
+        host_tasks_list,
+        contig_copy=contig_copy,
+        dtype=gemm_dtype,
     )
 
     if x1_is_2D and x2_is_2D:
@@ -340,8 +422,8 @@ def dpnp_matmul(
     host_tasks_list.append(ht_blas_ev)
     dpctl.SyclEvent.wait_for(host_tasks_list)
 
-    if squeeze_flag:
-        result = dpnp.squeeze(result)
+    if appended_axes:
+        result = dpnp.squeeze(result, tuple(appended_axes))
 
     if x1_is_2D and x2_is_2D:
         # add new axes only if one of the input arrays
diff --git a/tests/skipped_tests.tbl b/tests/skipped_tests.tbl
index 8eb46d3c9839..c90a3123b79b 100644
--- a/tests/skipped_tests.tbl
+++ b/tests/skipped_tests.tbl
@@ -355,13 +355,12 @@ tests/third_party/cupy/linalg_tests/test_einsum.py::TestEinSumUnaryOperationWith
 tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test_dim_mismatch3
 tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test_invalid_sub1
 tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test_too_many_dims3
+
tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_invlarge tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_large tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_of_two -tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_dot_vec2 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_multidim_vdot tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_tensordot_zero_dim -tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_dot_with_out_f_contiguous tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_multidim_vdot tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_int_axes diff --git a/tests/skipped_tests_gpu.tbl b/tests/skipped_tests_gpu.tbl index b8c195b98611..84eb9fe75351 100644 --- a/tests/skipped_tests_gpu.tbl +++ b/tests/skipped_tests_gpu.tbl @@ -151,8 +151,6 @@ tests/third_party/cupy/linalg_tests/test_einsum.py::TestEinSumError::test_too_ma tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test_dim_mismatch3 tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test_too_many_dims3 -tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_reversed_vdot - tests/third_party/cupy/random_tests/test_distributions.py::TestDistributionsMultivariateNormal_param_0_{d=2, shape=(4, 3, 2)}::test_normal tests/third_party/cupy/random_tests/test_distributions.py::TestDistributionsMultivariateNormal_param_1_{d=2, shape=(3, 2)}::test_normal tests/third_party/cupy/random_tests/test_distributions.py::TestDistributionsMultivariateNormal_param_2_{d=4, shape=(4, 3, 2)}::test_normal @@ -459,17 +457,17 @@ tests/third_party/cupy/linalg_tests/test_einsum.py::TestEinSumLarge_param_9_{opt tests/third_party/cupy/linalg_tests/test_einsum.py::TestEinSumUnaryOperationWithScalar::test_scalar_float tests/third_party/cupy/linalg_tests/test_einsum.py::TestEinSumUnaryOperationWithScalar::test_scalar_int tests/third_party/cupy/linalg_tests/test_einsum.py::TestListArgEinSumError::test_invalid_sub1 + tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_invlarge tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_large tests/third_party/cupy/linalg_tests/test_product.py::TestMatrixPower::test_matrix_power_of_two -tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_dot_vec2 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_multidim_vdot tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_int_axes tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_list_axes tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_tensordot_zero_dim -tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_dot_with_out_f_contiguous tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_multidim_vdot +tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_reversed_vdot 
tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_broadcast_not_allowed tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_diff_dtypes_is_equal diff --git a/tests/skipped_tests_gpu_no_fp64.tbl b/tests/skipped_tests_gpu_no_fp64.tbl index 26e11a700624..d724a6043e57 100644 --- a/tests/skipped_tests_gpu_no_fp64.tbl +++ b/tests/skipped_tests_gpu_no_fp64.tbl @@ -30,91 +30,6 @@ tests/test_umath.py::test_umaths[('floor_divide', 'ff')] tests/third_party/cupy/linalg_tests/test_eigenvalue.py::TestEigenvalue_param_0_{UPLO='U'}::test_eigh_batched tests/third_party/cupy/linalg_tests/test_eigenvalue.py::TestEigenvalue_param_1_{UPLO='L'}::test_eigh_batched -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_0_{shape=((2, 3, 4), (3, 4, 2)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_1_{shape=((2, 3, 4), (3, 4, 2)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_2_{shape=((2, 3, 4), (3, 4, 2)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_3_{shape=((2, 3, 4), (3, 4, 2)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_4_{shape=((1, 1), (1, 1)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_5_{shape=((1, 1), (1, 1)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_6_{shape=((1, 1), (1, 1)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_7_{shape=((1, 1), (1, 1)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_8_{shape=((1, 1), (1, 2)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_9_{shape=((1, 1), (1, 2)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_10_{shape=((1, 1), (1, 2)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_11_{shape=((1, 1), (1, 2)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_12_{shape=((1, 2), (2, 1)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_13_{shape=((1, 2), (2, 1)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_14_{shape=((1, 2), (2, 1)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_15_{shape=((1, 2), (2, 1)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_16_{shape=((2, 1), (1, 1)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_17_{shape=((2, 1), (1, 1)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_18_{shape=((2, 1), (1, 1)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_19_{shape=((2, 1), (1, 1)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_20_{shape=((1, 2), (2, 3)), trans_a=True, 
trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_21_{shape=((1, 2), (2, 3)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_22_{shape=((1, 2), (2, 3)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_23_{shape=((1, 2), (2, 3)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_24_{shape=((2, 1), (1, 3)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_25_{shape=((2, 1), (1, 3)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_26_{shape=((2, 1), (1, 3)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_27_{shape=((2, 1), (1, 3)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_28_{shape=((2, 3), (3, 1)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_29_{shape=((2, 3), (3, 1)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_30_{shape=((2, 3), (3, 1)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_31_{shape=((2, 3), (3, 1)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_32_{shape=((2, 3), (3, 4)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_33_{shape=((2, 3), (3, 4)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_34_{shape=((2, 3), (3, 4)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_35_{shape=((2, 3), (3, 4)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_36_{shape=((0, 3), (3, 4)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_37_{shape=((0, 3), (3, 4)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_38_{shape=((0, 3), (3, 4)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_39_{shape=((0, 3), (3, 4)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_40_{shape=((2, 3), (3, 0)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_41_{shape=((2, 3), (3, 0)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_42_{shape=((2, 3), (3, 0)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_43_{shape=((2, 3), (3, 0)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_44_{shape=((0, 3), (3, 0)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_45_{shape=((0, 3), (3, 0)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_46_{shape=((0, 3), (3, 0)), trans_a=False, trans_b=True}::test_dot 
-tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_47_{shape=((0, 3), (3, 0)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_48_{shape=((3, 0), (0, 4)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_49_{shape=((3, 0), (0, 4)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_50_{shape=((3, 0), (0, 4)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_51_{shape=((3, 0), (0, 4)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_52_{shape=((2, 3, 0), (3, 0, 2)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_53_{shape=((2, 3, 0), (3, 0, 2)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_54_{shape=((2, 3, 0), (3, 0, 2)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_55_{shape=((2, 3, 0), (3, 0, 2)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_56_{shape=((0, 0), (0, 0)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_57_{shape=((0, 0), (0, 0)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_58_{shape=((0, 0), (0, 0)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_59_{shape=((0, 0), (0, 0)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_60_{shape=((3,), (3,)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_61_{shape=((3,), (3,)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_62_{shape=((3,), (3,)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_63_{shape=((3,), (3,)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_64_{shape=((2,), (2, 4)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_65_{shape=((2,), (2, 4)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_66_{shape=((2,), (2, 4)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_67_{shape=((2,), (2, 4)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_68_{shape=((4, 2), (2,)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_69_{shape=((4, 2), (2,)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_70_{shape=((4, 2), (2,)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDot_param_71_{shape=((4, 2), (2,)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_0_{shape=((), ()), trans_a=True, trans_b=True}::test_dot 
-tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_1_{shape=((), ()), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_2_{shape=((), ()), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_3_{shape=((), ()), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_4_{shape=((), (2, 4)), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_5_{shape=((), (2, 4)), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_6_{shape=((), (2, 4)), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_7_{shape=((), (2, 4)), trans_a=False, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_8_{shape=((4, 2), ()), trans_a=True, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_9_{shape=((4, 2), ()), trans_a=True, trans_b=False}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_10_{shape=((4, 2), ()), trans_a=False, trans_b=True}::test_dot -tests/third_party/cupy/linalg_tests/test_product.py::TestDotFor0Dim_param_11_{shape=((4, 2), ()), trans_a=False, trans_b=False}::test_dot - tests/third_party/cupy/random_tests/test_distributions.py::TestDistributionsBeta_param_6_{a_shape=(3, 2), b_shape=(3, 2), shape=(4, 3, 2)}::test_beta tests/third_party/cupy/random_tests/test_distributions.py::TestDistributionsBeta_param_7_{a_shape=(3, 2), b_shape=(3, 2), shape=(3, 2)}::test_beta tests/third_party/cupy/random_tests/test_distributions.py::TestDistributionsChisquare_param_0_{df_shape=(), shape=(4, 3, 2)}::test_chisquare diff --git a/tests/test_dot.py b/tests/test_dot.py index 80da5090e1b7..34d9d70a7270 100644 --- a/tests/test_dot.py +++ b/tests/test_dot.py @@ -1,52 +1,373 @@ +import dpctl import numpy import pytest from numpy.testing import assert_allclose, assert_array_equal -import dpnp as inp +import dpnp -from .helper import get_all_dtypes +from .helper import assert_dtype_allclose, get_all_dtypes, get_complex_dtypes -@pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True)) -def test_dot_ones(type): +@pytest.mark.parametrize("dtype", get_all_dtypes()) +def test_dot_ones(dtype): n = 10**5 - a = numpy.ones(n, dtype=type) - b = numpy.ones(n, dtype=type) - ia = inp.array(a) - ib = inp.array(b) + a = numpy.ones(n, dtype=dtype) + b = numpy.ones(n, dtype=dtype) + ia = dpnp.array(a) + ib = dpnp.array(b) - result = inp.dot(ia, ib) + result = dpnp.dot(ia, ib) expected = numpy.dot(a, b) - assert_array_equal(expected, result) + assert_dtype_allclose(result, expected) -@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True)) +@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) def test_dot_arange(dtype): n = 10**2 - m = 10**3 if dtype is not inp.float32 else 10**2 + m = 10**3 if dtype is not dpnp.float32 else 10**2 a = numpy.hstack((numpy.arange(n, dtype=dtype),) * m) b = numpy.flipud(a) - ia = inp.array(a) - ib = inp.array(b) + ia = dpnp.array(a) + ib = dpnp.array(b) + + result = dpnp.dot(ia, ib) + expected = numpy.dot(a, b) + assert_dtype_allclose(result, expected) + + +@pytest.mark.parametrize("dtype", get_all_dtypes()) +def test_dot_scalar(dtype): + a 
= 2 + b = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype) + ib = dpnp.array(b) + + result = dpnp.dot(a, ib) + expected = numpy.dot(a, b) + assert_allclose(result, expected) + + +# TODO: get rid of falls back on NumPy when tensordot +# is implemented using OneMKL +@pytest.mark.usefixtures("allow_fall_back_on_numpy") +@pytest.mark.parametrize("dtype", get_all_dtypes(no_complex=True)) +@pytest.mark.parametrize( + "array_info", + [ + (1, 10, (), (10,)), + (10, 1, (10,), ()), + (1, 1, (), ()), + (10, 10, (10,), (10,)), + (12, 6, (4, 3), (3, 2)), + (12, 3, (4, 3), (3,)), + (60, 3, (5, 4, 3), (3,)), + (4, 8, (4,), (4, 2)), + (60, 48, (5, 3, 4), (6, 4, 2)), + ], + ids=[ + "0d_1d", + "1d_0d", + "0d_0d", + "1d_1d", + "2d_2d", + "2d_1d", + "3d_1d", + "1d_2d", + "3d_3d", + ], +) +def test_dot(dtype, array_info): + size1, size2, shape1, shape2 = array_info + a = numpy.array(numpy.random.uniform(-5, 5, size1), dtype=dtype).reshape( + shape1 + ) + b = numpy.array(numpy.random.uniform(-5, 5, size2), dtype=dtype).reshape( + shape2 + ) + ia = dpnp.array(a) + ib = dpnp.array(b) + + result = dpnp.dot(ia, ib) + expected = numpy.dot(a, b) + assert_dtype_allclose(result, expected) + + +# TODO: get rid of falls back on NumPy when tensordot +# is implemented using OneMKL +@pytest.mark.usefixtures("allow_fall_back_on_numpy") +@pytest.mark.parametrize("dtype", get_complex_dtypes()) +@pytest.mark.parametrize( + "array_info", + [ + (1, 10, (), (10,)), + (10, 1, (10,), ()), + (1, 1, (), ()), + (10, 10, (10,), (10,)), + (12, 6, (4, 3), (3, 2)), + (12, 3, (4, 3), (3,)), + (60, 3, (5, 4, 3), (3,)), + (4, 8, (4,), (4, 2)), + (60, 48, (5, 3, 4), (6, 4, 2)), + ], + ids=[ + "0d_1d", + "1d_0d", + "0d_0d", + "1d_1d", + "2d_2d", + "2d_1d", + "3d_1d", + "1d_2d", + "3d_3d", + ], +) +def test_dot_complex(dtype, array_info): + size1, size2, shape1, shape2 = array_info + x11 = numpy.random.uniform(-5, 5, size1) + x12 = numpy.random.uniform(-5, 5, size1) + x21 = numpy.random.uniform(-5, 5, size2) + x22 = numpy.random.uniform(-5, 5, size2) + a = numpy.array(x11 + 1j * x12, dtype=dtype).reshape(shape1) + b = numpy.array(x21 + 1j * x22, dtype=dtype).reshape(shape2) + ia = dpnp.array(a) + ib = dpnp.array(b) + + result = dpnp.dot(ia, ib) + expected = numpy.dot(a, b) + assert_dtype_allclose(result, expected) + + +# TODO: get rid of falls back on NumPy when tensordot +# is implemented using OneMKL +@pytest.mark.usefixtures("allow_fall_back_on_numpy") +@pytest.mark.parametrize("dtype", get_all_dtypes()) +@pytest.mark.parametrize( + "array_info", + [ + (1, 10, (), (10,)), + (10, 1, (10,), ()), + (1, 1, (), ()), + (10, 10, (10,), (10,)), + (12, 6, (4, 3), (3, 2)), + (12, 3, (4, 3), (3,)), + (60, 3, (5, 4, 3), (3,)), + (4, 8, (4,), (4, 2)), + (60, 48, (5, 3, 4), (6, 4, 2)), + ], + ids=[ + "0d_1d", + "1d_0d", + "0d_0d", + "1d_1d", + "2d_2d", + "2d_1d", + "3d_1d", + "1d_2d", + "3d_3d", + ], +) +def test_dot_ndarray(dtype, array_info): + size1, size2, shape1, shape2 = array_info + a = numpy.array(numpy.random.uniform(-5, 5, size1), dtype=dtype).reshape( + shape1 + ) + b = numpy.array(numpy.random.uniform(-5, 5, size2), dtype=dtype).reshape( + shape2 + ) + ia = dpnp.array(a) + ib = dpnp.array(b) + + result = ia.dot(ib) + expected = a.dot(b) + assert_dtype_allclose(result, expected) + + +@pytest.mark.parametrize("dtype", get_all_dtypes()) +def test_dot_strided(dtype): + a = numpy.ones(25, dtype=dtype) + b = numpy.ones(25, dtype=dtype) + ia = dpnp.array(a) + ib = dpnp.array(b) + + result = dpnp.dot(ia[::3], ib[::3]) + expected = 
numpy.dot(a[::3], b[::3]) + assert_dtype_allclose(result, expected) + + # TODO: unmute when the problem with negative stride is fixed + # result = dpnp.dot(ia, ib[::-1]) + # expected = numpy.dot(a, b[::-1]) + # assert_dtype_allclose(result, expected) + - result = inp.dot(ia, ib) +@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) +def test_dot_out_scalar(dtype): + size = 10 + a = 2 + b = numpy.array(numpy.random.uniform(-5, 5, size), dtype=dtype) + ia = 2 + ib = dpnp.array(b) + + dp_out = dpnp.empty((size,), dtype=dtype) + result = dpnp.dot(ia, ib, out=dp_out) + expected = numpy.dot(a, b) + + assert result is dp_out + assert_allclose(result, expected) + + +# TODO: get rid of falls back on NumPy when tensordot +# is implemented using OneMKL +@pytest.mark.usefixtures("allow_fall_back_on_numpy") +@pytest.mark.parametrize("dtype", get_all_dtypes()) +@pytest.mark.parametrize( + "array_info", + [ + (1, 10, (), (10,), (10,)), + (10, 1, (10,), (), (10,)), + (1, 1, (), (), ()), + (10, 10, (10,), (10,), ()), + (12, 6, (4, 3), (3, 2), (4, 2)), + (12, 3, (4, 3), (3,), (4,)), + (60, 3, (5, 4, 3), (3,), (5, 4)), + (4, 8, (4,), (4, 2), (2,)), + (60, 48, (5, 3, 4), (6, 4, 2), (5, 3, 6, 2)), + ], + ids=[ + "0d_1d", + "1d_0d", + "0d_0d", + "1d_1d", + "2d_2d", + "2d_1d", + "3d_1d", + "1d_2d", + "3d_3d", + ], +) +def test_dot_out(dtype, array_info): + size1, size2, shape1, shape2, out_shape = array_info + a = numpy.array(numpy.random.uniform(-5, 5, size1), dtype=dtype).reshape( + shape1 + ) + b = numpy.array(numpy.random.uniform(-5, 5, size2), dtype=dtype).reshape( + shape2 + ) + ia = dpnp.array(a) + ib = dpnp.array(b) + + dp_out = dpnp.empty(out_shape, dtype=dtype) + result = dpnp.dot(ia, ib, out=dp_out) expected = numpy.dot(a, b) - assert_allclose(expected, result) + + assert result is dp_out + assert_dtype_allclose(result, expected) + + +@pytest.mark.parametrize("dtype1", get_all_dtypes()) +@pytest.mark.parametrize("dtype2", get_all_dtypes()) +def test_dot_input_dtype_matrix(dtype1, dtype2): + a = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype1) + b = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype2) + ia = dpnp.array(a) + ib = dpnp.array(b) + + result = dpnp.dot(ia, ib) + expected = numpy.dot(a, b) + assert_dtype_allclose(result, expected) + + +def test_dot_1d_error(): + a = dpnp.ones(25) + b = dpnp.ones(24) + # size of input arrays differ + with pytest.raises(ValueError): + dpnp.dot(a, b) + + +def test_dot_sycl_queue_error(): + a = dpnp.ones((5,), sycl_queue=dpctl.SyclQueue()) + b = dpnp.ones((5,), sycl_queue=dpctl.SyclQueue()) + with pytest.raises(ValueError): + dpnp.dot(a, b) + + +# NumPy does not raise an error for the following test. 
+# it just does not update the out keyword if it is not properly defined +@pytest.mark.parametrize("ia", [1, dpnp.ones((), dtype=dpnp.int32)]) +def test_dot_out_error_scalar(ia): + ib = dpnp.ones(10, dtype=dpnp.int32) + + # output data type is incorrect + dp_out = dpnp.empty((10,), dtype=dpnp.int64) + with pytest.raises(ValueError): + dpnp.dot(ia, ib, out=dp_out) + + # output shape is incorrect + dp_out = dpnp.empty((2,), dtype=dpnp.int32) + with pytest.raises(ValueError): + dpnp.dot(ia, ib, out=dp_out) + + +# TODO: get rid of falls back on NumPy when tensordot +# is implemented using OneMKL +@pytest.mark.usefixtures("allow_fall_back_on_numpy") +@pytest.mark.parametrize( + "shape_pair", + [ + ((10,), (10,), ()), + ((3, 4), (4, 2), (3, 2)), + ((3, 4), (4,), (3, 1)), + ((5, 4, 3), (3,), (5, 4)), + ((4,), (3, 4, 2), (3, 2)), + ((5, 3, 4), (6, 4, 2), (5, 3, 6, 2)), + ], + ids=["1d_1d", "2d_2d", "2d_1d", "3d_1d", "1d_3d", "3d_3d"], +) +def test_dot_out_error(shape_pair): + shape1, shape2, shape_out = shape_pair + a = numpy.ones(shape1, dtype=numpy.int32) + b = numpy.ones(shape2, dtype=numpy.int32) + ia = dpnp.array(a) + ib = dpnp.array(b) + + # output data type is incorrect + np_out = numpy.empty(shape_out, dtype=numpy.int64) + dp_out = dpnp.empty(shape_out, dtype=dpnp.int64) + with pytest.raises(ValueError): + dpnp.dot(ia, ib, out=dp_out) + with pytest.raises(ValueError): + numpy.dot(a, b, out=np_out) + + # output shape is incorrect + np_out = numpy.empty((2, 3), dtype=numpy.int32) + dp_out = dpnp.empty((2, 3), dtype=dpnp.int32) + with pytest.raises(ValueError): + dpnp.dot(ia, ib, out=dp_out) + with pytest.raises(ValueError): + numpy.dot(a, b, out=np_out) + + if not (a.ndim == 1 and b.ndim == 1): + # output should be C-contiguous + np_out = numpy.empty(shape_out, dtype=numpy.int32, order="F") + dp_out = dpnp.empty(shape_out, dtype=dpnp.int32, order="F") + with pytest.raises(ValueError): + dpnp.dot(ia, ib, out=dp_out) + with pytest.raises(ValueError): + numpy.dot(a, b, out=np_out) @pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True)) def test_multi_dot(type): n = 16 - a = inp.reshape(inp.arange(n, dtype=type), (4, 4)) - b = inp.reshape(inp.arange(n, dtype=type), (4, 4)) - c = inp.reshape(inp.arange(n, dtype=type), (4, 4)) - d = inp.reshape(inp.arange(n, dtype=type), (4, 4)) + a = dpnp.reshape(dpnp.arange(n, dtype=type), (4, 4)) + b = dpnp.reshape(dpnp.arange(n, dtype=type), (4, 4)) + c = dpnp.reshape(dpnp.arange(n, dtype=type), (4, 4)) + d = dpnp.reshape(dpnp.arange(n, dtype=type), (4, 4)) a1 = numpy.arange(n, dtype=type).reshape((4, 4)) b1 = numpy.arange(n, dtype=type).reshape((4, 4)) c1 = numpy.arange(n, dtype=type).reshape((4, 4)) d1 = numpy.arange(n, dtype=type).reshape((4, 4)) - result = inp.linalg.multi_dot([a, b, c, d]) + result = dpnp.linalg.multi_dot([a, b, c, d]) expected = numpy.linalg.multi_dot([a1, b1, c1, d1]) assert_array_equal(expected, result) diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py index 75735e89bc90..c5f6bcdce43a 100644 --- a/tests/test_mathematical.py +++ b/tests/test_mathematical.py @@ -2525,6 +2525,7 @@ class TestMatmul: ((4,), (4,)), ((4,), (4, 2)), ((2, 4), (4,)), + ((1, 4), (4,)), # output should be 1-d not 0-d ((2, 4), (4, 3)), ((1, 2, 3), (1, 3, 5)), ((4, 2, 3), (4, 3, 5)), @@ -2680,7 +2681,7 @@ def test_matmul_dtype(self, dtype, shape_pair): "((6, 7, 4, 3), (6, 7, 3, 5))", ], ) - def test_matmul_dtype_matrix_inputs(self, dtype1, dtype2, shape_pair): + def test_matmul_dtype_matrix_inout(self, dtype1, dtype2, 
shape_pair): shape1, shape2 = shape_pair a1 = numpy.arange(numpy.prod(shape1), dtype=dtype1).reshape(shape1) a2 = numpy.arange(numpy.prod(shape2), dtype=dtype1).reshape(shape2) @@ -2714,7 +2715,7 @@ def test_matmul_dtype_matrix_inputs(self, dtype1, dtype2, shape_pair): "((6, 7, 4, 3), (6, 7, 3, 5))", ], ) - def test_matmul_dtype_matrix_inout(self, dtype1, dtype2, shape_pair): + def test_matmul_dtype_matrix_inputs(self, dtype1, dtype2, shape_pair): shape1, shape2 = shape_pair a1 = numpy.arange(numpy.prod(shape1), dtype=dtype1).reshape(shape1) a2 = numpy.arange(numpy.prod(shape2), dtype=dtype2).reshape(shape2) @@ -2836,7 +2837,7 @@ def test_invalid_dtype(self, dtype): a2 = dpnp.arange(7 * 4, dtype=dpnp_dtype).reshape(4, 7) dp_out = dpnp.empty((5, 7), dtype=dtype) - with pytest.raises(TypeError): + with pytest.raises(ValueError): dpnp.matmul(a1, a2, out=dp_out) def test_exe_q(self): diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py index 4b0d21eae40a..e781f5cc6a0a 100644 --- a/tests/test_sycl_queue.py +++ b/tests/test_sycl_queue.py @@ -529,8 +529,8 @@ def test_reduce_hypot(device): ), pytest.param( "dot", - [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]], - [[4.0, 4.0], [4.0, 4.0], [4.0, 4.0]], + [3.0, 4.0, 5.0], + [1.0, 2.0, 3.0], ), pytest.param( "floor_divide", [1.0, 2.0, 3.0, 4.0], [2.5, 2.5, 2.5, 2.5] @@ -837,8 +837,8 @@ def test_out_1in_1out(func, data, device): ), pytest.param( "dot", - [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]], - [[4.0, 4.0], [4.0, 4.0], [4.0, 4.0]], + [3.0, 4.0, 5.0], + [1.0, 2.0, 3.0], ), pytest.param( "floor_divide", [1.0, 2.0, 3.0, 4.0], [2.5, 2.5, 2.5, 2.5] diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py index ada68ebfa6c3..e38416db3f7a 100644 --- a/tests/test_usm_type.py +++ b/tests/test_usm_type.py @@ -493,8 +493,8 @@ def test_1in_1out(func, data, usm_type): pytest.param("copysign", [0.0, 1.0, 2.0], [-1.0, 0.0, 1.0]), pytest.param( "dot", - [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]], - [[4.0, 4.0], [4.0, 4.0], [4.0, 4.0]], + [3.0, 4.0, 5.0], + [1.0, 2.0, 3.0], ), pytest.param("fmax", [[0.0, 1.0, 2.0]], [[3.0, 4.0, 5.0]]), pytest.param("fmin", [[0.0, 1.0, 2.0]], [[3.0, 4.0, 5.0]]), diff --git a/tests/third_party/cupy/linalg_tests/test_product.py b/tests/third_party/cupy/linalg_tests/test_product.py index 93b13c93e873..c5bad7f10b9b 100644 --- a/tests/third_party/cupy/linalg_tests/test_product.py +++ b/tests/third_party/cupy/linalg_tests/test_product.py @@ -36,10 +36,12 @@ } ) ) -@testing.gpu +# TODO: get rid of falls back on NumPy when tensordot +# is implemented using OneMKL +@pytest.mark.usefixtures("allow_fall_back_on_numpy") class TestDot(unittest.TestCase): @testing.for_all_dtypes_combination(["dtype_a", "dtype_b"]) - @testing.numpy_cupy_allclose() + @testing.numpy_cupy_allclose(type_check=has_support_aspect64()) def test_dot(self, xp, dtype_a, dtype_b): shape_a, shape_b = self.shape if self.trans_a: @@ -72,7 +74,7 @@ def test_dot_with_out(self, xp, dtype_a, dtype_b, dtype_c): shape_c = shape_a[:-1] + shape_b[:-2] + shape_b[-1:] c = xp.empty(shape_c, dtype=dtype_c) out = xp.dot(a, b, out=c) - self.assertIs(out, c) + assert out is c return c @@ -128,10 +130,11 @@ def test_cross(self, xp, dtype_a, dtype_b): } ) ) -@testing.gpu class TestDotFor0Dim(unittest.TestCase): @testing.for_all_dtypes_combination(["dtype_a", "dtype_b"]) - @testing.numpy_cupy_allclose(contiguous_check=False) + @testing.numpy_cupy_allclose( + type_check=has_support_aspect64(), contiguous_check=False + ) def test_dot(self, xp, dtype_a, dtype_b): shape_a, shape_b = self.shape if 
self.trans_a: @@ -145,8 +148,7 @@ def test_dot(self, xp, dtype_a, dtype_b): return xp.dot(a, b) -@testing.gpu -class TestProduct(unittest.TestCase): +class TestProduct: @testing.for_all_dtypes() @testing.numpy_cupy_allclose() def test_dot_vec1(self, xp, dtype): @@ -154,6 +156,9 @@ def test_dot_vec1(self, xp, dtype): b = testing.shaped_arange((2,), xp, dtype) return xp.dot(a, b) + # TODO: get rid of falls back on NumPy when tensordot + # is implemented using OneMKL + @pytest.mark.usefixtures("allow_fall_back_on_numpy") @testing.for_all_dtypes() @testing.numpy_cupy_allclose() def test_dot_vec2(self, xp, dtype): @@ -168,6 +173,9 @@ def test_dot_vec3(self, xp, dtype): b = testing.shaped_arange((2,), xp, dtype) return xp.dot(a, b) + # TODO: get rid of falls back on NumPy when tensordot + # is implemented using OneMKL + @pytest.mark.usefixtures("allow_fall_back_on_numpy") @testing.for_all_dtypes() @testing.numpy_cupy_allclose() def test_transposed_dot(self, xp, dtype): @@ -175,6 +183,9 @@ def test_transposed_dot(self, xp, dtype): b = testing.shaped_arange((2, 3, 4), xp, dtype).transpose(0, 2, 1) return xp.dot(a, b) + # TODO: get rid of falls back on NumPy when tensordot + # is implemented using OneMKL + @pytest.mark.usefixtures("allow_fall_back_on_numpy") @testing.for_all_dtypes() @testing.numpy_cupy_allclose() def test_transposed_dot_with_out(self, xp, dtype): @@ -184,6 +195,9 @@ def test_transposed_dot_with_out(self, xp, dtype): xp.dot(a, b, out=c) return c + # TODO: get rid of falls back on NumPy when tensordot + # is implemented using OneMKL + @pytest.mark.usefixtures("allow_fall_back_on_numpy") @testing.for_all_dtypes() def test_transposed_dot_with_out_f_contiguous(self, dtype): for xp in (numpy, cupy): From 7e008925b64ec4627eff8e19c8c262edaf264438 Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Thu, 25 Jan 2024 15:23:45 -0600 Subject: [PATCH 02/11] using mkl::dotu instead of mkl::dotc for complex --- dpnp/backend/extensions/blas/CMakeLists.txt | 2 +- dpnp/backend/extensions/blas/blas_py.cpp | 6 +-- dpnp/backend/extensions/blas/dot.hpp | 4 +- .../extensions/blas/{dotc.cpp => dotu.cpp} | 44 +++++++++---------- dpnp/backend/extensions/blas/types_matrix.hpp | 4 +- dpnp/dpnp_iface_manipulation.py | 2 +- dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 5 +-- tests/test_mathematical.py | 2 +- .../cupy/linalg_tests/test_product.py | 7 ++- 9 files changed, 40 insertions(+), 36 deletions(-) rename dpnp/backend/extensions/blas/{dotc.cpp => dotu.cpp} (88%) diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt index e85c8fd0aca5..fe3a92d2181d 100644 --- a/dpnp/backend/extensions/blas/CMakeLists.txt +++ b/dpnp/backend/extensions/blas/CMakeLists.txt @@ -28,7 +28,7 @@ set(python_module_name _blas_impl) set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/blas_py.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dot.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dotc.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dotu.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gemm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gemm_batch.cpp ) diff --git a/dpnp/backend/extensions/blas/blas_py.cpp b/dpnp/backend/extensions/blas/blas_py.cpp index cb7caa9e2d22..893e240ef251 100644 --- a/dpnp/backend/extensions/blas/blas_py.cpp +++ b/dpnp/backend/extensions/blas/blas_py.cpp @@ -40,7 +40,7 @@ namespace py = pybind11; void init_dispatch_tables(void) { blas_ext::init_dot_dispatch_table(); - blas_ext::init_dotc_dispatch_table(); + blas_ext::init_dotu_dispatch_table(); blas_ext::init_gemm_batch_dispatch_table(); blas_ext::init_gemm_dispatch_table(); } @@ -50,8 
+50,8 @@ PYBIND11_MODULE(_blas_impl, m) init_dispatch_tables(); { - m.def("_dotc", &blas_ext::dotc, - "Call `dotc` from OneMKL LAPACK library to return " + m.def("_dotu", &blas_ext::dotu, + "Call `dotu` from OneMKL LAPACK library to return " "the dot product of two complex-valued vectors.", py::arg("sycl_queue"), py::arg("vectorA"), py::arg("vectorB"), py::arg("result"), py::arg("depends") = py::list()); diff --git a/dpnp/backend/extensions/blas/dot.hpp b/dpnp/backend/extensions/blas/dot.hpp index 806de3997b1f..dd76f3e0f631 100644 --- a/dpnp/backend/extensions/blas/dot.hpp +++ b/dpnp/backend/extensions/blas/dot.hpp @@ -39,7 +39,7 @@ namespace ext namespace blas { extern std::pair - dotc(sycl::queue exec_q, + dotu(sycl::queue exec_q, dpctl::tensor::usm_ndarray vectorA, dpctl::tensor::usm_ndarray vectorB, dpctl::tensor::usm_ndarray result, @@ -53,7 +53,7 @@ extern std::pair const std::vector &depends); extern void init_dot_dispatch_table(void); -extern void init_dotc_dispatch_table(void); +extern void init_dotu_dispatch_table(void); } // namespace blas } // namespace ext } // namespace backend diff --git a/dpnp/backend/extensions/blas/dotc.cpp b/dpnp/backend/extensions/blas/dotu.cpp similarity index 88% rename from dpnp/backend/extensions/blas/dotc.cpp rename to dpnp/backend/extensions/blas/dotu.cpp index 0f8fd34f171a..566160db6c70 100644 --- a/dpnp/backend/extensions/blas/dotc.cpp +++ b/dpnp/backend/extensions/blas/dotu.cpp @@ -46,7 +46,7 @@ namespace mkl_blas = oneapi::mkl::blas; namespace py = pybind11; namespace type_utils = dpctl::tensor::type_utils; -typedef sycl::event (*dotc_impl_fn_ptr_t)(sycl::queue, +typedef sycl::event (*dotu_impl_fn_ptr_t)(sycl::queue, const std::int64_t, char *, const std::int64_t, @@ -55,11 +55,11 @@ typedef sycl::event (*dotc_impl_fn_ptr_t)(sycl::queue, char *, const std::vector &); -static dotc_impl_fn_ptr_t dotc_dispatch_table[dpctl_td_ns::num_types] +static dotu_impl_fn_ptr_t dotu_dispatch_table[dpctl_td_ns::num_types] [dpctl_td_ns::num_types]; template -static sycl::event dotc_impl(sycl::queue exec_q, +static sycl::event dotu_impl(sycl::queue exec_q, const std::int64_t n, char *vectorA, const std::int64_t stride_a, @@ -78,9 +78,9 @@ static sycl::event dotc_impl(sycl::queue exec_q, std::stringstream error_msg; bool is_exception_caught = false; - sycl::event dotc_event; + sycl::event dotu_event; try { - dotc_event = mkl_blas::row_major::dotc(exec_q, + dotu_event = mkl_blas::row_major::dotu(exec_q, n, // size of the input vectors a, // Pointer to vector a. stride_a, // Stride of vector a. 
@@ -90,11 +90,11 @@ static sycl::event dotc_impl(sycl::queue exec_q, depends); } catch (oneapi::mkl::exception const &e) { error_msg - << "Unexpected MKL exception caught during dotc() call:\nreason: " + << "Unexpected MKL exception caught during dotu() call:\nreason: " << e.what(); is_exception_caught = true; } catch (sycl::exception const &e) { - error_msg << "Unexpected SYCL exception caught during dotc() call:\n" + error_msg << "Unexpected SYCL exception caught during dotu() call:\n" << e.what(); is_exception_caught = true; } @@ -104,11 +104,11 @@ static sycl::event dotc_impl(sycl::queue exec_q, throw std::runtime_error(error_msg.str()); } - return dotc_event; + return dotu_event; } std::pair - dotc(sycl::queue exec_q, + dotu(sycl::queue exec_q, dpctl::tensor::usm_ndarray vectorA, dpctl::tensor::usm_ndarray vectorB, dpctl::tensor::usm_ndarray result, @@ -184,9 +184,9 @@ std::pair int vectorAB_type_id = array_types.typenum_to_lookup_id(vectorA_typenum); int result_type_id = array_types.typenum_to_lookup_id(result_typenum); - dotc_impl_fn_ptr_t dotc_fn = - dotc_dispatch_table[vectorAB_type_id][result_type_id]; - if (dotc_fn == nullptr) { + dotu_impl_fn_ptr_t dotu_fn = + dotu_dispatch_table[vectorAB_type_id][result_type_id]; + if (dotu_fn == nullptr) { throw py::value_error( "Types of input vectors and result array are mismatched."); } @@ -195,23 +195,23 @@ std::pair char *b_typeless_ptr = vectorB.get_data(); char *r_typeless_ptr = result.get_data(); - sycl::event dotc_ev = - dotc_fn(exec_q, n, a_typeless_ptr, str_a, b_typeless_ptr, str_b, + sycl::event dotu_ev = + dotu_fn(exec_q, n, a_typeless_ptr, str_a, b_typeless_ptr, str_b, r_typeless_ptr, depends); sycl::event args_ev = dpctl::utils::keep_args_alive( - exec_q, {vectorA, vectorB, result}, {dotc_ev}); + exec_q, {vectorA, vectorB, result}, {dotu_ev}); - return std::make_pair(args_ev, dotc_ev); + return std::make_pair(args_ev, dotu_ev); } template -struct DotcContigFactory +struct DotuContigFactory { fnT get() { - if constexpr (types::DotcTypePairSupportFactory::is_defined) { - return dotc_impl; + if constexpr (types::DotuTypePairSupportFactory::is_defined) { + return dotu_impl; } else { return nullptr; @@ -219,12 +219,12 @@ struct DotcContigFactory } }; -void init_dotc_dispatch_table(void) +void init_dotu_dispatch_table(void) { - dpctl_td_ns::DispatchTableBuilder contig; - contig.populate_dispatch_table(dotc_dispatch_table); + contig.populate_dispatch_table(dotu_dispatch_table); } } // namespace blas } // namespace ext diff --git a/dpnp/backend/extensions/blas/types_matrix.hpp b/dpnp/backend/extensions/blas/types_matrix.hpp index b6c690405c4e..51859884eae3 100644 --- a/dpnp/backend/extensions/blas/types_matrix.hpp +++ b/dpnp/backend/extensions/blas/types_matrix.hpp @@ -45,14 +45,14 @@ namespace types { /** * @brief A factory to define pairs of supported types for which - * MKL BLAS library provides support in oneapi::mkl::blas::dotc + * MKL BLAS library provides support in oneapi::mkl::blas::dotu * function. * * @tparam Tab Type of arrays containing input vectors A and B. * @tparam Tc Type of array containing output. 
*/ template -struct DotcTypePairSupportFactory +struct DotuTypePairSupportFactory { static constexpr bool is_defined = std::disjunction< dpctl_td_ns::TypePairDefinedEntry Date: Thu, 25 Jan 2024 16:52:35 -0600 Subject: [PATCH 03/11] fix a test --- tests/test_dot.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_dot.py b/tests/test_dot.py index 34d9d70a7270..6425e45821d9 100644 --- a/tests/test_dot.py +++ b/tests/test_dot.py @@ -315,7 +315,7 @@ def test_dot_out_error_scalar(ia): [ ((10,), (10,), ()), ((3, 4), (4, 2), (3, 2)), - ((3, 4), (4,), (3, 1)), + ((3, 4), (4,), (3,)), ((5, 4, 3), (3,), (5, 4)), ((4,), (3, 4, 2), (3, 2)), ((5, 3, 4), (6, 4, 2), (5, 3, 6, 2)), @@ -332,7 +332,7 @@ def test_dot_out_error(shape_pair): # output data type is incorrect np_out = numpy.empty(shape_out, dtype=numpy.int64) dp_out = dpnp.empty(shape_out, dtype=dpnp.int64) - with pytest.raises(ValueError): + with pytest.raises(TypeError): dpnp.dot(ia, ib, out=dp_out) with pytest.raises(ValueError): numpy.dot(a, b, out=np_out) @@ -345,7 +345,8 @@ def test_dot_out_error(shape_pair): with pytest.raises(ValueError): numpy.dot(a, b, out=np_out) - if not (a.ndim == 1 and b.ndim == 1): + # "F" or "C" is irrelevant for 0d or 1d arrays + if not (len(shape_out) in [0, 1]): # output should be C-contiguous np_out = numpy.empty(shape_out, dtype=numpy.int32, order="F") dp_out = dpnp.empty(shape_out, dtype=dpnp.int32, order="F") From a356439f7ef5fd6f46096cd4949eda35b03657a7 Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Fri, 26 Jan 2024 12:25:52 -0600 Subject: [PATCH 04/11] fix negative strides --- dpnp/backend/extensions/blas/dot.cpp | 9 +++++++++ dpnp/backend/extensions/blas/dotu.cpp | 9 +++++++++ tests/test_dot.py | 21 ++++++++++++++------- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/dpnp/backend/extensions/blas/dot.cpp b/dpnp/backend/extensions/blas/dot.cpp index 44dbfe949017..239e60cf374a 100644 --- a/dpnp/backend/extensions/blas/dot.cpp +++ b/dpnp/backend/extensions/blas/dot.cpp @@ -193,6 +193,15 @@ std::pair dot(sycl::queue exec_q, char *b_typeless_ptr = vectorB.get_data(); char *r_typeless_ptr = result.get_data(); + const int a_elemsize = vectorA.get_elemsize(); + const int b_elemsize = vectorB.get_elemsize(); + if (str_a < 0) { + a_typeless_ptr -= (n - 1) * std::abs(str_a) * a_elemsize; + } + if (str_b < 0) { + b_typeless_ptr -= (n - 1) * std::abs(str_b) * b_elemsize; + } + sycl::event dot_ev = dot_fn(exec_q, n, a_typeless_ptr, str_a, b_typeless_ptr, str_b, r_typeless_ptr, depends); diff --git a/dpnp/backend/extensions/blas/dotu.cpp b/dpnp/backend/extensions/blas/dotu.cpp index 566160db6c70..728b874f1eea 100644 --- a/dpnp/backend/extensions/blas/dotu.cpp +++ b/dpnp/backend/extensions/blas/dotu.cpp @@ -195,6 +195,15 @@ std::pair char *b_typeless_ptr = vectorB.get_data(); char *r_typeless_ptr = result.get_data(); + const int a_elemsize = vectorA.get_elemsize(); + const int b_elemsize = vectorB.get_elemsize(); + if (str_a < 0) { + a_typeless_ptr -= (n - 1) * std::abs(str_a) * a_elemsize; + } + if (str_b < 0) { + b_typeless_ptr -= (n - 1) * std::abs(str_b) * b_elemsize; + } + sycl::event dotu_ev = dotu_fn(exec_q, n, a_typeless_ptr, str_a, b_typeless_ptr, str_b, r_typeless_ptr, depends); diff --git a/tests/test_dot.py b/tests/test_dot.py index 6425e45821d9..8926eb3a1633 100644 --- a/tests/test_dot.py +++ b/tests/test_dot.py @@ -181,10 +181,10 @@ def test_dot_ndarray(dtype, array_info): assert_dtype_allclose(result, expected) -@pytest.mark.parametrize("dtype", 
get_all_dtypes()) +@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) def test_dot_strided(dtype): - a = numpy.ones(25, dtype=dtype) - b = numpy.ones(25, dtype=dtype) + a = numpy.arange(25, dtype=dtype) + b = numpy.arange(25, dtype=dtype) ia = dpnp.array(a) ib = dpnp.array(b) @@ -192,10 +192,17 @@ def test_dot_strided(dtype): expected = numpy.dot(a[::3], b[::3]) assert_dtype_allclose(result, expected) - # TODO: unmute when the problem with negative stride is fixed - # result = dpnp.dot(ia, ib[::-1]) - # expected = numpy.dot(a, b[::-1]) - # assert_dtype_allclose(result, expected) + result = dpnp.dot(ia, ib[::-1]) + expected = numpy.dot(a, b[::-1]) + assert_dtype_allclose(result, expected) + + result = dpnp.dot(ia[::-2], ib[::-2]) + expected = numpy.dot(a[::-2], b[::-2]) + assert_dtype_allclose(result, expected) + + result = dpnp.dot(ia[::-5], ib[::-5]) + expected = numpy.dot(a[::-5], b[::-5]) + assert_dtype_allclose(result, expected) @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) From d3ba4f324b18e46f6f6889be09460371a6dd90be Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Fri, 26 Jan 2024 13:45:08 -0600 Subject: [PATCH 05/11] add a temporary workaround --- tests/test_dot.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_dot.py b/tests/test_dot.py index 8926eb3a1633..86782f9c3698 100644 --- a/tests/test_dot.py +++ b/tests/test_dot.py @@ -305,12 +305,16 @@ def test_dot_out_error_scalar(ia): # output data type is incorrect dp_out = dpnp.empty((10,), dtype=dpnp.int64) - with pytest.raises(ValueError): + # TODO: change it to ValueError, when updated + # dpctl is being used in internal CI + with pytest.raises((ValueError, TypeError)): dpnp.dot(ia, ib, out=dp_out) # output shape is incorrect dp_out = dpnp.empty((2,), dtype=dpnp.int32) - with pytest.raises(ValueError): + # TODO: change it to ValueError, when updated + # dpctl is being used in internal CI + with pytest.raises((ValueError, TypeError)): dpnp.dot(ia, ib, out=dp_out) From 815ec86d2c4e9c4c919e0671451a22d309accbe3 Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Fri, 26 Jan 2024 15:43:36 -0600 Subject: [PATCH 06/11] address comments --- dpnp/dpnp_iface_linearalgebra.py | 17 +- dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 147 +++++++++--------- .../cupy/linalg_tests/test_eigenvalue.py | 25 +-- .../cupy/math_tests/test_matmul.py | 55 +++++++ 4 files changed, 142 insertions(+), 102 deletions(-) diff --git a/dpnp/dpnp_iface_linearalgebra.py b/dpnp/dpnp_iface_linearalgebra.py index 0213a23e265b..ae7b7bf58c56 100644 --- a/dpnp/dpnp_iface_linearalgebra.py +++ b/dpnp/dpnp_iface_linearalgebra.py @@ -82,19 +82,14 @@ def dot(a, b, out=None): Returns the dot product of `a` and `b`. If `out` is given, then it is returned. - Limitations - ----------- - Parameters `x1` and `x2` are supported as either scalar, :class:`dpnp.ndarray` - or :class:`dpctl.tensor.usm_ndarray`, but both `x1` and `x2` can not be scalars at the same time. - Keyword argument ``kwargs`` is currently unsupported. - Otherwise the functions will be executed sequentially on CPU. - Input array data types are limited by supported DPNP :ref:`Data types`. - See Also -------- :obj:`dpnp.ndarray.dot` : Equivalent method. :obj:`dpnp.tensordot` : Sum products over arbitrary axes. :obj:`dpnp.vdot` : Complex-conjugating dot product. + :obj:`dpnp.einsum` : Einstein summation convention. + :obj:`dpnp.matmul` : Matrix product of two arrays. + :obj:`dpnp.linalg.multi_dot` : Chained dot product. 
Examples -------- @@ -135,15 +130,19 @@ def dot(a, b, out=None): raise ValueError("Only C-contiguous array is acceptable.") if dpnp.isscalar(a) or dpnp.isscalar(b): + # TODO: investigate usage of axpy (axpy_batch) or scal + # functions from BLAS here instead of dpnp.multiply return dpnp.multiply(a, b, out=out) elif a.ndim == 0 or b.ndim == 0: + # TODO: investigate usage of axpy (axpy_batch) or scal + # functions from BLAS here instead of dpnp.multiply return dpnp.multiply(a, b, out=out) elif a.ndim == 1 and b.ndim == 1: return dpnp_dot(a, b, out=out) elif a.ndim == 2 and b.ndim == 2: # NumPy does not allow casting even if it is safe return dpnp.matmul(a, b, out=out, casting="no") - elif a.ndim > 1 and b.ndim == 1: + elif a.ndim == 1 or b.ndim == 1: # NumPy does not allow casting even if it is safe return dpnp.matmul(a, b, out=out, casting="no") else: diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 5dafff9481f1..4efc9527e54b 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -34,61 +34,34 @@ __all__ = ["dpnp_dot", "dpnp_matmul"] -def _op_res_dtype(*arrays, dtype, casting, sycl_queue): +def _copy_array(x, dep_events, host_events, contig_copy=False, dtype=None): """ - _op_res_dtype(*arrays, dtype, casting, sycl_queue) - - Determines the output array data type and an intermediate data type - used in performing calculations related to a specific math function. - If dtype is ``None``, the output array data type of the operation is - determined based on the Promotion Type Rule and device capabilities. - Otherwise, `dtype` is used as output array dtype, if input arrays - can cast to it according to the casting rule determined. If casting - cannot be done, a ``TypeError`` is raised. - The intermediate data type is the data type used for performing the math - function calculations. If output array dtype is a floating-point data type, - it is also used for the intermediate data type. If output array dtype is an - integral data type, the default floating point data type of the device where - input arrays are allocated on are used for intermediate data type. - - Parameters - ---------- - arrays : {dpnp.ndarray, usm_ndarray} - Input arrays. - dtype : dtype - If not ``None``, data type of the output array. - casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional - Controls what kind of data casting may occur. - sycl_queue : {SyclQueue} - A SYCL queue to use for determining default floating point datat type. + Creating a copy of input array if needed. - Returns - ------- - op_dtype, res_dtype : - `op_dtype` is the data type used in performing math function calculations. - The input arrays of the math function are cast to `op_dtype` and then - the calculations are performed. - `res_dtype` is the output data type. When the result is obtained, it is cast - to `res_dtype`. + If `contig_copy` is ``True``, a C-contiguous copy of input array is returned. + In this case, the copy array has the input array data type unless `dtype` is + determined. + If `contig_copy` is ``False`` and input array data type is different than `dtype`, + a C-contiguous copy of input array with specified `dtype` is returned. 
""" - res_dtype = dpnp.result_type(*arrays) - default_dtype = dpnp.default_float_type(sycl_queue=sycl_queue) - - if dtype is not None: - if dpnp.can_cast(res_dtype, dtype, casting=casting): - res_dtype = dtype - else: - raise TypeError( - f"Cannot cast ufunc 'matmul' output from dtype({res_dtype}) to dtype({dtype}) with casting rule {casting}" - ) - - op_dtype = ( - res_dtype if dpnp.issubdtype(res_dtype, dpnp.inexact) else default_dtype - ) + if contig_copy: + copy = contig_copy + else: + copy = x.dtype != dtype if dtype is not None else False - return op_dtype, res_dtype + if copy: + x_copy = dpnp.empty_like(x, dtype=dtype, order="C") + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=dpnp.get_usm_ndarray(x), + dst=x_copy.get_array(), + sycl_queue=x.sycl_queue, + ) + dep_events.append(copy_ev) + host_events.append(ht_copy_ev) + return x_copy + return x def _gemm_batch_matmul(exec_q, x1, x2, res, x1_is_2D, x2_is_2D, dev_tasks_list): @@ -153,34 +126,61 @@ def _gemm_batch_matmul(exec_q, x1, x2, res, x1_is_2D, x2_is_2D, dev_tasks_list): return ht_blas_ev, ht_tasks_list, res -def _copy_array(x, dep_events, host_events, contig_copy=False, dtype=None): +def _op_res_dtype(*arrays, dtype, casting, sycl_queue): """ - Creating a copy of input array if needed. + _op_res_dtype(*arrays, dtype, casting, sycl_queue) - If `contig_copy` is ``True``, a C-contiguous copy of input array is returned. - In this case, the copy array has the input array data type unless `dtype` is - determined. - If `contig_copy` is ``False`` and input array data type is different than `dtype`, - a C-contiguous copy of input array with specified `dtype` is returned. + Determines the output array data type and an intermediate data type + used in performing calculations related to a specific math function. + If dtype is ``None``, the output array data type of the operation is + determined based on the Promotion Type Rule and device capabilities. + Otherwise, `dtype` is used as output array dtype, if input arrays + can cast to it according to the casting rule determined. If casting + cannot be done, a ``TypeError`` is raised. + The intermediate data type is the data type used for performing the math + function calculations. If output array dtype is a floating-point data type, + it is also used for the intermediate data type. If output array dtype is an + integral data type, the default floating point data type of the device where + input arrays are allocated on are used for intermediate data type. + + Parameters + ---------- + arrays : {dpnp.ndarray, usm_ndarray} + Input arrays. + dtype : dtype + If not ``None``, data type of the output array. + casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional + Controls what kind of data casting may occur. + sycl_queue : {SyclQueue} + A SYCL queue to use for determining default floating point datat type. + + Returns + ------- + op_dtype, res_dtype : + `op_dtype` is the data type used in performing math function calculations. + The input arrays of the math function are cast to `op_dtype` and then + the calculations are performed. + `res_dtype` is the output data type. When the result is obtained, it is cast + to `res_dtype`. 
""" - if contig_copy: - copy = contig_copy - else: - copy = x.dtype != dtype if dtype is not None else False + res_dtype = dpnp.result_type(*arrays) + default_dtype = dpnp.default_float_type(sycl_queue=sycl_queue) - if copy: - x_copy = dpnp.empty_like(x, dtype=dtype, order="C") - ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=dpnp.get_usm_ndarray(x), - dst=x_copy.get_array(), - sycl_queue=x.sycl_queue, - ) - dep_events.append(copy_ev) - host_events.append(ht_copy_ev) - return x_copy - return x + if dtype is not None: + if dpnp.can_cast(res_dtype, dtype, casting=casting): + res_dtype = dtype + else: + raise TypeError( + f"Cannot cast ufunc 'matmul' output from dtype({res_dtype}) to dtype({dtype}) with casting rule {casting}" + ) + + op_dtype = ( + res_dtype if dpnp.issubdtype(res_dtype, dpnp.inexact) else default_dtype + ) + + return op_dtype, res_dtype def dpnp_dot( @@ -394,6 +394,11 @@ def dpnp_matmul( dtype=gemm_dtype, ) + # TODO: investigate usage of gemv (gemv_batch) function + # from BLAS when one of the inputs is a vector to + # gain performance. + # TODO: investigate usage of syrk function from BLAS in + # case of a.T @ a and a @ a.T to gain performance. if x1_is_2D and x2_is_2D: ht_blas_ev, _ = bi._gemm( exec_q, diff --git a/tests/third_party/cupy/linalg_tests/test_eigenvalue.py b/tests/third_party/cupy/linalg_tests/test_eigenvalue.py index 99dcfb2127c8..b620bd39e984 100644 --- a/tests/third_party/cupy/linalg_tests/test_eigenvalue.py +++ b/tests/third_party/cupy/linalg_tests/test_eigenvalue.py @@ -15,12 +15,6 @@ def _get_hermitian(xp, a, UPLO): return xp.tril(a) + xp.tril(a, k=-1).swapaxes(-2, -1).conj() -# TODO: -# remove once dpnp.dot and dpnp.matmul support complex types -def _wrap_as_numpy_array(xp, a): - return a.asnumpy() if xp is cupy else a - - @testing.parameterize( *testing.product( { @@ -57,20 +51,12 @@ def test_eigh(self, xp, dtype): else: tol = 1e-5 - # TODO: remove _wrap_as_numpy_array() once @ support complex types - testing.assert_allclose( - _wrap_as_numpy_array(xp, A) @ _wrap_as_numpy_array(xp, v), - _wrap_as_numpy_array(xp, v) - @ numpy.diag(_wrap_as_numpy_array(xp, w)), - atol=tol, - rtol=tol, - ) + testing.assert_allclose(A @ v, v @ xp.diag(w), atol=tol, rtol=tol) # Check if v @ vt is an identity matrix testing.assert_allclose( - _wrap_as_numpy_array(xp, v) - @ _wrap_as_numpy_array(xp, v).swapaxes(-2, -1).conj(), - numpy.identity(_wrap_as_numpy_array(xp, A).shape[-1], _dtype), + v @ v.swapaxes(-2, -1).conj(), + xp.identity(A.shape[-1], _dtype), atol=tol, rtol=tol, ) @@ -121,11 +107,6 @@ def test_eigh_complex_batched(self, xp, dtype): # them through the eigen equation A*v=w*v. 
A = _get_hermitian(xp, a, self.UPLO) - # TODO: remove _wrap_as_numpy_array() once dpnp.dot() support complex types - A = _wrap_as_numpy_array(xp, A) - v = _wrap_as_numpy_array(xp, v) - w = _wrap_as_numpy_array(xp, w) - for i in range(a.shape[0]): testing.assert_allclose( A[i].dot(v[i]), w[i] * v[i], rtol=1e-5, atol=1e-5 diff --git a/tests/third_party/cupy/math_tests/test_matmul.py b/tests/third_party/cupy/math_tests/test_matmul.py index d21ec7a2d686..887ed9ae1b98 100644 --- a/tests/third_party/cupy/math_tests/test_matmul.py +++ b/tests/third_party/cupy/math_tests/test_matmul.py @@ -73,6 +73,61 @@ def test_cupy_matmul(self, xp, dtype1): return xp.matmul(x1, x2) +@testing.parameterize( + *testing.product( + { + "shape_pair": [ + # dot test + ((2, 3), (3, 4), (2, 4)), + # ((0,), (0,), (0,)), + # matmul test + ((5, 3, 2), (5, 2, 4), (5, 3, 4)), + ((0, 3, 2), (0, 2, 4), (0, 3, 4)), + ], + } + ) +) +class TestMatmulOut(unittest.TestCase): + @testing.for_all_dtypes(name="dtype1") + @testing.for_all_dtypes(name="dtype2") + @testing.numpy_cupy_allclose( + rtol=1e-3, atol=1e-3, accept_error=TypeError # required for uint8 + ) + def test_cupy_matmul_noncontiguous(self, xp, dtype1, dtype2): + x1 = testing.shaped_arange(self.shape_pair[0], xp, dtype1) + x2 = testing.shaped_arange(self.shape_pair[1], xp, dtype2) + out = xp.zeros(self.shape_pair[2], dtype=dtype1)[::-1] + ret = xp.matmul(x1, x2, out=out) + assert ret is out + return ret + + @testing.for_all_dtypes(name="dtype1") + @testing.for_all_dtypes(name="dtype2") + @testing.numpy_cupy_allclose(rtol=1e-3, atol=1e-3) # required for uint8 + def test_cupy_matmul_out_cast(self, xp, dtype1, dtype2): + x1 = testing.shaped_arange(self.shape_pair[0], xp, dtype1) + x2 = testing.shaped_arange(self.shape_pair[1], xp, dtype2) + out = xp.zeros(self.shape_pair[2], dtype=bool) + ret = xp.matmul(x1, x2, out=out, casting="unsafe") + assert ret is out + return ret + + +class TestMatmulOutOverlap: + @pytest.mark.parametrize( + "shape", + [ + (900, 900), + (2, 600, 600), + ], + ) + @testing.for_dtypes([numpy.int32, numpy.float64]) + @testing.numpy_cupy_allclose(rtol=1e-5, atol=1e-5) + def test_overlap_both(self, xp, dtype, shape): + a = xp.ones(shape, dtype=dtype) + return xp.matmul(a, a, out=a) + + @testing.parameterize( *testing.product( { From 44a7725aa0d2d015fad56dcdae5760c304b91d58 Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Mon, 29 Jan 2024 13:20:59 -0600 Subject: [PATCH 07/11] add a TODO comment --- dpnp/dpnp_iface_linearalgebra.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dpnp/dpnp_iface_linearalgebra.py b/dpnp/dpnp_iface_linearalgebra.py index ae7b7bf58c56..9d63f7f8c3d2 100644 --- a/dpnp/dpnp_iface_linearalgebra.py +++ b/dpnp/dpnp_iface_linearalgebra.py @@ -146,6 +146,8 @@ def dot(a, b, out=None): # NumPy does not allow casting even if it is safe return dpnp.matmul(a, b, out=out, casting="no") else: + # TODO: investigate usage of matmul for some possible + # use cases instead of dpnp.tensordot result = dpnp.tensordot(a, b, axes=(-1, -2)) # NumPy does not allow casting even if it is safe return dpnp.get_result_array(result, out, casting="no") From b7081d43859ae13de4b9a67616d3ecf0b2cd586c Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Tue, 30 Jan 2024 12:37:24 -0600 Subject: [PATCH 08/11] call dpt.vecdot for integer data types --- dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 49 ++++++++++++--------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py 
b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 4efc9527e54b..9a488c696bbf 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -24,11 +24,13 @@ # ***************************************************************************** import dpctl +import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti import numpy import dpnp import dpnp.backend.extensions.blas._blas_impl as bi +from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import get_usm_allocations __all__ = ["dpnp_dot", "dpnp_matmul"] @@ -210,7 +212,7 @@ def dpnp_dot( res_usm_type, exec_q = get_usm_allocations([a, b]) # Determine the appropriate data types - # casting is irrelevant here since dtype is `None`` + # casting is irrelevant here since dtype is `None` dot_dtype, res_dtype = _op_res_dtype( a, b, dtype=None, casting="no", sycl_queue=exec_q ) @@ -226,27 +228,32 @@ def dpnp_dot( # input arrays should have the proper data type dep_events_list = [] host_tasks_list = [] - a = _copy_array(a, dep_events_list, host_tasks_list, dtype=dot_dtype) - b = _copy_array(b, dep_events_list, host_tasks_list, dtype=dot_dtype) - - if dpnp.issubdtype(dot_dtype, dpnp.complexfloating): - ht_blas_ev, _ = bi._dotu( - exec_q, - dpnp.get_usm_ndarray(a), - dpnp.get_usm_ndarray(b), - dpnp.get_usm_ndarray(result), - dep_events_list, - ) + if dpnp.issubdtype(res_dtype, dpnp.inexact): + # copying is needed if dtypes of input arrays are different + a = _copy_array(a, dep_events_list, host_tasks_list, dtype=dot_dtype) + b = _copy_array(b, dep_events_list, host_tasks_list, dtype=dot_dtype) + if dpnp.issubdtype(res_dtype, dpnp.complexfloating): + ht_ev, _ = bi._dotu( + exec_q, + dpnp.get_usm_ndarray(a), + dpnp.get_usm_ndarray(b), + dpnp.get_usm_ndarray(result), + dep_events_list, + ) + else: + ht_ev, _ = bi._dot( + exec_q, + dpnp.get_usm_ndarray(a), + dpnp.get_usm_ndarray(b), + dpnp.get_usm_ndarray(result), + dep_events_list, + ) + host_tasks_list.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks_list) else: - ht_blas_ev, _ = bi._dot( - exec_q, - dpnp.get_usm_ndarray(a), - dpnp.get_usm_ndarray(b), - dpnp.get_usm_ndarray(result), - dep_events_list, - ) - host_tasks_list.append(ht_blas_ev) - dpctl.SyclEvent.wait_for(host_tasks_list) + dpt_a = dpnp.get_usm_ndarray(a) + dpt_b = dpnp.get_usm_ndarray(b) + result = dpnp_array._create_from_usm_ndarray(dpt.vecdot(dpt_a, dpt_b)) if dot_dtype != res_dtype: result = result.astype(res_dtype, copy=False) From 3d4b957e5294e56b7050da7319d1a92a767403f2 Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Thu, 1 Feb 2024 17:03:01 -0600 Subject: [PATCH 09/11] update doc string --- dpnp/backend/extensions/blas/blas_py.cpp | 12 +- dpnp/backend/extensions/blas/dot.hpp | 14 +- dpnp/backend/extensions/blas/types_matrix.hpp | 30 +- dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 21 +- tests/test_dot.py | 684 +++++++++--------- 5 files changed, 371 insertions(+), 390 deletions(-) diff --git a/dpnp/backend/extensions/blas/blas_py.cpp b/dpnp/backend/extensions/blas/blas_py.cpp index 893e240ef251..7d5237381b1f 100644 --- a/dpnp/backend/extensions/blas/blas_py.cpp +++ b/dpnp/backend/extensions/blas/blas_py.cpp @@ -50,17 +50,17 @@ PYBIND11_MODULE(_blas_impl, m) init_dispatch_tables(); { - m.def("_dotu", &blas_ext::dotu, - "Call `dotu` from OneMKL LAPACK library to return " - "the dot product of two complex-valued vectors.", + m.def("_dot", &blas_ext::dot, + "Call `dot` from OneMKL BLAS library to return " + "the dot product of two real-valued vectors.",
py::arg("sycl_queue"), py::arg("vectorA"), py::arg("vectorB"), py::arg("result"), py::arg("depends") = py::list()); } { - m.def("_dot", &blas_ext::dot, - "Call `dot` from OneMKL LAPACK library to return " - "the dot product of two real-valued vectors.", + m.def("_dotu", &blas_ext::dotu, + "Call `dotu` from OneMKL LAPACK library to return " + "the dot product of two complex vectors.", py::arg("sycl_queue"), py::arg("vectorA"), py::arg("vectorB"), py::arg("result"), py::arg("depends") = py::list()); } diff --git a/dpnp/backend/extensions/blas/dot.hpp b/dpnp/backend/extensions/blas/dot.hpp index dd76f3e0f631..f0d79dfa6478 100644 --- a/dpnp/backend/extensions/blas/dot.hpp +++ b/dpnp/backend/extensions/blas/dot.hpp @@ -38,13 +38,6 @@ namespace ext { namespace blas { -extern std::pair - dotu(sycl::queue exec_q, - dpctl::tensor::usm_ndarray vectorA, - dpctl::tensor::usm_ndarray vectorB, - dpctl::tensor::usm_ndarray result, - const std::vector &depends); - extern std::pair dot(sycl::queue exec_q, dpctl::tensor::usm_ndarray vectorA, @@ -52,6 +45,13 @@ extern std::pair dpctl::tensor::usm_ndarray result, const std::vector &depends); +extern std::pair + dotu(sycl::queue exec_q, + dpctl::tensor::usm_ndarray vectorA, + dpctl::tensor::usm_ndarray vectorB, + dpctl::tensor::usm_ndarray result, + const std::vector &depends); + extern void init_dot_dispatch_table(void); extern void init_dotu_dispatch_table(void); } // namespace blas diff --git a/dpnp/backend/extensions/blas/types_matrix.hpp b/dpnp/backend/extensions/blas/types_matrix.hpp index 51859884eae3..c36ae0e20452 100644 --- a/dpnp/backend/extensions/blas/types_matrix.hpp +++ b/dpnp/backend/extensions/blas/types_matrix.hpp @@ -45,43 +45,43 @@ namespace types { /** * @brief A factory to define pairs of supported types for which - * MKL BLAS library provides support in oneapi::mkl::blas::dotu + * MKL BLAS library provides support in oneapi::mkl::blas::dot * function. * * @tparam Tab Type of arrays containing input vectors A and B. * @tparam Tc Type of array containing output. */ template -struct DotuTypePairSupportFactory +struct DotTypePairSupportFactory { static constexpr bool is_defined = std::disjunction< - dpctl_td_ns::TypePairDefinedEntry, - Tc, - std::complex>, - dpctl_td_ns::TypePairDefinedEntry, - Tc, - std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, // fall-through dpctl_td_ns::NotDefinedEntry>::is_defined; }; /** * @brief A factory to define pairs of supported types for which - * MKL BLAS library provides support in oneapi::mkl::blas::dot + * MKL BLAS library provides support in oneapi::mkl::blas::dotu * function. * * @tparam Tab Type of arrays containing input vectors A and B. * @tparam Tc Type of array containing output. 
*/ template -struct DotTypePairSupportFactory +struct DotuTypePairSupportFactory { static constexpr bool is_defined = std::disjunction< - dpctl_td_ns::TypePairDefinedEntry, - dpctl_td_ns::TypePairDefinedEntry, - dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + Tc, + std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + Tc, + std::complex>, // fall-through dpctl_td_ns::NotDefinedEntry>::is_defined; }; diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 9a488c696bbf..4895e5e91284 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -185,21 +185,16 @@ def _op_res_dtype(*arrays, dtype, casting, sycl_queue): return op_dtype, res_dtype -def dpnp_dot( - a, - b, - /, - out=None, -): +def dpnp_dot(a, b, /, out=None): """ - dpnp_dot(a, b, out=None) - Return the dot product of two arrays. - The main calculation is done by calling an extension function - for BLAS library of OneMKL. Depending on the data type of `a` and `b` arrays, - it will be either ``dot`` (for real-valued data types) or - ``dotc``(for complex-valued data types). + The routine used to perform the main calculation + depends on the input array data types: 1) For integer data types, + `dpctl.tensor.vecdot` from the Data Parallel Control library is used, + 2) For real-valued floating point data types, the `dot` routine from + the BLAS library of OneMKL is used, and 3) For complex data types, + the `dotu` routine from the BLAS library of OneMKL is used. """ @@ -273,8 +268,6 @@ def dpnp_matmul( dtype=None, ): """ - dpnp_matmul(x1, x2, out=None, casting="same_kind", order="K", dtype=None) - Return the matrix product of two arrays. The main calculation is done by calling an extension function diff --git a/tests/test_dot.py b/tests/test_dot.py index 86782f9c3698..55884b00cd3b 100644 --- a/tests/test_dot.py +++ b/tests/test_dot.py @@ -8,364 +8,352 @@ from .helper import assert_dtype_allclose, get_all_dtypes, get_complex_dtypes -@pytest.mark.parametrize("dtype", get_all_dtypes()) -def test_dot_ones(dtype): - n = 10**5 - a = numpy.ones(n, dtype=dtype) - b = numpy.ones(n, dtype=dtype) - ia = dpnp.array(a) - ib = dpnp.array(b) - - result = dpnp.dot(ia, ib) - expected = numpy.dot(a, b) - assert_dtype_allclose(result, expected) - - -@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) -def test_dot_arange(dtype): - n = 10**2 - m = 10**3 if dtype is not dpnp.float32 else 10**2 - a = numpy.hstack((numpy.arange(n, dtype=dtype),) * m) - b = numpy.flipud(a) - ia = dpnp.array(a) - ib = dpnp.array(b) - - result = dpnp.dot(ia, ib) - expected = numpy.dot(a, b) - assert_dtype_allclose(result, expected) - - -@pytest.mark.parametrize("dtype", get_all_dtypes()) -def test_dot_scalar(dtype): - a = 2 - b = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype) - ib = dpnp.array(b) - - result = dpnp.dot(a, ib) - expected = numpy.dot(a, b) - assert_allclose(result, expected) - - -# TODO: get rid of falls back on NumPy when tensordot -# is implemented using OneMKL -@pytest.mark.usefixtures("allow_fall_back_on_numpy") -@pytest.mark.parametrize("dtype", get_all_dtypes(no_complex=True)) -@pytest.mark.parametrize( - "array_info", - [ - (1, 10, (), (10,)), - (10, 1, (10,), ()), - (1, 1, (), ()), - (10, 10, (10,), (10,)), - (12, 6, (4, 3), (3, 2)), - (12, 3, (4, 3), (3,)), - (60, 3, (5, 4, 3), (3,)), - (4, 8, (4,), (4, 2)), - (60, 48, (5, 3, 4), (6, 4, 2)), - ], - ids=[ - "0d_1d", - "1d_0d", - "0d_0d", - "1d_1d", - "2d_2d", - 
"2d_1d", - "3d_1d", - "1d_2d", - "3d_3d", - ], -) -def test_dot(dtype, array_info): - size1, size2, shape1, shape2 = array_info - a = numpy.array(numpy.random.uniform(-5, 5, size1), dtype=dtype).reshape( - shape1 +class Testdot: + @pytest.mark.parametrize("dtype", get_all_dtypes()) + def test_dot_ones(self, dtype): + n = 10**5 + a = numpy.ones(n, dtype=dtype) + b = numpy.ones(n, dtype=dtype) + ia = dpnp.array(a) + ib = dpnp.array(b) + + result = dpnp.dot(ia, ib) + expected = numpy.dot(a, b) + assert_dtype_allclose(result, expected) + + @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) + def test_dot_arange(self, dtype): + n = 10**2 + m = 10**3 if dtype is not dpnp.float32 else 10**2 + a = numpy.hstack((numpy.arange(n, dtype=dtype),) * m) + b = numpy.flipud(a) + ia = dpnp.array(a) + ib = dpnp.array(b) + + result = dpnp.dot(ia, ib) + expected = numpy.dot(a, b) + assert_dtype_allclose(result, expected) + + @pytest.mark.parametrize("dtype", get_all_dtypes()) + def test_dot_scalar(self, dtype): + a = 2 + b = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype) + ib = dpnp.array(b) + + result = dpnp.dot(a, ib) + expected = numpy.dot(a, b) + assert_allclose(result, expected) + + # TODO: get rid of falls back on NumPy when tensordot + # is implemented using OneMKL + @pytest.mark.usefixtures("allow_fall_back_on_numpy") + @pytest.mark.parametrize("dtype", get_all_dtypes(no_complex=True)) + @pytest.mark.parametrize( + "array_info", + [ + (1, 10, (), (10,)), + (10, 1, (10,), ()), + (1, 1, (), ()), + (10, 10, (10,), (10,)), + (12, 6, (4, 3), (3, 2)), + (12, 3, (4, 3), (3,)), + (60, 3, (5, 4, 3), (3,)), + (4, 8, (4,), (4, 2)), + (60, 48, (5, 3, 4), (6, 4, 2)), + ], + ids=[ + "0d_1d", + "1d_0d", + "0d_0d", + "1d_1d", + "2d_2d", + "2d_1d", + "3d_1d", + "1d_2d", + "3d_3d", + ], ) - b = numpy.array(numpy.random.uniform(-5, 5, size2), dtype=dtype).reshape( - shape2 + def test_dot(self, dtype, array_info): + size1, size2, shape1, shape2 = array_info + a = numpy.array( + numpy.random.uniform(-5, 5, size1), dtype=dtype + ).reshape(shape1) + b = numpy.array( + numpy.random.uniform(-5, 5, size2), dtype=dtype + ).reshape(shape2) + ia = dpnp.array(a) + ib = dpnp.array(b) + + result = dpnp.dot(ia, ib) + expected = numpy.dot(a, b) + assert_dtype_allclose(result, expected) + + # TODO: get rid of falls back on NumPy when tensordot + # is implemented using OneMKL + @pytest.mark.usefixtures("allow_fall_back_on_numpy") + @pytest.mark.parametrize("dtype", get_complex_dtypes()) + @pytest.mark.parametrize( + "array_info", + [ + (1, 10, (), (10,)), + (10, 1, (10,), ()), + (1, 1, (), ()), + (10, 10, (10,), (10,)), + (12, 6, (4, 3), (3, 2)), + (12, 3, (4, 3), (3,)), + (60, 3, (5, 4, 3), (3,)), + (4, 8, (4,), (4, 2)), + (60, 48, (5, 3, 4), (6, 4, 2)), + ], + ids=[ + "0d_1d", + "1d_0d", + "0d_0d", + "1d_1d", + "2d_2d", + "2d_1d", + "3d_1d", + "1d_2d", + "3d_3d", + ], ) - ia = dpnp.array(a) - ib = dpnp.array(b) - - result = dpnp.dot(ia, ib) - expected = numpy.dot(a, b) - assert_dtype_allclose(result, expected) - - -# TODO: get rid of falls back on NumPy when tensordot -# is implemented using OneMKL -@pytest.mark.usefixtures("allow_fall_back_on_numpy") -@pytest.mark.parametrize("dtype", get_complex_dtypes()) -@pytest.mark.parametrize( - "array_info", - [ - (1, 10, (), (10,)), - (10, 1, (10,), ()), - (1, 1, (), ()), - (10, 10, (10,), (10,)), - (12, 6, (4, 3), (3, 2)), - (12, 3, (4, 3), (3,)), - (60, 3, (5, 4, 3), (3,)), - (4, 8, (4,), (4, 2)), - (60, 48, (5, 3, 4), (6, 4, 2)), - ], - ids=[ - "0d_1d", - "1d_0d", - 
"0d_0d", - "1d_1d", - "2d_2d", - "2d_1d", - "3d_1d", - "1d_2d", - "3d_3d", - ], -) -def test_dot_complex(dtype, array_info): - size1, size2, shape1, shape2 = array_info - x11 = numpy.random.uniform(-5, 5, size1) - x12 = numpy.random.uniform(-5, 5, size1) - x21 = numpy.random.uniform(-5, 5, size2) - x22 = numpy.random.uniform(-5, 5, size2) - a = numpy.array(x11 + 1j * x12, dtype=dtype).reshape(shape1) - b = numpy.array(x21 + 1j * x22, dtype=dtype).reshape(shape2) - ia = dpnp.array(a) - ib = dpnp.array(b) - - result = dpnp.dot(ia, ib) - expected = numpy.dot(a, b) - assert_dtype_allclose(result, expected) - - -# TODO: get rid of falls back on NumPy when tensordot -# is implemented using OneMKL -@pytest.mark.usefixtures("allow_fall_back_on_numpy") -@pytest.mark.parametrize("dtype", get_all_dtypes()) -@pytest.mark.parametrize( - "array_info", - [ - (1, 10, (), (10,)), - (10, 1, (10,), ()), - (1, 1, (), ()), - (10, 10, (10,), (10,)), - (12, 6, (4, 3), (3, 2)), - (12, 3, (4, 3), (3,)), - (60, 3, (5, 4, 3), (3,)), - (4, 8, (4,), (4, 2)), - (60, 48, (5, 3, 4), (6, 4, 2)), - ], - ids=[ - "0d_1d", - "1d_0d", - "0d_0d", - "1d_1d", - "2d_2d", - "2d_1d", - "3d_1d", - "1d_2d", - "3d_3d", - ], -) -def test_dot_ndarray(dtype, array_info): - size1, size2, shape1, shape2 = array_info - a = numpy.array(numpy.random.uniform(-5, 5, size1), dtype=dtype).reshape( - shape1 + def test_dot_complex(self, dtype, array_info): + size1, size2, shape1, shape2 = array_info + x11 = numpy.random.uniform(-5, 5, size1) + x12 = numpy.random.uniform(-5, 5, size1) + x21 = numpy.random.uniform(-5, 5, size2) + x22 = numpy.random.uniform(-5, 5, size2) + a = numpy.array(x11 + 1j * x12, dtype=dtype).reshape(shape1) + b = numpy.array(x21 + 1j * x22, dtype=dtype).reshape(shape2) + ia = dpnp.array(a) + ib = dpnp.array(b) + + result = dpnp.dot(ia, ib) + expected = numpy.dot(a, b) + assert_dtype_allclose(result, expected) + + # TODO: get rid of falls back on NumPy when tensordot + # is implemented using OneMKL + @pytest.mark.usefixtures("allow_fall_back_on_numpy") + @pytest.mark.parametrize("dtype", get_all_dtypes()) + @pytest.mark.parametrize( + "array_info", + [ + (1, 10, (), (10,)), + (10, 1, (10,), ()), + (1, 1, (), ()), + (10, 10, (10,), (10,)), + (12, 6, (4, 3), (3, 2)), + (12, 3, (4, 3), (3,)), + (60, 3, (5, 4, 3), (3,)), + (4, 8, (4,), (4, 2)), + (60, 48, (5, 3, 4), (6, 4, 2)), + ], + ids=[ + "0d_1d", + "1d_0d", + "0d_0d", + "1d_1d", + "2d_2d", + "2d_1d", + "3d_1d", + "1d_2d", + "3d_3d", + ], ) - b = numpy.array(numpy.random.uniform(-5, 5, size2), dtype=dtype).reshape( - shape2 + def test_dot_ndarray(self, dtype, array_info): + size1, size2, shape1, shape2 = array_info + a = numpy.array( + numpy.random.uniform(-5, 5, size1), dtype=dtype + ).reshape(shape1) + b = numpy.array( + numpy.random.uniform(-5, 5, size2), dtype=dtype + ).reshape(shape2) + ia = dpnp.array(a) + ib = dpnp.array(b) + + result = ia.dot(ib) + expected = a.dot(b) + assert_dtype_allclose(result, expected) + + @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) + def test_dot_strided(self, dtype): + a = numpy.arange(25, dtype=dtype) + b = numpy.arange(25, dtype=dtype) + ia = dpnp.array(a) + ib = dpnp.array(b) + + result = dpnp.dot(ia[::3], ib[::3]) + expected = numpy.dot(a[::3], b[::3]) + assert_dtype_allclose(result, expected) + + result = dpnp.dot(ia, ib[::-1]) + expected = numpy.dot(a, b[::-1]) + assert_dtype_allclose(result, expected) + + result = dpnp.dot(ia[::-2], ib[::-2]) + expected = numpy.dot(a[::-2], b[::-2]) + assert_dtype_allclose(result, 
expected) + + result = dpnp.dot(ia[::-5], ib[::-5]) + expected = numpy.dot(a[::-5], b[::-5]) + assert_dtype_allclose(result, expected) + + @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) + def test_dot_out_scalar(self, dtype): + size = 10 + a = 2 + b = numpy.array(numpy.random.uniform(-5, 5, size), dtype=dtype) + ia = 2 + ib = dpnp.array(b) + + dp_out = dpnp.empty((size,), dtype=dtype) + result = dpnp.dot(ia, ib, out=dp_out) + expected = numpy.dot(a, b) + + assert result is dp_out + assert_allclose(result, expected) + + # TODO: get rid of falls back on NumPy when tensordot + # is implemented using OneMKL + @pytest.mark.usefixtures("allow_fall_back_on_numpy") + @pytest.mark.parametrize("dtype", get_all_dtypes()) + @pytest.mark.parametrize( + "array_info", + [ + (1, 10, (), (10,), (10,)), + (10, 1, (10,), (), (10,)), + (1, 1, (), (), ()), + (10, 10, (10,), (10,), ()), + (12, 6, (4, 3), (3, 2), (4, 2)), + (12, 3, (4, 3), (3,), (4,)), + (60, 3, (5, 4, 3), (3,), (5, 4)), + (4, 8, (4,), (4, 2), (2,)), + (60, 48, (5, 3, 4), (6, 4, 2), (5, 3, 6, 2)), + ], + ids=[ + "0d_1d", + "1d_0d", + "0d_0d", + "1d_1d", + "2d_2d", + "2d_1d", + "3d_1d", + "1d_2d", + "3d_3d", + ], ) - ia = dpnp.array(a) - ib = dpnp.array(b) - - result = ia.dot(ib) - expected = a.dot(b) - assert_dtype_allclose(result, expected) - - -@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) -def test_dot_strided(dtype): - a = numpy.arange(25, dtype=dtype) - b = numpy.arange(25, dtype=dtype) - ia = dpnp.array(a) - ib = dpnp.array(b) - - result = dpnp.dot(ia[::3], ib[::3]) - expected = numpy.dot(a[::3], b[::3]) - assert_dtype_allclose(result, expected) - - result = dpnp.dot(ia, ib[::-1]) - expected = numpy.dot(a, b[::-1]) - assert_dtype_allclose(result, expected) - - result = dpnp.dot(ia[::-2], ib[::-2]) - expected = numpy.dot(a[::-2], b[::-2]) - assert_dtype_allclose(result, expected) - - result = dpnp.dot(ia[::-5], ib[::-5]) - expected = numpy.dot(a[::-5], b[::-5]) - assert_dtype_allclose(result, expected) - - -@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) -def test_dot_out_scalar(dtype): - size = 10 - a = 2 - b = numpy.array(numpy.random.uniform(-5, 5, size), dtype=dtype) - ia = 2 - ib = dpnp.array(b) - - dp_out = dpnp.empty((size,), dtype=dtype) - result = dpnp.dot(ia, ib, out=dp_out) - expected = numpy.dot(a, b) - - assert result is dp_out - assert_allclose(result, expected) - - -# TODO: get rid of falls back on NumPy when tensordot -# is implemented using OneMKL -@pytest.mark.usefixtures("allow_fall_back_on_numpy") -@pytest.mark.parametrize("dtype", get_all_dtypes()) -@pytest.mark.parametrize( - "array_info", - [ - (1, 10, (), (10,), (10,)), - (10, 1, (10,), (), (10,)), - (1, 1, (), (), ()), - (10, 10, (10,), (10,), ()), - (12, 6, (4, 3), (3, 2), (4, 2)), - (12, 3, (4, 3), (3,), (4,)), - (60, 3, (5, 4, 3), (3,), (5, 4)), - (4, 8, (4,), (4, 2), (2,)), - (60, 48, (5, 3, 4), (6, 4, 2), (5, 3, 6, 2)), - ], - ids=[ - "0d_1d", - "1d_0d", - "0d_0d", - "1d_1d", - "2d_2d", - "2d_1d", - "3d_1d", - "1d_2d", - "3d_3d", - ], -) -def test_dot_out(dtype, array_info): - size1, size2, shape1, shape2, out_shape = array_info - a = numpy.array(numpy.random.uniform(-5, 5, size1), dtype=dtype).reshape( - shape1 - ) - b = numpy.array(numpy.random.uniform(-5, 5, size2), dtype=dtype).reshape( - shape2 + def test_dot_out(self, dtype, array_info): + size1, size2, shape1, shape2, out_shape = array_info + a = numpy.array( + numpy.random.uniform(-5, 5, size1), dtype=dtype + ).reshape(shape1) + b = numpy.array( + 
numpy.random.uniform(-5, 5, size2), dtype=dtype + ).reshape(shape2) + ia = dpnp.array(a) + ib = dpnp.array(b) + + dp_out = dpnp.empty(out_shape, dtype=dtype) + result = dpnp.dot(ia, ib, out=dp_out) + expected = numpy.dot(a, b) + + assert result is dp_out + assert_dtype_allclose(result, expected) + + @pytest.mark.parametrize("dtype1", get_all_dtypes()) + @pytest.mark.parametrize("dtype2", get_all_dtypes()) + def test_dot_input_dtype_matrix(self, dtype1, dtype2): + a = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype1) + b = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype2) + ia = dpnp.array(a) + ib = dpnp.array(b) + + result = dpnp.dot(ia, ib) + expected = numpy.dot(a, b) + assert_dtype_allclose(result, expected) + + def test_dot_1d_error(self): + a = dpnp.ones(25) + b = dpnp.ones(24) + # size of input arrays differ + with pytest.raises(ValueError): + dpnp.dot(a, b) + + def test_dot_sycl_queue_error(self): + a = dpnp.ones((5,), sycl_queue=dpctl.SyclQueue()) + b = dpnp.ones((5,), sycl_queue=dpctl.SyclQueue()) + with pytest.raises(ValueError): + dpnp.dot(a, b) + + # NumPy does not raise an error for the following test. + # it just does not update the out keyword if it as not properly defined + @pytest.mark.parametrize("ia", [1, dpnp.ones((), dtype=dpnp.int32)]) + def test_dot_out_error_scalar(self, ia): + ib = dpnp.ones(10, dtype=dpnp.int32) + + # output data type is incorrect + dp_out = dpnp.empty((10,), dtype=dpnp.int64) + # TODO: change it to ValueError, when updated + # dpctl is being used in internal CI + with pytest.raises((ValueError, TypeError)): + dpnp.dot(ia, ib, out=dp_out) + + # output shape is incorrect + dp_out = dpnp.empty((2,), dtype=dpnp.int32) + # TODO: change it to ValueError, when updated + # dpctl is being used in internal CI + with pytest.raises((ValueError, TypeError)): + dpnp.dot(ia, ib, out=dp_out) + + # TODO: get rid of falls back on NumPy when tensordot + # is implemented using OneMKL + @pytest.mark.usefixtures("allow_fall_back_on_numpy") + @pytest.mark.parametrize( + "shape_pair", + [ + ((10,), (10,), ()), + ((3, 4), (4, 2), (3, 2)), + ((3, 4), (4,), (3,)), + ((5, 4, 3), (3,), (5, 4)), + ((4,), (3, 4, 2), (3, 2)), + ((5, 3, 4), (6, 4, 2), (5, 3, 6, 2)), + ], + ids=["1d_1d", "2d_2d", "2d_1d", "3d_1d", "1d_3d", "3d_3d"], ) - ia = dpnp.array(a) - ib = dpnp.array(b) - - dp_out = dpnp.empty(out_shape, dtype=dtype) - result = dpnp.dot(ia, ib, out=dp_out) - expected = numpy.dot(a, b) - - assert result is dp_out - assert_dtype_allclose(result, expected) - - -@pytest.mark.parametrize("dtype1", get_all_dtypes()) -@pytest.mark.parametrize("dtype2", get_all_dtypes()) -def test_dot_input_dtype_matrix(dtype1, dtype2): - a = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype1) - b = numpy.array(numpy.random.uniform(-5, 5, 10), dtype=dtype2) - ia = dpnp.array(a) - ib = dpnp.array(b) - - result = dpnp.dot(ia, ib) - expected = numpy.dot(a, b) - assert_dtype_allclose(result, expected) - - -def test_dot_1d_error(): - a = dpnp.ones(25) - b = dpnp.ones(24) - # size of input arrays differ - with pytest.raises(ValueError): - dpnp.dot(a, b) - - -def test_dot_sycl_queue_error(): - a = dpnp.ones((5,), sycl_queue=dpctl.SyclQueue()) - b = dpnp.ones((5,), sycl_queue=dpctl.SyclQueue()) - with pytest.raises(ValueError): - dpnp.dot(a, b) - - -# NumPy does not raise an error for the following test. 
-# it just does not update the out keyword if it as not properly defined -@pytest.mark.parametrize("ia", [1, dpnp.ones((), dtype=dpnp.int32)]) -def test_dot_out_error_scalar(ia): - ib = dpnp.ones(10, dtype=dpnp.int32) - - # output data type is incorrect - dp_out = dpnp.empty((10,), dtype=dpnp.int64) - # TODO: change it to ValueError, when updated - # dpctl is being used in internal CI - with pytest.raises((ValueError, TypeError)): - dpnp.dot(ia, ib, out=dp_out) - - # output shape is incorrect - dp_out = dpnp.empty((2,), dtype=dpnp.int32) - # TODO: change it to ValueError, when updated - # dpctl is being used in internal CI - with pytest.raises((ValueError, TypeError)): - dpnp.dot(ia, ib, out=dp_out) - - -# TODO: get rid of falls back on NumPy when tensordot -# is implemented using OneMKL -@pytest.mark.usefixtures("allow_fall_back_on_numpy") -@pytest.mark.parametrize( - "shape_pair", - [ - ((10,), (10,), ()), - ((3, 4), (4, 2), (3, 2)), - ((3, 4), (4,), (3,)), - ((5, 4, 3), (3,), (5, 4)), - ((4,), (3, 4, 2), (3, 2)), - ((5, 3, 4), (6, 4, 2), (5, 3, 6, 2)), - ], - ids=["1d_1d", "2d_2d", "2d_1d", "3d_1d", "1d_3d", "3d_3d"], -) -def test_dot_out_error(shape_pair): - shape1, shape2, shape_out = shape_pair - a = numpy.ones(shape1, dtype=numpy.int32) - b = numpy.ones(shape2, dtype=numpy.int32) - ia = dpnp.array(a) - ib = dpnp.array(b) - - # output data type is incorrect - np_out = numpy.empty(shape_out, dtype=numpy.int64) - dp_out = dpnp.empty(shape_out, dtype=dpnp.int64) - with pytest.raises(TypeError): - dpnp.dot(ia, ib, out=dp_out) - with pytest.raises(ValueError): - numpy.dot(a, b, out=np_out) - - # output shape is incorrect - np_out = numpy.empty((2, 3), dtype=numpy.int32) - dp_out = dpnp.empty((2, 3), dtype=dpnp.int32) - with pytest.raises(ValueError): - dpnp.dot(ia, ib, out=dp_out) - with pytest.raises(ValueError): - numpy.dot(a, b, out=np_out) - - # "F" or "C" is irrelevant for 0d or 1d arrays - if not (len(shape_out) in [0, 1]): - # output should be C-contiguous - np_out = numpy.empty(shape_out, dtype=numpy.int32, order="F") - dp_out = dpnp.empty(shape_out, dtype=dpnp.int32, order="F") + def test_dot_out_error(self, shape_pair): + shape1, shape2, shape_out = shape_pair + a = numpy.ones(shape1, dtype=numpy.int32) + b = numpy.ones(shape2, dtype=numpy.int32) + ia = dpnp.array(a) + ib = dpnp.array(b) + + # output data type is incorrect + np_out = numpy.empty(shape_out, dtype=numpy.int64) + dp_out = dpnp.empty(shape_out, dtype=dpnp.int64) + with pytest.raises(TypeError): + dpnp.dot(ia, ib, out=dp_out) + with pytest.raises(ValueError): + numpy.dot(a, b, out=np_out) + + # output shape is incorrect + np_out = numpy.empty((2, 3), dtype=numpy.int32) + dp_out = dpnp.empty((2, 3), dtype=dpnp.int32) with pytest.raises(ValueError): dpnp.dot(ia, ib, out=dp_out) with pytest.raises(ValueError): numpy.dot(a, b, out=np_out) + # "F" or "C" is irrelevant for 0d or 1d arrays + if not (len(shape_out) in [0, 1]): + # output should be C-contiguous + np_out = numpy.empty(shape_out, dtype=numpy.int32, order="F") + dp_out = dpnp.empty(shape_out, dtype=dpnp.int32, order="F") + with pytest.raises(ValueError): + dpnp.dot(ia, ib, out=dp_out) + with pytest.raises(ValueError): + numpy.dot(a, b, out=np_out) + @pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True)) def test_multi_dot(type): From b725c1980d3661b9d6c440b9bc63267f74427c8d Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Mon, 5 Feb 2024 10:54:04 -0600 Subject: [PATCH 10/11] pass argument by reference --- 
dpnp/backend/extensions/blas/dot.cpp | 6 +++--- dpnp/backend/extensions/blas/dot.hpp | 4 ++-- dpnp/backend/extensions/blas/dotu.cpp | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dpnp/backend/extensions/blas/dot.cpp b/dpnp/backend/extensions/blas/dot.cpp index 239e60cf374a..048738f57a94 100644 --- a/dpnp/backend/extensions/blas/dot.cpp +++ b/dpnp/backend/extensions/blas/dot.cpp @@ -46,7 +46,7 @@ namespace mkl_blas = oneapi::mkl::blas; namespace py = pybind11; namespace type_utils = dpctl::tensor::type_utils; -typedef sycl::event (*dot_impl_fn_ptr_t)(sycl::queue, +typedef sycl::event (*dot_impl_fn_ptr_t)(sycl::queue &, const std::int64_t, char *, const std::int64_t, @@ -59,7 +59,7 @@ static dot_impl_fn_ptr_t dot_dispatch_table[dpctl_td_ns::num_types] [dpctl_td_ns::num_types]; template -static sycl::event dot_impl(sycl::queue exec_q, +static sycl::event dot_impl(sycl::queue &exec_q, const std::int64_t n, char *vectorA, const std::int64_t stride_a, @@ -107,7 +107,7 @@ static sycl::event dot_impl(sycl::queue exec_q, return dot_event; } -std::pair dot(sycl::queue exec_q, +std::pair dot(sycl::queue &exec_q, dpctl::tensor::usm_ndarray vectorA, dpctl::tensor::usm_ndarray vectorB, dpctl::tensor::usm_ndarray result, diff --git a/dpnp/backend/extensions/blas/dot.hpp b/dpnp/backend/extensions/blas/dot.hpp index f0d79dfa6478..3468196f760a 100644 --- a/dpnp/backend/extensions/blas/dot.hpp +++ b/dpnp/backend/extensions/blas/dot.hpp @@ -39,14 +39,14 @@ namespace ext namespace blas { extern std::pair - dot(sycl::queue exec_q, + dot(sycl::queue &exec_q, dpctl::tensor::usm_ndarray vectorA, dpctl::tensor::usm_ndarray vectorB, dpctl::tensor::usm_ndarray result, const std::vector &depends); extern std::pair - dotu(sycl::queue exec_q, + dotu(sycl::queue &exec_q, dpctl::tensor::usm_ndarray vectorA, dpctl::tensor::usm_ndarray vectorB, dpctl::tensor::usm_ndarray result, diff --git a/dpnp/backend/extensions/blas/dotu.cpp b/dpnp/backend/extensions/blas/dotu.cpp index 728b874f1eea..8c4b43f80345 100644 --- a/dpnp/backend/extensions/blas/dotu.cpp +++ b/dpnp/backend/extensions/blas/dotu.cpp @@ -46,7 +46,7 @@ namespace mkl_blas = oneapi::mkl::blas; namespace py = pybind11; namespace type_utils = dpctl::tensor::type_utils; -typedef sycl::event (*dotu_impl_fn_ptr_t)(sycl::queue, +typedef sycl::event (*dotu_impl_fn_ptr_t)(sycl::queue &, const std::int64_t, char *, const std::int64_t, @@ -59,7 +59,7 @@ static dotu_impl_fn_ptr_t dotu_dispatch_table[dpctl_td_ns::num_types] [dpctl_td_ns::num_types]; template -static sycl::event dotu_impl(sycl::queue exec_q, +static sycl::event dotu_impl(sycl::queue &exec_q, const std::int64_t n, char *vectorA, const std::int64_t stride_a, @@ -108,7 +108,7 @@ static sycl::event dotu_impl(sycl::queue exec_q, } std::pair - dotu(sycl::queue exec_q, + dotu(sycl::queue &exec_q, dpctl::tensor::usm_ndarray vectorA, dpctl::tensor::usm_ndarray vectorB, dpctl::tensor::usm_ndarray result, From 6f2cce7da252b8259d799b34566aaed5c6b1be28 Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Mon, 5 Feb 2024 10:55:35 -0600 Subject: [PATCH 11/11] update doc to add boolean dtype --- dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 4895e5e91284..65d97befa984 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -190,7 +190,7 @@ def dpnp_dot(a, b, /, out=None): Return the dot 
product of two arrays.
 
     The routine that is used to perform the main calculation
-    depends on input array data types: 1) For integer data types,
+    depends on input array data types: 1) For integer and boolean data types,
     `dpctl.tensor.vecdot` from the Data Parallel Control library is used,
     2) For floating point real-valued data types, `dot` routines from
     the BLAS library of OneMKL are used, and 3) For complex data types,
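
To make the dispatch described in this docstring easier to follow, here is a
minimal Python sketch of the routing rule. It is illustrative only: the
returned strings are labels standing in for `dpctl.tensor.vecdot` and the
oneMKL `dot`/`dotu`/`dotc` bindings added earlier in this series, and the real
`dpnp_dot` performs additional checks (queue compatibility, strides, `out=`
validation) that are omitted here.

import numpy


def choose_dot_backend(dtype):
    # Sketch of the dtype-based routing described in the docstring above.
    # The returned strings are labels only, not real dpnp entry points.
    kind = numpy.dtype(dtype).kind
    if kind == "c":
        return "onemkl_dotc_or_dotu"  # complex data types
    if kind == "f":
        return "onemkl_dot"  # real floating-point data types
    if kind in "biu":
        return "dpctl_vecdot"  # boolean, signed and unsigned integers
    raise TypeError(f"unsupported dtype: {dtype}")


assert choose_dot_backend("bool") == "dpctl_vecdot"
assert choose_dot_backend("int64") == "dpctl_vecdot"
assert choose_dot_backend("float32") == "onemkl_dot"
assert choose_dot_backend("complex64") == "onemkl_dotc_or_dotu"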
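
The `out=` contract pinned down by `Testdot.test_dot_out` and
`Testdot.test_dot_out_error` earlier in this series can be summarized with a
short usage sketch. This is a hedged example, assuming a dpnp build with a
default SYCL device available; the shapes and dtypes mirror the 2d_2d test
case.

import dpnp

a = dpnp.ones((3, 4), dtype=dpnp.int32)
b = dpnp.ones((4, 2), dtype=dpnp.int32)

# out must match the result's dtype and shape exactly and, for 2-D or higher
# results, be C-contiguous; the tests above expect TypeError for a dtype
# mismatch and ValueError for a shape or contiguity mismatch.
out = dpnp.empty((3, 2), dtype=dpnp.int32)
result = dpnp.dot(a, b, out=out)
assert result is out  # the product is written directly into `out`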