diff --git a/CMakeLists.txt b/CMakeLists.txt index 733ca6836..83e074706 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -185,10 +185,10 @@ if (MATX_NVTX_FLAGS) target_compile_definitions(matx INTERFACE MATX_NVTX_FLAGS) endif() if (MATX_BUILD_32_BIT) - set(INT_TYPE "lp64") - target_compile_definitions(matx INTERFACE INDEX_32_BIT) + set(MATX_NVPL_INT_TYPE "lp64") + target_compile_definitions(matx INTERFACE MATX_INDEX_32_BIT) else() - set(INT_TYPE "ilp64") + set(MATX_NVPL_INT_TYPE "ilp64") endif() # Host support @@ -211,13 +211,13 @@ if (MATX_EN_NVPL OR MATX_EN_X86_FFTW OR MATX_EN_BLIS OR MATX_EN_OPENBLAS) endif() if (MATX_EN_NVPL) - message(STATUS "Enabling NVPL library support for ARM CPUs with ${INT_TYPE} interface") + message(STATUS "Enabling NVPL library support for ARM CPUs with ${MATX_NVPL_INT_TYPE} interface") find_package(nvpl REQUIRED COMPONENTS fft blas lapack HINTS ${blas_DIR}) if (NOT MATX_BUILD_32_BIT) target_compile_definitions(matx INTERFACE NVPL_ILP64) endif() target_compile_definitions(matx INTERFACE NVPL_LAPACK_COMPLEX_CUSTOM) - target_link_libraries(matx INTERFACE nvpl::fftw nvpl::blas_${INT_TYPE}_omp nvpl::lapack_${INT_TYPE}_omp) + target_link_libraries(matx INTERFACE nvpl::fftw nvpl::blas_${MATX_NVPL_INT_TYPE}_omp nvpl::lapack_${MATX_NVPL_INT_TYPE}_omp) target_compile_definitions(matx INTERFACE MATX_EN_NVPL) else() # FFTW diff --git a/examples/black_scholes.cu b/examples/black_scholes.cu index 1938cab28..2a8ab47a2 100644 --- a/examples/black_scholes.cu +++ b/examples/black_scholes.cu @@ -46,8 +46,8 @@ using namespace matx; * instructions. While caching helps, this can have a slight performance impact when compared to native CUDA * kernels. To work around this problem, complex expressions can be placed in a custom operator by adding some * boilerplate code around the original expression. This custom operator can then be used either alone or inside - * other arithmetic expressions, and only a single load is issues for each tensor. - * + * other arithmetic expressions, and only a single load is issued for each tensor. + * * This example uses the Black-Scholes equation to demonstrate the two ways to implement the equation in MatX, and * shows the performance difference.
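A minimal sketch of the custom-operator pattern that comment describes (not part of this change; ScaleShiftOp and the Size/Rank boilerplate are made-up/approximate, loosely mirroring the BlackScholes operator in this example). The point is only that the whole expression lives inside operator()(index_t), so each input element is loaded once and the intermediate math stays in registers:

template <class Out, class In>
class ScaleShiftOp : public matx::BaseOp<ScaleShiftOp<Out, In>> {
public:
  ScaleShiftOp(Out out, In in, float a, float b) : out_(out), in_(in), a_(a), b_(b) {}

  __MATX_DEVICE__ __MATX_HOST__ void operator()(matx::index_t idx) {
    // One load of in_(idx); the rest of the expression is register math.
    out_(idx) = a_ * in_(idx) + b_;
  }

  __MATX_HOST__ __MATX_DEVICE__ matx::index_t Size(uint32_t i) const { return out_.Size(i); }
  static constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { return Out::Rank(); }

private:
  Out out_;
  In in_;
  float a_, b_;
};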
*/ @@ -76,7 +76,7 @@ public: auto d2 = d1 - VsqrtT; auto cdf_d1 = normcdf(d1); auto cdf_d2 = normcdf(d2); - auto expRT = exp(-1 * r * T); + auto expRT = exp(-1 * r * T); out_(idx) = S * cdf_d1 - K * expRT * cdf_d2; } @@ -87,12 +87,12 @@ public: /* Arithmetic expression */ template -void compute_black_scholes_matx(tensor_t& K, - tensor_t& S, - tensor_t& V, - tensor_t& r, - tensor_t& T, - tensor_t& output, +void compute_black_scholes_matx(tensor_t& K, + tensor_t& S, + tensor_t& V, + tensor_t& r, + tensor_t& T, + tensor_t& output, cudaExecutor& exec) { auto VsqrtT = V * sqrt(T); @@ -100,7 +100,7 @@ void compute_black_scholes_matx(tensor_t& K, auto d2 = d1 - VsqrtT; auto cdf_d1 = normcdf(d1); auto cdf_d2 = normcdf(d2); - auto expRT = exp(-1 * r * T); + auto expRT = exp(-1 * r * T); (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec); } @@ -120,13 +120,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) tensor_t V_tensor{{input_size}}; tensor_t r_tensor{{input_size}}; tensor_t T_tensor{{input_size}}; - tensor_t output_tensor{{input_size}}; + tensor_t output_tensor{{input_size}}; cudaStream_t stream; cudaStreamCreate(&stream); cudaExecutor exec{stream}; - compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); + compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -154,11 +154,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventElapsedTime(&time_ms, start, stop); printf("Time with custom operator = %.2fms per iteration\n", - time_ms / num_iterations); + time_ms / num_iterations); cudaEventDestroy(start); cudaEventDestroy(stop); cudaStreamDestroy(stream); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/cgsolve.cu b/examples/cgsolve.cu index 9027d634d..cba23f02d 100644 --- a/examples/cgsolve.cu +++ b/examples/cgsolve.cu @@ -86,6 +86,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // example-end sync-test-1 printf ("max l2 norm: %f\n", (float)sqrt(maxn())); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/channelize_poly_bench.cu b/examples/channelize_poly_bench.cu index ac38367a4..48a53eb4f 100644 --- a/examples/channelize_poly_bench.cu +++ b/examples/channelize_poly_bench.cu @@ -101,7 +101,7 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop } cudaEventRecord(stop, stream); exec.sync(); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); cudaEventElapsedTime(&elapsed_ms, start, stop); const double avg_elapsed_us = (static_cast(elapsed_ms)/NUM_ITERATIONS)*1.0e3; @@ -112,7 +112,7 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop printf("\n"); } - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); cudaEventDestroy(start); cudaEventDestroy(stop); diff --git a/examples/conv2d.cu b/examples/conv2d.cu index 71c648b1b..d4387581f 100644 --- a/examples/conv2d.cu +++ b/examples/conv2d.cu @@ -39,23 +39,23 @@ using namespace matx; int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { MATX_ENTER_HANDLER(); - + index_t iN = 4; index_t iM = 6; - + index_t fN = 4; index_t fM = 2; - + auto in = make_tensor({iN,iM}); auto filter = make_tensor({fN,fM}); - + in.SetVals({ {1,2,3,4,5,6}, {5,4,3,2,1,0}, {3,4,5,6,7,8}, {1,2,3,4,5,6}, }); - filter.SetVals({ {1,2}, + filter.SetVals({ {1,2}, {3,4}, {5,6}, 
{7,8}}); @@ -73,9 +73,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) index_t oM = iM - fM + 1; auto mode = MATX_C_MODE_VALID; #endif - + auto out = make_tensor({oN,oM}); - + (out = conv2d(in, filter, mode)).run(); printf("in:\n"); @@ -86,6 +86,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) print(out); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/convolution.cu b/examples/convolution.cu index f45b625ad..ac41759bb 100644 --- a/examples/convolution.cu +++ b/examples/convolution.cu @@ -93,7 +93,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (uint32_t i = 0; i < iterations; i++) { (outView = conv1d(inView, filterView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); } - + cudaEventRecord(stop, stream); exec.sync(); @@ -149,6 +149,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) matxPrintMemoryStatistics(); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/eigenExample.cu b/examples/eigenExample.cu index d204c4be7..e2ec26535 100644 --- a/examples/eigenExample.cu +++ b/examples/eigenExample.cu @@ -44,6 +44,8 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) { + MATX_ENTER_HANDLER(); + int dimX = 3; int dimY = 3; @@ -53,9 +55,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) ////////////// Eigen Test Data Setup ////////////// /////////////////////////////////////////////////////////////////////////////// #ifdef USE_EIGEN - + typedef Eigen::Matrix MatrixXdRowMajor; // define a custom type that is aligned to MatX row-Major. - + Eigen::MatrixXd a(dimX, dimY); MatrixXdRowMajor b(dimX, dimY); Eigen::RowVectorXd rowVec(dimX); @@ -88,7 +90,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) complexMatrix(0, 1) = std::complex(2.0, 3.0); complexMatrix(1, 0) = std::complex(3.0, 4.0); complexMatrix(1, 1) = std::complex(4.0, 5.0); - + #else std::cout <<"!!!!!!!!! Eigen NOT USED in Test !!!!!!!!!" 
<< std ::endl; // provide data in tensors if eigen is not used @@ -140,18 +142,18 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // // Data Mapping Example - // -#ifdef USE_EIGEN + // +#ifdef USE_EIGEN std::cout << "=================== Data Map Example ===================" << std::endl; double *raw_data; // memory could be any type of allocation, but choosing to use managed memory so it's valid on the host and device (this does cost performance) - cudaMallocManaged((void**)&raw_data, dimX*dimY * sizeof(double)); - + cudaMallocManaged((void**)&raw_data, dimX*dimY * sizeof(double)); + for(int i=0; i < dimX * dimY; i++) { raw_data[i] = 0.1 + i * 0.1; } - + // map user memory into Eigen Matrix Eigen::Map mappedMatrix(raw_data, dimX, dimY); std::cout << "Eigen Mapped Data :\n" << mappedMatrix << std::endl; @@ -159,24 +161,24 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // map user memory into Eigen Matrix auto mappedTensor = matx::make_tensor(raw_data, {dimX, dimY}, false); // create MatX tensor with non-owning user allocated memory matx::print(mappedTensor); - - // modify the data from each of the references + + // modify the data from each of the references raw_data[4] = 117; mappedMatrix(0,1) = 42; mappedTensor(2,1) = 87; - + // print modified data std::cout << "Eigen Mapped Data After Modified :\n" << mappedMatrix << std::endl; matx::print(mappedTensor); -#endif +#endif // // Basic Indexing // std::cout << "=================== Indexing ===================" << std::endl; #ifdef USE_EIGEN - std::cout << "eigen a(1,2) = " << a(1,2) << std::endl; -#endif + std::cout << "eigen a(1,2) = " << a(1,2) << std::endl; +#endif std::cout << "MatX a(1,2) = " << aTensor(1,2) << std::endl; @@ -186,9 +188,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Addition ===================" << std::endl; #ifdef USE_EIGEN - Eigen::MatrixXd addResult = a + b; - std::cout << "A + B = \n" << addResult << std::endl; -#endif + Eigen::MatrixXd addResult = a + b; + std::cout << "A + B = \n" << addResult << std::endl; +#endif auto addTensor = aTensor + bTensor; matx::print(addTensor); @@ -199,9 +201,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Element-Wise Multiply ===================" << std::endl; #ifdef USE_EIGEN - Eigen::MatrixXd elementWise = a.cwiseProduct(b); - std::cout << "A .* B = \n" << elementWise << std::endl; -#endif + Eigen::MatrixXd elementWise = a.cwiseProduct(b); + std::cout << "A .* B = \n" << elementWise << std::endl; +#endif auto elementWiseTensor = aTensor*bTensor; matx::print(elementWiseTensor); @@ -212,9 +214,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Element-Wise Division ===================" << std::endl; #ifdef USE_EIGEN - Eigen::MatrixXd divResult = a.cwiseQuotient(b); - std::cout << "A / B = \n" << divResult << std::endl; -#endif + Eigen::MatrixXd divResult = a.cwiseQuotient(b); + std::cout << "A / B = \n" << divResult << std::endl; +#endif auto divResultTensor = aTensor / bTensor; matx::print(divResultTensor); @@ -225,9 +227,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Continuous Slice ===================" << std::endl; #ifdef USE_EIGEN - Eigen::Matrix2d aSlice = a.block(0, 0, 2, 2); - std::cout << "A Sliced: \n" << aSlice << std::endl; -#endif + Eigen::Matrix2d aSlice = a.block(0, 0, 2, 2); + std::cout 
<< "A Sliced: \n" << aSlice << std::endl; +#endif auto aSliceTensor = matx::slice<2>(aTensor,{0,0},{2,2}); matx::print(aSliceTensor); @@ -238,7 +240,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Strided Slice ===================" << std::endl; #ifdef USE_EIGEN - std::cout << "Original matrix10x10:\n" << matrix10x10 << "\n\n"; + std::cout << "Original matrix10x10:\n" << matrix10x10 << "\n\n"; // Define the starting point, number of elements to select, and strides for both rows and columns // int startRow = 0, startCol = 0; // Starting index for rows and columns // int rowStride = 3, colStride = 2; // Stride along rows and columns @@ -246,14 +248,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // int numCols = 3; // Grab every third item until the 8th item (0, 3, 6) // Create a Map with Stride to access the elements - Eigen::Map> - strided(matrix10x10.data() + 0 * matrix10x10.outerStride() + 0, - 5, 3, - Eigen::Stride(3 * matrix10x10.outerStride(), 2)); + Eigen::Map> + strided(matrix10x10.data() + 0 * matrix10x10.outerStride() + 0, + 5, 3, + Eigen::Stride(3 * matrix10x10.outerStride(), 2)); // Print the strided matrix10x10 - std::cout << "Strided matrix10x10:\n" << strided << "\n"; -#endif + std::cout << "Strided matrix10x10:\n" << strided << "\n"; +#endif auto slicedMat = matx::slice(matTensor10x10, {0,0}, {matx::matxEnd,9}, {2,3}); matx::print(slicedMat); @@ -265,9 +267,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) std::cout << "=================== Clone ===================" << std::endl; #ifdef USE_EIGEN // Use the replicate function to create a 5x5 matrix by replicating the 1x5 matrix - Eigen::MatrixXd mat = rowVec.replicate(3, 1); + Eigen::MatrixXd mat = rowVec.replicate(3, 1); std::cout << "1D Cloned to 2D \n" << mat << std::endl; -#endif +#endif auto cloned3Tensor = matx::clone<2>(tensor1D, {3, matx::matxKeepDim}); matx::print(cloned3Tensor); @@ -278,9 +280,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Slice Row ===================" << std::endl; #ifdef USE_EIGEN - Eigen::RowVector3d row = a.row(1); + Eigen::RowVector3d row = a.row(1); std::cout << "Sliced Row \n" << row << std::endl; -#endif +#endif auto rowSlice = matx::slice<1>(aTensor, {1, 0}, {matx::matxDropDim, matx::matxEnd}); matx::print(rowSlice); @@ -291,14 +293,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Permute Rows ===================" << std::endl; #ifdef USE_EIGEN - std::cout << "Original Matrix:\n" << a << std::endl; + std::cout << "Original Matrix:\n" << a << std::endl; // Define a permutation a - Eigen::PermutationMatrix<3> perm; + Eigen::PermutationMatrix<3> perm; perm.indices() << 2, 1, 0; // This permutation swaps the first and third rows // Apply the permutation to the rows Eigen::Matrix3d permutedMatrix = perm * a; std::cout << "Permuted Matrix (Rows):\n" << permutedMatrix << std::endl; -#endif +#endif // Define a permutation a auto permVec = matx::make_tensor({dimX}); @@ -323,12 +325,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Get Real Values ===================" << std::endl; #ifdef USE_EIGEN - std::cout << "Original Complex Matrix:\n" << complexMatrix << std::endl; + std::cout << "Original Complex Matrix:\n" << complexMatrix << std::endl; // Extract and output the real part of the complex matrix - Eigen::Matrix 
realMatrix = complexMatrix.real(); - std::cout << "Real Part of Matrix:\n" << realMatrix << std::endl; -#endif + Eigen::Matrix realMatrix = complexMatrix.real(); + std::cout << "Real Part of Matrix:\n" << realMatrix << std::endl; +#endif auto realTensor = matx::real(complexTensor); matx::print(realTensor); @@ -339,9 +341,9 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // std::cout << "=================== Matrix Multiply ===================" << std::endl; #ifdef USE_EIGEN - Eigen::MatrixXd multResult = a * b; - std::cout << "A * B = \n" << multResult << std::endl; -#endif + Eigen::MatrixXd multResult = a * b; + std::cout << "A * B = \n" << multResult << std::endl; +#endif auto multResultTensor=matmul(aTensor,bTensor); matx::print(multResultTensor); @@ -354,7 +356,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #ifdef USE_EIGEN // Eigen::MatrixXd inverseMatrix = a.inverse(); // current bug where .run(exec) in inverse is ambiguous, so cannot be used with MatX // std::cout << "Inverse of the Real Part:\n" << inverseMatrix << std::endl; // current bug where .run(exec) in inverse is ambiguous, so cannot be used with MatX -#endif +#endif auto invTensor = matx::inv(aTensor); matx::print(invTensor); @@ -384,5 +386,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // // Unsupported by eigen + MATX_CUDA_CHECK_LAST_ERROR(); + MATX_EXIT_HANDLER(); return 0; } diff --git a/examples/fft_conv.cu b/examples/fft_conv.cu index 9833ea83f..c6361481c 100644 --- a/examples/fft_conv.cu +++ b/examples/fft_conv.cu @@ -64,7 +64,7 @@ using namespace matx; * For smaller signal sizes, the FFT convolution typically performs worse since * there is some buffer and 3 FFT operations (2 for FFT of signal and filter, * and 1 IFFT after the multiply) that causes the setup time to dominate. - * + * * Note that the conv1d() operator has a mode to perform FFT-based convolution * automatically. 
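Condensed from the fft_conv.cu code further down in this diff (the element-wise multiply statement of the separate path is not visible here, so treat that line as approximate), the paths being timed are:

// Separate statements: every line is its own launch, with intermediate results
// written back between steps.
(sig_freq  = fft(sig_time,  filtered_size)).run(exec);
(filt_freq = fft(filt_time, filtered_size)).run(exec);
(sig_freq  = sig_freq * filt_freq).run(exec);   // element-wise multiply (approximate)
(sig_freq  = ifft(sig_freq)).run(exec);         // IFFT in-place

// Fused form: a single statement; the example times this against the separate path above.
(sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec);

// Time-domain reference used for verification.
(time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec);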
* @@ -82,11 +82,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) float fused_ms; constexpr int iterations = 100; cudaStream_t stream; - cudaStreamCreate(&stream); + cudaStreamCreate(&stream); cudaEvent_t start, stop; cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaExecutor exec{stream}; + cudaEventCreate(&stop); + cudaExecutor exec{stream}; // Create time domain buffers auto sig_time = make_tensor({batches, signal_size}); @@ -118,7 +118,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) for (int i = 0; i < iterations; i++) { if (i == 1) { cudaEventRecord(start, stream); - } + } (sig_freq = fft(sig_time, filtered_size)).run(exec); (filt_freq = fft(filt_time, filtered_size)).run(exec); @@ -126,12 +126,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // IFFT in-place (sig_freq = ifft(sig_freq)).run(exec); - + } cudaEventRecord(stop, stream); exec.sync(); - cudaEventElapsedTime(&separate_ms, start, stop); + cudaEventElapsedTime(&separate_ms, start, stop); for (int i = 0; i < iterations; i++) { if (i == 1) { @@ -139,10 +139,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) } (sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec); } - + cudaEventRecord(stop, stream); exec.sync(); - cudaEventElapsedTime(&fused_ms, start, stop); + cudaEventElapsedTime(&fused_ms, start, stop); printf("FFT runtimes for separate = %.2f ms, fused = %.2f ms\n", separate_ms/(iterations-1), fused_ms/(iterations-1)); @@ -153,7 +153,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); exec.sync(); - + // Compare signals for (index_t b = 0; b < batches; b++) { for (index_t i = 0; i < filtered_size; i++) { @@ -170,6 +170,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) std::cout << "Verification successful" << std::endl; - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } \ No newline at end of file diff --git a/examples/mvdr_beamformer.cu b/examples/mvdr_beamformer.cu index 07fc97ebf..c1271c3b9 100644 --- a/examples/mvdr_beamformer.cu +++ b/examples/mvdr_beamformer.cu @@ -92,6 +92,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventDestroy(start); cudaEventDestroy(stop); cudaStreamDestroy(stream); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } \ No newline at end of file diff --git a/examples/print_styles.cu b/examples/print_styles.cu index d76ec9f80..19cf10078 100644 --- a/examples/print_styles.cu +++ b/examples/print_styles.cu @@ -58,6 +58,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) {-13, 13}, {-14, 14}, {-15, 15}, {-16, 16} }); + A1.set_name("A1 Matrix"); auto A2 = reshape(A1, {4,4}); auto A3 = reshape(A1, {2,2,4}); auto A4 = reshape(A1, {2,2,2,2}); @@ -83,7 +84,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) print(A4); // example-end print-example-1 - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); return 0; } diff --git a/examples/pwelch.cu b/examples/pwelch.cu index ea2679180..f3873900b 100644 --- a/examples/pwelch.cu +++ b/examples/pwelch.cu @@ -97,7 +97,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) print(Pxx); printf("PWelchOp avg runtime = %.3f ms\n", exec_time_ms / num_iterations); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); return 
0; } diff --git a/examples/qr.cu b/examples/qr.cu index 2953dabcf..6115bde50 100644 --- a/examples/qr.cu +++ b/examples/qr.cu @@ -43,14 +43,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) //using AType = double; using AType = cuda::std::complex; - + cudaStream_t stream = 0; cudaExecutor exec{stream}; - int batch = 1; + int batch = 1; int m = 4; int n = 5; - + auto A = make_tensor({batch, m, n}); auto QR = make_tensor({batch, m, n}); auto QTQ = make_tensor({batch, m, m}); @@ -73,13 +73,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) (QR = matmul(Q, R)).run(exec); (QTQ = matmul(conj(transpose_matrix(Q)), Q)).run(exec); exec.sync(); - + printf("Q:\n"); print(Q); printf("R:\n"); print(R); printf("QTQ:\n"); print(QTQ); printf("QR:\n"); print(QR); printf("A:\n"); print(A); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/recursive_filter.cu b/examples/recursive_filter.cu index 9023bc2d4..fbeedec4d 100644 --- a/examples/recursive_filter.cu +++ b/examples/recursive_filter.cu @@ -43,7 +43,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) using complex = cuda::std::complex; cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, 0); + cudaGetDeviceProperties(&prop, 0); if (prop.sharedMemPerBlock < 40000) { printf("Recursive filter example requires at least 40KB of shared memory to run. Exiting."); @@ -139,6 +139,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) matxPrintMemoryStatistics(); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/resample.cu b/examples/resample.cu index c7c8b6bf9..bb97d166e 100644 --- a/examples/resample.cu +++ b/examples/resample.cu @@ -97,6 +97,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventDestroy(start); cudaEventDestroy(stop); cudaStreamDestroy(stream); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); -} \ No newline at end of file +} diff --git a/examples/resample_poly_bench.cu b/examples/resample_poly_bench.cu index 6debd4311..11a9b2e48 100644 --- a/examples/resample_poly_bench.cu +++ b/examples/resample_poly_bench.cu @@ -159,17 +159,17 @@ void ResamplePolyBench() } cudaEventRecord(stop, stream); exec.sync(); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); cudaEventElapsedTime(&elapsed_ms, start, stop); const double gflops = static_cast(num_batches*(2*filter_len_per_phase-1)*output_len) / 1.0e9; const double avg_elapsed_us = (static_cast(elapsed_ms)/NUM_ITERATIONS)*1.0e3; - printf("Batches: %5" INDEX_T_FMT " FilterLen: %5" INDEX_T_FMT " InputLen: %9" INDEX_T_FMT " OutputLen: %8" INDEX_T_FMT - " Up/Down: %4" INDEX_T_FMT "/%4" INDEX_T_FMT " Elapsed Usecs: %12.1f GFLOPS: %10.3f\n", + printf("Batches: %5" MATX_INDEX_T_FMT " FilterLen: %5" MATX_INDEX_T_FMT " InputLen: %9" MATX_INDEX_T_FMT " OutputLen: %8" MATX_INDEX_T_FMT + " Up/Down: %4" MATX_INDEX_T_FMT "/%4" MATX_INDEX_T_FMT " Elapsed Usecs: %12.1f GFLOPS: %10.3f\n", num_batches, filter_len, input_len, output_len, up, down, avg_elapsed_us, gflops/(avg_elapsed_us/1.0e6)); } - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); cudaEventDestroy(start); cudaEventDestroy(stop); diff --git a/examples/simple_radar_pipeline.cu b/examples/simple_radar_pipeline.cu index a89cb1b1f..126ded2b5 100644 --- a/examples/simple_radar_pipeline.cu +++ b/examples/simple_radar_pipeline.cu @@ -43,7 +43,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char 
**argv) uint32_t iterations = 100; constexpr int num_streams = 1; cudaGraph_t graphs[num_streams]; - cudaGraphExec_t instances[num_streams]; + cudaGraphExec_t instances[num_streams]; using complex = cuda::std::complex; RadarPipeline *pipelines[num_streams]; @@ -55,10 +55,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) // cuda stream to place work in cudaStream_t streams[num_streams]; - + // manually set to log all NVTX levels MATX_NVTX_SET_LOG_LEVEL( matx_nvxtLogLevels::MATX_NVTX_LOG_ALL ); - + // create some events for timing cudaEvent_t starts[num_streams]; cudaEvent_t stops[num_streams]; @@ -67,13 +67,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventCreate(&starts[s]); cudaEventCreate(&stops[s]); cudaStreamCreate(&streams[s]); - + MATX_NVTX_START_RANGE("Pipeline Initialize", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 1) printf("Initializing data structures for stream %d...\n", s); pipelines[s] = new RadarPipeline(numPulses, numSamples, waveformLength, numChannels, streams[s]); MATX_NVTX_END_RANGE(1) - pipelines[s]->sync(); + pipelines[s]->sync(); } MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) @@ -82,20 +82,20 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto run_pipeline = [&](int s) { MATX_NVTX_START_RANGE("PulseCompression", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 21) pipelines[s]->PulseCompression(); - MATX_NVTX_END_RANGE(21) - + MATX_NVTX_END_RANGE(21) + MATX_NVTX_START_RANGE("ThreePulseCanceller", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 22) pipelines[s]->ThreePulseCanceller(); MATX_NVTX_END_RANGE(22) - + MATX_NVTX_START_RANGE("DopplerProcessing", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 23) pipelines[s]->DopplerProcessing(); MATX_NVTX_END_RANGE(23) - + MATX_NVTX_START_RANGE("CFARDetections", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 24) pipelines[s]->CFARDetections(); MATX_NVTX_END_RANGE(24) - }; + }; // Warmup for (int s = 0; s < num_streams; s++) { @@ -107,10 +107,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaStreamBeginCapture(streams[s], cudaStreamCaptureModeGlobal); run_pipeline(s); cudaStreamEndCapture(streams[s], &graphs[s]); - cudaGraphInstantiate(&instances[s], graphs[s], NULL, NULL, 0); + cudaGraphInstantiate(&instances[s], graphs[s], NULL, NULL, 0); } } - + for (uint32_t i = 0; i < iterations; i++) { for (int s = 0; s < num_streams; s++) { if (i == 1) { @@ -131,7 +131,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) pipelines[s]->sync(); } MATX_NVTX_END_RANGE(2) - + MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3) float time_ms; cudaEventElapsedTime(&time_ms, starts[num_streams-1], stops[num_streams-1]); @@ -150,7 +150,7 @@ for (int s = 0; s < num_streams; s++) { } cudaDeviceSynchronize(); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); matxPrintMemoryStatistics(); diff --git a/examples/spectrogram.cu b/examples/spectrogram.cu index 8d566b325..79b066a07 100644 --- a/examples/spectrogram.cu +++ b/examples/spectrogram.cu @@ -154,6 +154,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventDestroy(stop); cudaStreamDestroy(stream); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/spectrogram_graph.cu b/examples/spectrogram_graph.cu index 6f0583c8f..38d7c5d2e 100644 --- a/examples/spectrogram_graph.cu +++ b/examples/spectrogram_graph.cu @@ -143,7 +143,7 @@ int main([[maybe_unused]] int 
argc, [[maybe_unused]] char **argv) viz::contour(time, freqs, Sxx); #else printf("Not outputting plot since visualizations disabled\n"); -#endif +#endif } } @@ -164,6 +164,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) cudaEventDestroy(start); cudaEventDestroy(stop); cudaStreamDestroy(stream); - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/spherical_harmonics.cu b/examples/spherical_harmonics.cu index 35d650f5f..48ddcdcd3 100644 --- a/examples/spherical_harmonics.cu +++ b/examples/spherical_harmonics.cu @@ -64,7 +64,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto [phi, theta] = meshgrid(az, col); auto Plm = lcollapse<3>(legendre(l, m, cos(theta))); - + ValueType a = (2*l+1)*factorial(l-m); ValueType b = 4*M_PI*factorial(l+m); ValueType C = cuda::std::sqrt(a/b); @@ -85,6 +85,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) #if MATX_ENABLE_VIZ matx::viz::surf(X, Y, Z, "test-viz.html"); #endif - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/examples/svd_power.cu b/examples/svd_power.cu index 451c8f8a2..ca7f4b2c6 100644 --- a/examples/svd_power.cu +++ b/examples/svd_power.cu @@ -56,7 +56,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) int k = d; // number of singular values to find #if 0 - int batch = 1; + int batch = 1; auto A = make_tensor({batch, m, n}); auto U = make_tensor({batch, m, k}); auto VT = make_tensor({batch, k, n}); @@ -70,7 +70,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto VVT = make_tensor({batch, n, n}); auto VTV = make_tensor({batch, k, k}); auto x0 = random({batch, d}, NORMAL); - + (A = random({batch, m, n}, NORMAL)).run(exec); #else @@ -87,7 +87,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) auto VVT = make_tensor({n, n}); auto VTV = make_tensor({k, k}); auto x0 = random({d}, NORMAL); - + (A = random({m, n}, NORMAL)).run(exec); #endif @@ -99,7 +99,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) float tol = (float)1e-3; int iterations = 20; - + { printf("iterations: %d\n", iterations); @@ -157,7 +157,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) printf("A-UDVT\n"); print(A); } - + // Same as above but with svdbpi { @@ -215,6 +215,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) print(A); } #endif - CUDA_CHECK_LAST_ERROR(); + MATX_CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); } diff --git a/include/matx.h b/include/matx.h index d783c2481..4114a03ff 100644 --- a/include/matx.h +++ b/include/matx.h @@ -58,6 +58,3 @@ namespace matx { using fcomplex = cuda::std::complex; using dcomplex = cuda::std::complex; } - -#define TEST_VECTOR_PATH "generated/" - diff --git a/include/matx/core/defines.h b/include/matx/core/defines.h index cdfaafa9e..c4cfde1ba 100644 --- a/include/matx/core/defines.h +++ b/include/matx/core/defines.h @@ -37,12 +37,12 @@ namespace matx { -#ifdef INDEX_32_BIT +#ifdef MATX_INDEX_32_BIT using index_t = int32_t; - #define INDEX_T_FMT "d" -#else + #define MATX_INDEX_T_FMT "d" +#else using index_t = long long int; - #define INDEX_T_FMT "lld" + #define MATX_INDEX_T_FMT "lld" #endif #ifdef __CUDACC__ @@ -53,39 +53,39 @@ namespace matx { #define __MATX_DEVICE__ __device__ #endif -#ifdef __GNUC__ +#ifdef __GNUC__ #define __MATX_INLINE__ __attribute__((always_inline)) inline #elif __CUDACC__ - #define __MATX_INLINE__ __forceinline__ + #define 
__MATX_INLINE__ __forceinline__ #else #define __MATX_INLINE__ inline #endif -#define STRINGIFY(x) #x -#define TOSTRING(x) STRINGIFY(x) +#define MATX_STRINGIFY(x) #x +#define MATX_TOSTRING(x) MATX_STRINGIFY(x) #if defined(__clang__ ) - #define IGNORE_WARNING_PUSH_GCC(WARN_MSG) - #define IGNORE_WARNING_POP_GCC + #define MATX_IGNORE_WARNING_PUSH_GCC(WARN_MSG) + #define MATX_IGNORE_WARNING_POP_GCC - #define IGNORE_WARNING_PUSH_CLANG(WARN_MSG) \ + #define MATX_IGNORE_WARNING_PUSH_CLANG(WARN_MSG) \ _Pragma("clang diagnostic push") \ - _Pragma(TOSTRING(clang diagnostic ignored WARN_MSG)) + _Pragma(MATX_TOSTRING(clang diagnostic ignored WARN_MSG)) - #define IGNORE_WARNING_POP_CLANG \ + #define MATX_IGNORE_WARNING_POP_CLANG \ _Pragma("clang diagnostic pop") #elif defined(__GNUC__) - #define IGNORE_WARNING_PUSH_CLANG(WARN_MSG) - #define IGNORE_WARNING_POP_CLANG + #define MATX_IGNORE_WARNING_PUSH_CLANG(WARN_MSG) + #define MATX_IGNORE_WARNING_POP_CLANG - #define IGNORE_WARNING_PUSH_GCC(WARN_MSG) \ + #define MATX_IGNORE_WARNING_PUSH_GCC(WARN_MSG) \ _Pragma("GCC diagnostic push") \ - _Pragma(TOSTRING(GCC diagnostic ignored WARN_MSG)) + _Pragma(MATX_TOSTRING(GCC diagnostic ignored WARN_MSG)) - #define IGNORE_WARNING_POP_GCC \ + #define MATX_IGNORE_WARNING_POP_GCC \ _Pragma("GCC diagnostic pop") -#endif +#endif // std::ceil is not constexpr until C++23 #define MATX_ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S)) diff --git a/include/matx/core/error.h b/include/matx/core/error.h index 58b32d11d..9185db16f 100644 --- a/include/matx/core/error.h +++ b/include/matx/core/error.h @@ -46,7 +46,7 @@ namespace matx /** * @brief MatX error codes - * + * */ enum matxError_t { @@ -121,11 +121,11 @@ namespace matx /** * @brief Throw an exception and print a stack trace - * - * @param error - * @param s - * @param file - * @param line + * + * @param error + * @param s + * @param file + * @param line */ matxException(matxError_t error, const char *s, const char *file, int line) : e(error) @@ -190,8 +190,8 @@ namespace matx std::cout << #a ": " << str << "(" << tmp << " != " << expected << ")\n";\ MATX_THROW(error, ""); \ } \ - } - + } + #else #define MATX_ASSERT(a, error) {} #define MATX_ASSERT_STR(a, error, str) {} @@ -216,14 +216,14 @@ namespace matx } // Macro for checking cuda errors following a cuda launch or api call -#define CUDA_CHECK_LAST_ERROR() \ +#define MATX_CUDA_CHECK_LAST_ERROR() \ { \ const auto e = cudaGetLastError(); \ MATX_CUDA_CHECK(e); \ } // This macro asserts compatible dimensions of current class to an operator. -#define ASSERT_COMPATIBLE_OP_SIZES(op) \ +#define MATX_ASSERT_COMPATIBLE_OP_SIZES(op) \ if constexpr (Rank() > 0) { \ bool compatible = true; \ _Pragma("unroll") \ @@ -249,6 +249,6 @@ namespace matx std::cerr << ")" << std::endl; \ MATX_THROW(matxInvalidSize, "Incompatible operator sizes"); \ } \ - } + } } // end namespace matx diff --git a/include/matx/core/print.h b/include/matx/core/print.h index 2a5483f61..d7c7ee51f 100644 --- a/include/matx/core/print.h +++ b/include/matx/core/print.h @@ -27,7 +27,7 @@ // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
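For code that consumes these headers the renames are mechanical; a small usage sketch with the new MATX_-prefixed spellings (function and tensor names here are hypothetical):

#include <matx.h>
using namespace matx;

void scale_and_check(tensor_t<float, 1> &t, cudaStream_t stream) {
  // index_t is 32- or 64-bit depending on MATX_INDEX_32_BIT, so print it with
  // the matching format macro (formerly INDEX_T_FMT).
  index_t n = t.Size(0);
  printf("len = %" MATX_INDEX_T_FMT "\n", n);

  (t = t * 2.0f).run(cudaExecutor{stream});

  // Formerly CUDA_CHECK_LAST_ERROR(): checks cudaGetLastError() through MATX_CUDA_CHECK.
  MATX_CUDA_CHECK_LAST_ERROR();
}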
///////////////////////////////////////////////////////////////////////////////// #include @@ -136,11 +136,15 @@ namespace matx { template void PrintShapeImpl(const Op& op, FILE *fp) { - std::string type = (is_tensor_view_v) ? "Tensor" : "Operator"; + if (is_tensor_view_v) { + fprintf(fp, "%s: ",op.str().c_str()); + } + + std::string type = (is_tensor_view_v) ? "Tensor" : "Operator"; fprintf(fp, "%s{%s} Rank: %d, Sizes:[", type.c_str(), detail::GetTensorTypeString().c_str(), op.Rank()); for (index_t dimIdx = 0; dimIdx < op.Rank(); dimIdx++) { - fprintf(fp, "%" INDEX_T_FMT, op.Size(static_cast(dimIdx)) ); + fprintf(fp, "%" MATX_INDEX_T_FMT, op.Size(static_cast(dimIdx)) ); if( dimIdx < (op.Rank() - 1) ) fprintf(fp, ", "); } @@ -152,7 +156,7 @@ namespace matx { { for (index_t dimIdx = 0; dimIdx < (op.Rank() ); dimIdx++ ) { - fprintf(fp, "%" INDEX_T_FMT, op.Stride(static_cast(dimIdx)) ); + fprintf(fp, "%" MATX_INDEX_T_FMT, op.Stride(static_cast(dimIdx)) ); if( dimIdx < (op.Rank() - 1) ) { fprintf(fp, ","); @@ -161,8 +165,8 @@ namespace matx { } } - fprintf(fp, "]\n"); - } + fprintf(fp, "]\n"); + } /** @@ -195,7 +199,7 @@ namespace matx { } } if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_DEFAULT) { - fprintf(fp, "%06" INDEX_T_FMT ": ", _k); + fprintf(fp, "%06" MATX_INDEX_T_FMT ": ", _k); } PrintVal(fp, op.operator()(_k)); if (_k == (op.Size(0)-1)) { @@ -223,7 +227,7 @@ namespace matx { for (index_t _l = 0; _l < ((l == 0) ? op.Size(1) : l); _l++) { if (_l == 0) { if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_DEFAULT) { - fprintf(fp, "%06" INDEX_T_FMT ": ", _k); + fprintf(fp, "%06" MATX_INDEX_T_FMT ": ", _k); } else if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_PYTHON) { if (_k == 0) { @@ -284,7 +288,7 @@ namespace matx { } } if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_DEFAULT) { - fprintf(fp, "[%06" INDEX_T_FMT ",:,:]\n", _j); + fprintf(fp, "[%06" MATX_INDEX_T_FMT ",:,:]\n", _j); } for (index_t _k = 0; _k < ((k == 0) ? op.Size(1) : k); _k++) { if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_MLAB) { @@ -311,7 +315,7 @@ namespace matx { for (index_t _l = 0; _l < ((l == 0) ? op.Size(2) : l); _l++) { if (_l == 0) { if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_DEFAULT) { - fprintf(fp, "%06" INDEX_T_FMT ": ", _k); + fprintf(fp, "%06" MATX_INDEX_T_FMT ": ", _k); } else if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_PYTHON) { if (_k == 0) { @@ -395,7 +399,7 @@ namespace matx { } } if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_DEFAULT) { - fprintf(fp, "[%06" INDEX_T_FMT ",%06" INDEX_T_FMT ",:,:]\n", _i, _j); + fprintf(fp, "[%06" MATX_INDEX_T_FMT ",%06" MATX_INDEX_T_FMT ",:,:]\n", _i, _j); } for (index_t _k = 0; _k < ((k == 0) ? op.Size(2) : k); _k++) { if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_MLAB) { @@ -422,7 +426,7 @@ namespace matx { for (index_t _l = 0; _l < ((l == 0) ? op.Size(3) : l); _l++) { if (_l == 0) { if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_DEFAULT) { - fprintf(fp, "%06" INDEX_T_FMT ": ", _k); + fprintf(fp, "%06" MATX_INDEX_T_FMT ": ", _k); } else if (PRINT_FORMAT_TYPE == MATX_PRINT_FORMAT_PYTHON) { if (_k == 0) { @@ -506,7 +510,7 @@ namespace matx { PrintData(fp, tmpv, dims...); } #endif - } + } /** @@ -596,7 +600,7 @@ namespace matx { template void print_shape(const T& op) { detail::PrintShapeImpl(op, stdout); - } + } /** * @brief print a tensor's values to output file stream @@ -685,19 +689,19 @@ namespace matx { /** * @brief Print a tensor's all values to stdout * - * This form of `print()` is a specialization for 0D tensors. + * This form of `print()` is a specialization for 0D tensors. 
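With str() now returning a settable name_ (the default is built from the rank and a short type string) and PrintShapeImpl prefixing that name for tensor views, the print_styles.cu change boils down to this usage (output wording approximate):

auto A1 = make_tensor<float>({16, 2});
A1.set_name("A1 Matrix");
print(A1);        // shape header now starts with "A1 Matrix: Tensor{...}"
print_shape(A1);  // same prefix, since both paths go through PrintShapeImpl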
* * @tparam Op Operator input type * @param op Operator input */ - template = true> - void print(const Op &op) + void print(const Op &op) { fprint(stdout, op); } - #endif // not DOXYGEN_ONLY + #endif // not DOXYGEN_ONLY /** * @brief Set the print() precision for floating point values @@ -733,6 +737,6 @@ namespace matx { */ __MATX_INLINE__ __MATX_HOST__ enum PrintFormatType get_print_format_type() { return PRINT_FORMAT_TYPE; - } + } } // End namespace matx diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h index cf4d5f224..be5ffecea 100644 --- a/include/matx/core/tensor.h +++ b/include/matx/core/tensor.h @@ -156,8 +156,13 @@ class tensor_t : public detail::tensor_impl_t { __MATX_INLINE__ ~tensor_t() = default; - __MATX_INLINE__ const std::string str() const { - return std::string("T") + std::to_string(RANK) + "_" + detail::to_short_str(); + const std::string str() const { + return name_; + } + + void set_name(std::string name) + { + name_ = name; } /** @@ -1332,7 +1337,7 @@ class tensor_t : public detail::tensor_impl_t { * more dimensions of a tensor. This includes completely dropping an unwanted * dimension, or simply taking a piece of a wanted dimension. Slice() is very * similar to indexing operations in both Python and MATLAB. - * + * * *NOTE* Users should not call Slice() directly anymore. Use the slice() operator instead. * * @param firsts @@ -1511,6 +1516,7 @@ class tensor_t : public detail::tensor_impl_t { private: Storage storage_; + std::string name_ = std::string("tensor_") + std::to_string(RANK) + "_" + detail::to_short_str(); }; diff --git a/include/matx/core/tensor_desc.h b/include/matx/core/tensor_desc.h index aa0a8eef7..5487ed9cd 100644 --- a/include/matx/core/tensor_desc.h +++ b/include/matx/core/tensor_desc.h @@ -46,12 +46,12 @@ namespace matx { * @tparam ShapeContainer type of sizes * @tparam StrideContainer type of strides */ -template +template class tensor_desc_t { public: template using self_type = tensor_desc_t; - + using shape_container = ShapeContainer; using stride_container = StrideContainer; using shape_type = typename ShapeContainer::value_type; ///< Type trait of shape type @@ -65,17 +65,17 @@ class tensor_desc_t { /** * @brief Default move constructor - */ + */ __MATX_INLINE__ tensor_desc_t(tensor_desc_t &&) = default; /** * @brief Default const copy assignment constructor - */ + */ __MATX_INLINE__ tensor_desc_t& operator=(const tensor_desc_t&) = default; /** * @brief Default copy assignment constructor - */ + */ __MATX_INLINE__ tensor_desc_t& operator=(tensor_desc_t&&) = default; /** Swaps two raw_pointer_buffers @@ -87,18 +87,18 @@ class tensor_desc_t { * @param rhs * Right argument */ - friend void swap( tensor_desc_t &lhs, + friend void swap( tensor_desc_t &lhs, tensor_desc_t &rhs) noexcept { using std::swap; swap(lhs.shape_, rhs.shape_); swap(lhs.stride_, rhs.stride_); - } + } /** * @brief Construct a tensor_desc_t from a generic shape and stride - * + * * @tparam S Unused * @param shape Shape object * @param stride Stride object @@ -110,35 +110,35 @@ class tensor_desc_t { MATX_ASSERT_STR(shape.size() == stride.size(), matxInvalidDim, "Size and stride array sizes must match"); MATX_ASSERT_STR(shape.size() == RANK, matxInvalidDim, - "Rank parameter must match array size"); + "Rank parameter must match array size"); } /** * @brief Construct a tensor_desc_t for a 0D tensor - * + * */ __MATX_INLINE__ __MATX_HOST__ tensor_desc_t() { - } + } /** * @brief Constructor with just shape for non-C-style arrays - * + * * @tparam S2 Unused - * @param 
shape + * @param shape * Shape of tensor */ template ::type> && !is_matx_descriptor_v::type>, bool> = true> - __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(S2 &&shape) + __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(S2 &&shape) { InitFromShape(std::forward(shape)); } /** * @brief Constructor with just shape for C-style arrays - * - * @tparam M + * + * @tparam M * Unused - * @param shape + * @param shape * Shape of tensor */ template @@ -146,20 +146,20 @@ class tensor_desc_t { { // Construct a new cuda::std::array. Slower, but saves duplication cuda::std::array tshape; - std::move(std::begin(shape), std::end(shape), tshape.begin()); + std::move(std::begin(shape), std::end(shape), tshape.begin()); InitFromShape(std::move(tshape)); - } + } /** * @brief Constructor with perfect-forwarded shape and C array of strides - * - * @param shape + * + * @param shape * Shape of tensor * @param strides * Strides of tensor */ template , bool> = true> - __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(S2 &&shape, const stride_type (&strides)[RANK]) : + __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(S2 &&shape, const stride_type (&strides)[RANK]) : shape_(std::forward(shape)) { for (int i = 0; i < RANK; i++) { MATX_ASSERT_STR(*(shape.begin() + i) > 0, matxInvalidSize, @@ -170,26 +170,26 @@ class tensor_desc_t { /** * @brief Constructor with perfect-forwarded shape and C array of strides - * - * @param shape + * + * @param shape * Shape of tensor * @param strides * Strides of tensor */ template , bool> = true> - __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(const shape_type (&shape)[RANK], StrideContainer &&strides) : + __MATX_INLINE__ __MATX_HOST__ tensor_desc_t(const shape_type (&shape)[RANK], StrideContainer &&strides) : stride_(std::forward(strides)) { for (int i = 0; i < RANK; i++) { MATX_ASSERT_STR(shape[i] > 0, matxInvalidSize, "Must specify size larger than 0 for each dimension"); *(shape_.begin() + i) = shape[i]; } - } + } /** * @brief Constructor with C-style array shape and strides - * - * @param shape + * + * @param shape * Shape of tensor * @param strides * Strides of tensor @@ -201,7 +201,7 @@ class tensor_desc_t { *(stride_.begin() + i) = strides[i]; *(shape_.begin() + i) = shape[i]; } - } + } /** * Check if a descriptor is contiguous in memory for all elements in the view @@ -233,16 +233,16 @@ class tensor_desc_t { constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto TotalSize() const noexcept { // The stride_type is expected to be able to hold this without overflowing - stride_type size = 1; + stride_type size = 1; for (int i = 0; i < RANK; i++) { size *= Size(i); } return size; - } + } /** * @brief Initialize descriptor from existing shape - * + * * @tparam S2 Shape type * @param shape Shape object */ @@ -262,51 +262,51 @@ class tensor_desc_t { #pragma unroll for (int i = RANK - 2; i >= 0; i--) { *(stride_.begin() + i) = Stride(i+1) * Size(i+1); - } - - } + } + + } /** * @brief Set the Size object - * + * * @param dim Dimension to size * @param size Size to set dimension to - * + * */ void __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ SetSize(int dim, shape_type size) { *(shape_.begin() + dim) = size; } /** * @brief Return size of descriptor on a single dimension - * + * * @param dim Dimension to retrieve * @return Size of dimension */ - constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Size([[maybe_unused]] int dim) const noexcept { + constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Size([[maybe_unused]] int dim) const noexcept { if constexpr (RANK == 0) { return 
static_cast(1); } // gcc 14.1 incorrectly reports shape_ as uninitialized in some contexts -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") - return *(shape_.begin() + dim); -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") + return *(shape_.begin() + dim); +MATX_IGNORE_WARNING_POP_GCC } /** * @brief Return strides contaienr of descriptor - * + * * @return Strides container */ - auto __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Strides() const { + auto __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Strides() const { return stride_; } /** * @brief Return stride of descriptor on a single dimension - * + * * @param dim Dimension to retrieve * @return Stride of dimension */ - auto __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Stride([[maybe_unused]] int dim) const { + auto __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Stride([[maybe_unused]] int dim) const { if constexpr (RANK == 0) { return static_cast(0); } @@ -315,25 +315,25 @@ IGNORE_WARNING_POP_GCC and clone(). It appears there's no valid code path that would cause this to be unitialized, so we're ignoring the warning in this one spot. gcc also incorrectly reports: error: array subscript 3 is outside array bounds of. This is impossible in the case it's reporting - since it comes from a clone where the loop inside of clone() is a compile-time constant of 2, + since it comes from a clone where the loop inside of clone() is a compile-time constant of 2, so it can never count up to 3. */ -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") -IGNORE_WARNING_PUSH_GCC("-Warray-bounds") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Warray-bounds") return *(stride_.begin() + dim); -IGNORE_WARNING_POP_GCC -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } /** * @brief Return shape object - * - * @return Shape object + * + * @return Shape object */ auto __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Shape() const { return shape_; } /** * @brief Get rank of descriptor - * + * * @return Rank of descriptor */ static auto constexpr Rank() { return RANK; } @@ -345,11 +345,11 @@ IGNORE_WARNING_POP_GCC /** * @brief Tensor descriptor for compile-time descriptors - * + * * @tparam I First size * @tparam Is Parameter pack of sizes */ -template +template class static_tensor_desc_t { public: using shape_container = cuda::std::array; ///< Type trait of shape type @@ -372,7 +372,7 @@ class static_tensor_desc_t { /** * @brief Get size of dimension - * + * * @param dim Dimension to retrieve * @return Size of dimension */ @@ -380,48 +380,48 @@ class static_tensor_desc_t { /** * @brief Get stride of dimension - * + * * @param dim Dimension to retrieve * @return Stride of dimension - */ + */ static constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Stride(int dim) { return stride_[dim]; } /** * @brief Return strides contaienr of descriptor - * + * * @return Strides container */ - static constexpr auto Strides() { + static constexpr auto Strides() { return stride_; - } + } /** * @brief Get rank of descriptor - * + * * @return Descriptor rank - */ + */ static constexpr int Rank() { return shape_.size(); } /** * @brief Get underlying shape object - * + * * @return Shape object - */ + */ static constexpr auto Shape() { return shape_; } /** * @brief Get total size of descriptor - * + * * @return Product of all sizes - */ + */ static constexpr auto TotalSize() { return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies()); - } + } private: static 
constexpr auto make_shape(){ return cuda::std::array{I, Is...}; - } + } static constexpr auto make_strides(){ cuda::std::array m{}; @@ -435,7 +435,7 @@ class static_tensor_desc_t { } static constexpr shape_container shape_ = make_shape(); - static constexpr stride_container stride_ = make_strides(); + static constexpr stride_container stride_ = make_strides(); }; /** @@ -490,7 +490,7 @@ using tensor_desc_cr_disi_dist = tensor_desc_cr_ds_t; * * @tparam RANK Rank of shape */ -#ifdef INDEX_32_BIT +#ifdef MATX_INDEX_32_BIT template using DefaultDescriptor = tensor_desc_cr_ds_32_32_t; #else diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index 77dd0dbd3..6c00062bc 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -92,7 +92,7 @@ class tensor_impl_t { const std::string str() const { - return std::string("T") + std::to_string(RANK) + "_" + to_short_str(); + return std::string("tensor_impl_") + std::to_string(RANK) + "_" + to_short_str(); } /** Swaps two raw_pointer_buffers @@ -210,14 +210,14 @@ class tensor_impl_t { * Data type */ // gcc 14.1 incorrectly reports desc as uninitialized in some contexts -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") template ::type>, bool> = true> - __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ tensor_impl_t(T *const ldata, - DescriptorType &&desc) + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ tensor_impl_t(T *const ldata, + DescriptorType &&desc) : ldata_(ldata), desc_{std::forward(desc)} { } -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC /** * Constructor for creating a view with only a descriptor @@ -663,7 +663,7 @@ IGNORE_WARNING_POP_GCC MATX_ASSERT_STR(first < end, matxInvalidSize, "Slice must be at least one element long"); [[maybe_unused]] typename Desc::stride_type stride_mult; - + if constexpr (std::is_same_v) { stride_mult = 1; } @@ -724,7 +724,7 @@ IGNORE_WARNING_POP_GCC MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) return Slice(firsts, ends, detail::NoStride{}); - } + } template @@ -761,7 +761,7 @@ IGNORE_WARNING_POP_GCC tensor_desc_t new_desc{std::move(n), std::move(s)}; return new_desc; } - + template __MATX_INLINE__ auto Clone(const cuda::std::array &clones) const @@ -857,7 +857,7 @@ IGNORE_WARNING_POP_GCC } else { return false; } - } + } /** * Set the size of a dimension @@ -887,9 +887,9 @@ IGNORE_WARNING_POP_GCC template __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ stride_type GetVal([[maybe_unused]] cuda::std::tuple tup) { if constexpr (I < sizeof...(Is)) { -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") return GetVal(tup) + cuda::std::get(tup)*this->desc_.Stride(I); -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } else { return 0; @@ -899,9 +899,9 @@ IGNORE_WARNING_POP_GCC template __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ stride_type GetValC([[maybe_unused]] const cuda::std::tuple tup) const { if constexpr (I < sizeof...(Is)) { -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") return GetValC(tup) + cuda::std::get(tup)*this->desc_.Stride(I); -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } else { return 0; diff --git a/include/matx/core/utils.h b/include/matx/core/utils.h index 50bda7bb0..80f377029 100644 --- a/include/matx/core/utils.h +++ b/include/matx/core/utils.h @@ -38,12 +38,13 @@ #include "matx/core/defines.h" #include "matx/core/error.h" -#define HOPPER_CC 9 -#define AMPERE_CC 8 
-#define VOLTA_CC 7 -#define PASCAL_CC 6 - namespace matx { + + constexpr int HOPPER_CC = 9; + constexpr int AMPERE_CC = 8; + constexpr int VOLTA_CC = 7; + constexpr int PASCAL_CC = 6; + namespace detail { __MATX_INLINE__ int GetDeviceAttr(cudaDeviceAttr attr) { int val; @@ -85,7 +86,7 @@ bool SizesMatch(const Op1 &op1, const Op2 &op2) { template __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ auto madd( const T1 &x, const T2 &y, const T3 &z) { // CUDA 12.6 with gcc 13 is reporting a parsing bug with the expression below. Use an alternative form. - // using T4 = decltype(x*y+z); + // using T4 = decltype(x*y+z); using T4 = std::invoke_result_t{}), decltype(std::multiplies<>{}(x, y)), decltype(z)>; if constexpr (is_complex_v && !is_complex_half_v) { @@ -150,7 +151,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ auto madd( const T1 &x, const T2 & __half2 ari = make_half2(X.x, X.y); // negate and swap supported in hardware sm_8.6+ __half2 air = make_half2(X.y, __hneg(X.x)); - // broadcast supported in hardware + // broadcast supported in hardware __half2 brr = make_half2(Y.x, Y.x); // broadcast supported in hardware __half2 bii = make_half2(Y.y, Y.y); @@ -158,9 +159,9 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ auto madd( const T1 &x, const T2 & __half2 d; // HFMA2 RD, RA.H1_H0, RB.H1_H1, RC.H1_H0 - d = __hfma2(ari, brr, c); + d = __hfma2(ari, brr, c); // HFMA2 RD, RB.H0_H0, -RA.H0_NH1, RC.H1_H0 - d = __hfma2(bii, -air, d); + d = __hfma2(bii, -air, d); return T4(d.x, d.y); #endif diff --git a/include/matx/executors/host.h b/include/matx/executors/host.h index d8219ae9f..e7ad02698 100644 --- a/include/matx/executors/host.h +++ b/include/matx/executors/host.h @@ -39,7 +39,7 @@ #ifdef MATX_EN_OMP #include #endif -namespace matx +namespace matx { // Matches current Linux max @@ -66,14 +66,14 @@ struct HostExecParams { private: int threads_; - cpu_set_t cpu_set_; + cpu_set_t cpu_set_ {0}; }; /** * @brief Executor for running an operator on a single or multi-threaded host - * + * * @tparam MODE Threading policy - * + * */ template class HostExecutor { @@ -106,13 +106,13 @@ class HostExecutor { /** * @brief Synchronize the host executor's threads. - * + * */ void sync() {} /** * @brief Execute an operator - * + * * @tparam Op Operator type * @param op Operator to execute */ @@ -120,7 +120,7 @@ class HostExecutor { void Exec(const Op &op) const noexcept { if constexpr (Op::Rank() == 0) { op(); - } + } else { index_t size = TotalSize(op); #ifdef MATX_EN_OMP @@ -130,17 +130,17 @@ class HostExecutor { auto idx = GetIdxFromAbs(op, i); cuda::std::apply([&](auto... args) { return op(args...); - }, idx); + }, idx); } } else #endif - { + { for (index_t i = 0; i < size; i++) { auto idx = GetIdxFromAbs(op, i); cuda::std::apply([&](auto... 
args) { return op(args...); - }, idx); - } + }, idx); + } } } } diff --git a/include/matx/generators/range.h b/include/matx/generators/range.h index 492fe6c0a..0a87c7f37 100644 --- a/include/matx/generators/range.h +++ b/include/matx/generators/range.h @@ -55,14 +55,14 @@ namespace matx __MATX_DEVICE__ __MATX_HOST__ __MATX_INLINE__ T operator()(index_t idx) const { if constexpr (is_matx_half_v) { -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") return first_ + T(static_cast((float)idx) * step_); -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } else { -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") return first_ + T(static_cast(idx) * step_); -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } } }; diff --git a/include/matx/kernels/conv.cuh b/include/matx/kernels/conv.cuh index b914dd295..eaa11566b 100644 --- a/include/matx/kernels/conv.cuh +++ b/include/matx/kernels/conv.cuh @@ -13,10 +13,14 @@ #include "matx/core/type_utils.h" #include "matx/core/tensor_utils.h" -#define CONV1D_ELEMENTS_PER_BLOCK 512 namespace matx { +namespace matx_conv1d_detail { + constexpr size_t CONV1D_ELEMENTS_PER_BLOCK = 512; +}; +using namespace matx_conv1d_detail; + typedef enum { MATX_C_MODE_FULL, // Default. Keep all elements of ramp up/down MATX_C_MODE_SAME, // Only keep elements where entire filter was present @@ -28,7 +32,7 @@ typedef enum { MATX_C_METHOD_FFT } matxConvCorrMethod_t; -#ifdef __CUDACC__ +#ifdef __CUDACC__ template __launch_bounds__(THREADS) __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter, @@ -37,11 +41,11 @@ __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter, { /* strategy: - 1 thread per EPT outputs. + 1 thread per EPT outputs. Each block produces EPT * THREADS outputs Full convolution is computed and results are windowed down based on the request Filter is fully loaded into shared memory - Chunk of signal is loaded into shared memory with filter_len pandding on the negative side. + Chunk of signal is loaded into shared memory with filter_len pandding on the negative side. If out of range then we pad with zeros. */ static_assert(InType::Rank() == FilterType::Rank()); @@ -93,14 +97,14 @@ __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter, __syncthreads(); // load signal, pad extra elements with zeros - for (int32_t lidx = threadIdx.x, gidx = chunk_idx * CONV1D_ELEMENTS_PER_BLOCK - filter_len + 1 + threadIdx.x; - gidx < static_cast((chunk_idx+1) * CONV1D_ELEMENTS_PER_BLOCK) ; + for (int32_t lidx = threadIdx.x, gidx = chunk_idx * CONV1D_ELEMENTS_PER_BLOCK - filter_len + 1 + threadIdx.x; + gidx < static_cast((chunk_idx+1) * CONV1D_ELEMENTS_PER_BLOCK) ; gidx += THREADS, lidx += THREADS) { // some elements may be out of range. We set their values to 0. intype_strip val(0); - if( gidx >= 0 && gidx < signal_len) { + if( gidx >= 0 && gidx < signal_len) { bdims[Rank - 1] = gidx; cuda::std::apply([&val, d_in](auto &&...args) { val = d_in.operator()(args...); @@ -113,8 +117,8 @@ __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter, // wait for signal to load __syncthreads(); - // register array for output data - outtype_strip oval[EPT] = {0}; + // register array for output data + outtype_strip oval[EPT] = {0}; // Below will use pointer modification instead of offsets to change IMADS into IADS. IMADS go through FMA pipe. 
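To put rough numbers on that tiling strategy (illustrative only: float data, a 129-tap filter, 128 threads per block; the real kernel's element type and launch shape may differ):

constexpr size_t elements_per_block = 512;   // matx_conv1d_detail::CONV1D_ELEMENTS_PER_BLOCK
constexpr size_t threads_per_block  = 128;   // hypothetical THREADS
constexpr size_t ept        = elements_per_block / threads_per_block;        // 4 outputs per thread
constexpr size_t filter_len = 129;                                           // hypothetical filter length
// Signal tile: 512 outputs plus (filter_len - 1) elements of left padding.
constexpr size_t signal_tile_elems = elements_per_block + filter_len - 1;    // 640
constexpr size_t smem_bytes = (signal_tile_elems + filter_len) * sizeof(float);  // ~3.0 KB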
@@ -178,10 +182,10 @@ __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter, int32_t gidx = idx - start; if(idx >= start && idx <= stop) { - bdims[Rank - 1] = gidx; + bdims[Rank - 1] = gidx; cuda::std::apply([&](auto &&...args) { d_out.operator()(args...) = oval[i]; - }, bdims); + }, bdims); } } } // end chunk loop @@ -205,12 +209,12 @@ struct ShmBuffer2D { __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ T &operator()(index_t y, index_t x) noexcept { return *(ptr + y * X_LEN + x); - } + } T *ptr; }; -template = 0 && x < i1M && y >=0 && y < i1N) { // Signal Dims @@ -321,7 +325,7 @@ __global__ void Conv2D(OutType d_out, InType1 d_in1, InType2 d_in2, bdims[Rank - 1] = x; cuda::std::apply([&](auto &&...args) { val = d_in1.operator()(args...); }, bdims); } - + // store in shared s_signal(ii,jj) = val; } @@ -336,7 +340,7 @@ __global__ void Conv2D(OutType d_out, InType1 d_in1, InType2 d_in2, for (int mm = 0; mm < FILTER_SHARED_CHUNK_X; mm+=FILTER_REG_CHUNK_X) { #pragma unroll for (int nn = 0; nn < FILTER_SHARED_CHUNK_Y; nn+=FILTER_REG_CHUNK_Y) { - + // Copy chunk from shared memory in to registers #pragma unroll @@ -348,19 +352,19 @@ __global__ void Conv2D(OutType d_out, InType1 d_in1, InType2 d_in2, } - // convolution loop: convolve filter and signal. + // convolution loop: convolve filter and signal. // Keep signal in registers as much as possible by shifting. #pragma unroll for (int m = 0; m < FILTER_REG_CHUNK_X; m++) { - + #pragma unroll for (int n = 0; n < FILTER_REG_CHUNK_Y; n++) { - + in2type i2 = r_filter[n][m]; // if FILTER_REG_CHUNK_X > 1 then we need to reload i1 every m loop - if( nn == 0 || - (FILTER_REG_CHUNK_X > 1 && n == 0)) { + if( nn == 0 || + (FILTER_REG_CHUNK_X > 1 && n == 0)) { // load ILPY signal points #pragma unroll for(int u = 0; u < ILPY; u++) { @@ -395,7 +399,7 @@ __global__ void Conv2D(OutType d_out, InType1 d_in1, InType2 d_in2, if(i + u < oN && j < oM) { bdims[Rank - 2] = i + u; bdims[Rank - 1] = j; - cuda::std::apply([&](auto &&...args) { d_out.operator()(args...) = sum[u]; }, bdims); + cuda::std::apply([&](auto &&...args) { d_out.operator()(args...) = sum[u]; }, bdims); } } diff --git a/include/matx/kernels/filter.cuh b/include/matx/kernels/filter.cuh index fcd41cca0..a40ccef68 100644 --- a/include/matx/kernels/filter.cuh +++ b/include/matx/kernels/filter.cuh @@ -8,34 +8,31 @@ #include #include -#define MAX_BATCHES 10000 -#define BLOCK_SIZE_RECURSIVE 1024 -#define CORR_COLS BLOCK_SIZE_RECURSIVE -#define MAX_BLOCKS_PER_BATCH 1000 -#define RECURSIVE_VALS_PER_THREAD 8 -#define MAX_NON_RECURSIVE_COEFFS 4 -#define MAX_RECURSIVE_COEFFS 4 -#define WARP_SIZE 32 -#define COMPLEX_TYPE cuComplex -#define RECURSIVE_CHUNK_SIZE (BLOCK_SIZE_RECURSIVE * RECURSIVE_VALS_PER_THREAD) -#define MAX_SIGNAL_LEN_PER_BATCH \ - (BLOCK_SIZE_RECURSIVE * RECURSIVE_VALS_PER_THREAD * MAX_BLOCKS_PER_BATCH) - -#define COMPLEX_TYPE cuComplex - -// cuda::std::max/min isn't working on template value parameters -#define MAX(a, b) ((a) < (b) ? (b) : (a)) -#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) - namespace matx { +namespace detail_filter { + constexpr size_t MAX_BATCHES = 10000; + constexpr size_t BLOCK_SIZE_RECURSIVE = 1024; + constexpr size_t CORR_COLS = BLOCK_SIZE_RECURSIVE; + constexpr size_t MAX_BLOCKS_PER_BATCH = 1000; + constexpr size_t RECURSIVE_VALS_PER_THREAD = 8; + constexpr size_t MAX_NON_RECURSIVE_COEFFS = 4; + constexpr size_t MAX_RECURSIVE_COEFFS = 4; + constexpr size_t WARP_SIZE = 32; + using COMPLEX_TYPE = cuComplex; + constexpr size_t RECURSIVE_CHUNK_SIZE = BLOCK_SIZE_RECURSIVE * RECURSIVE_VALS_PER_THREAD; + constexpr size_t MAX_SIGNAL_LEN_PER_BATCH = + (BLOCK_SIZE_RECURSIVE * RECURSIVE_VALS_PER_THREAD * MAX_BLOCKS_PER_BATCH); +}; +using namespace detail_filter; + typedef enum { STATUS_FLAG_INCOMPLETE = 0, STATUS_FLAG_PARTIAL_COMPLETE = 1, STATUS_FLAG_FULL_COMPLETE = 2, } STATUS_FLAGS; -#ifdef __CUDACC__ +#ifdef __CUDACC__ // Chunk ID assignment used for atomic incrementing between blocks static __device__ uint32_t cid_assign[MAX_BATCHES] = {0}; @@ -54,7 +51,7 @@ __global__ __launch_bounds__(BLOCK_SIZE_RECURSIVE, 1) void RecursiveFilter( __shared__ intype_strip s_exch[1 + (1 + BLOCK_SIZE_RECURSIVE) * - MAX(num_non_recursive - 1, + cuda::std::max(num_non_recursive - 1, num_recursive)]; // Data exchange between threads __shared__ uint32_t s_chunk_id; __shared__ FilterType @@ -64,7 +61,7 @@ __global__ __launch_bounds__(BLOCK_SIZE_RECURSIVE, 1) void RecursiveFilter( // since nvcc doesn't like that intype_strip tmp[RECURSIVE_VALS_PER_THREAD]; intype_strip vals[RECURSIVE_VALS_PER_THREAD]; - intype_strip r_nonr[MAX(MAX_NON_RECURSIVE_COEFFS, MAX_RECURSIVE_COEFFS)]; + intype_strip r_nonr[cuda::std::max(MAX_NON_RECURSIVE_COEFFS, MAX_RECURSIVE_COEFFS)]; const uint32_t lane = threadIdx.x & 31; const uint32_t warp_id = threadIdx.x / WARP_SIZE; // const index_t batch_offset = blockIdx.y * len; @@ -259,7 +256,7 @@ __global__ __launch_bounds__(BLOCK_SIZE_RECURSIVE, 1) void RecursiveFilter( for (uint32_t r = 0; r < RECURSIVE_VALS_PER_THREAD; r++) { // Load all of the values we need from other threads in the warp #pragma unroll - for (int32_t rec = 0; rec < MIN(num_recursive, wl); rec++) { + for (int32_t rec = 0; rec < cuda::std::min(num_recursive, static_cast(wl)); rec++) { if constexpr (is_cuda_complex_v) { *reinterpret_cast(&tmp[rec + 1]) = __shfl_sync(~0, *reinterpret_cast(&vals[r]), @@ -274,7 +271,7 @@ __global__ __launch_bounds__(BLOCK_SIZE_RECURSIVE, 1) void RecursiveFilter( len) { // Make sure this value is within bounds of the signal // Now apply those values #pragma unroll - for (int32_t rec = 0; rec < MIN(num_recursive, wl); rec++) { + for (int32_t rec = 0; rec < cuda::std::min(num_recursive, static_cast(wl)); rec++) { if constexpr (is_cuda_complex_v) { vals[r] = cuCaddf(vals[r], diff --git a/include/matx/kernels/transpose.cuh b/include/matx/kernels/transpose.cuh index f1a5adb55..b49066aa0 100644 --- a/include/matx/kernels/transpose.cuh +++ b/include/matx/kernels/transpose.cuh @@ -13,9 +13,13 @@ namespace matx { -#ifdef __CUDACC__ -// Tile dims for one block -#define TILE_DIM 32 +#ifdef __CUDACC__ + +namespace matx_transpose_detail { + // Tile dims for one block + constexpr size_t TILE_DIM = 32; +} +using namespace matx_transpose_detail; /* Out of place. Adapted from: https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/. 
Works @@ -26,7 +30,7 @@ __global__ void transpose_kernel_oop(OutputTensor out, { using T = typename OutputTensor::value_type; constexpr int RANK = OutputTensor::Rank(); - + extern __shared__ float tile[]; // Need to swap complex types also, so cast when needed T *shm_tile = reinterpret_cast(&tile[0]); diff --git a/include/matx/operators/binary_operators.h b/include/matx/operators/binary_operators.h index 49635ccd0..c6c0eea1b 100644 --- a/include/matx/operators/binary_operators.h +++ b/include/matx/operators/binary_operators.h @@ -36,7 +36,7 @@ #include "matx/operators/base_operator.h" #include "matx/operators/scalar_ops.h" -#define DEFINE_BINARY_OP(FUNCTION, TENSOR_OP) \ +#define MATX_DEFINE_BINARY_OP(FUNCTION, TENSOR_OP) \ template () or \ is_matx_op()>> \ @@ -54,7 +54,7 @@ namespace matx { /** * @brief Utility operator for multiplying scalars by a complex value - * + * * @tparam T Complex type * @tparam S Scalar type * @param n Scalar value @@ -72,7 +72,7 @@ namespace matx /** * @brief Utility operator for multiplying scalars by a complex value - * + * * @tparam T Complex type * @tparam S Scalar type * @param n Scalar value @@ -112,8 +112,8 @@ namespace matx { if constexpr (Rank() > 0) { - ASSERT_COMPATIBLE_OP_SIZES(in1_); - ASSERT_COMPATIBLE_OP_SIZES(in2_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(in1_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(in2_); } } @@ -130,8 +130,8 @@ namespace matx { return cuda::std::apply([&](auto &&...args) { return this->operator()(args...); - }, idx); - } + }, idx); + } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() @@ -159,7 +159,7 @@ namespace matx } template - __MATX_INLINE__ void PostRun(ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void PostRun(ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { in1_.PostRun(std::forward(shape), std::forward(ex)); @@ -168,7 +168,7 @@ namespace matx if constexpr (is_matx_op()) { in2_.PostRun(std::forward(shape), std::forward(ex)); } - } + } }; } @@ -353,7 +353,7 @@ namespace matx * @param t2 * RHS tensor or operator input */ - Op operator|(Op t, Op t2) {} + Op operator|(Op t, Op t2) {} /** * Compute t ^ t2 (bitwise XOR) of two operators or tensors @@ -362,7 +362,7 @@ namespace matx * @param t2 * RHS tensor or operator input */ - Op operator^(Op t, Op t2) {} + Op operator^(Op t, Op t2) {} /** * Compute the arctangent of two inputs @@ -371,31 +371,31 @@ namespace matx * @param t2 * Y value of input */ - Op atan2(Op t, Op t2) {} + Op atan2(Op t, Op t2) {} #else - DEFINE_BINARY_OP(operator+, detail::AddOp); - DEFINE_BINARY_OP(operator-, detail::SubOp); - DEFINE_BINARY_OP(operator*, detail::MulOp); - DEFINE_BINARY_OP(mul, detail::MulOp); - DEFINE_BINARY_OP(operator/, detail::DivOp); - DEFINE_BINARY_OP(operator%, detail::ModOp); - DEFINE_BINARY_OP(fmod, detail::FModOp); - DEFINE_BINARY_OP(operator|, detail::OrOp); - DEFINE_BINARY_OP(operator&, detail::AndOp); - DEFINE_BINARY_OP(operator^, detail::XorOp); - DEFINE_BINARY_OP(pow, detail::PowOp); - DEFINE_BINARY_OP(max, detail::MaximumOp); - DEFINE_BINARY_OP(atan2, detail::Atan2Op); - DEFINE_BINARY_OP(min, detail::MinimumOp); - DEFINE_BINARY_OP(operator<, detail::LTOp); - DEFINE_BINARY_OP(operator>, detail::GTOp); - DEFINE_BINARY_OP(operator<=, detail::LTEOp); - DEFINE_BINARY_OP(operator>=, detail::GTEOp); - DEFINE_BINARY_OP(operator==, detail::EQOp); - DEFINE_BINARY_OP(operator!=, detail::NEOp); - DEFINE_BINARY_OP(operator&&, detail::AndAndOp); - DEFINE_BINARY_OP(operator||, detail::OrOrOp); - 
DEFINE_UNARY_OP(operator!, detail::NotOp); + MATX_DEFINE_BINARY_OP(operator+, detail::AddOp); + MATX_DEFINE_BINARY_OP(operator-, detail::SubOp); + MATX_DEFINE_BINARY_OP(operator*, detail::MulOp); + MATX_DEFINE_BINARY_OP(mul, detail::MulOp); + MATX_DEFINE_BINARY_OP(operator/, detail::DivOp); + MATX_DEFINE_BINARY_OP(operator%, detail::ModOp); + MATX_DEFINE_BINARY_OP(fmod, detail::FModOp); + MATX_DEFINE_BINARY_OP(operator|, detail::OrOp); + MATX_DEFINE_BINARY_OP(operator&, detail::AndOp); + MATX_DEFINE_BINARY_OP(operator^, detail::XorOp); + MATX_DEFINE_BINARY_OP(pow, detail::PowOp); + MATX_DEFINE_BINARY_OP(max, detail::MaximumOp); + MATX_DEFINE_BINARY_OP(atan2, detail::Atan2Op); + MATX_DEFINE_BINARY_OP(min, detail::MinimumOp); + MATX_DEFINE_BINARY_OP(operator<, detail::LTOp); + MATX_DEFINE_BINARY_OP(operator>, detail::GTOp); + MATX_DEFINE_BINARY_OP(operator<=, detail::LTEOp); + MATX_DEFINE_BINARY_OP(operator>=, detail::GTEOp); + MATX_DEFINE_BINARY_OP(operator==, detail::EQOp); + MATX_DEFINE_BINARY_OP(operator!=, detail::NEOp); + MATX_DEFINE_BINARY_OP(operator&&, detail::AndAndOp); + MATX_DEFINE_BINARY_OP(operator||, detail::OrOrOp); + MATX_DEFINE_UNARY_OP(operator!, detail::NotOp); #endif } // end namespace matx diff --git a/include/matx/operators/cart2sph.h b/include/matx/operators/cart2sph.h index d57aa238e..15dca39e4 100644 --- a/include/matx/operators/cart2sph.h +++ b/include/matx/operators/cart2sph.h @@ -54,18 +54,18 @@ namespace matx using matxop = bool; using value_type = typename T1::value_type; - __MATX_INLINE__ std::string str() const { return "cart2sph(" + get_type_str(x_) + + __MATX_INLINE__ std::string str() const { return "cart2sph(" + get_type_str(x_) + "," + get_type_str(y_) + "," + get_type_str(z_) + ")"; } __MATX_INLINE__ Cart2SphOp(const T1 &x, const T2 &y, const T3 &z) : x_(x), y_(y), z_(z) { - ASSERT_COMPATIBLE_OP_SIZES(x); - ASSERT_COMPATIBLE_OP_SIZES(y); - ASSERT_COMPATIBLE_OP_SIZES(z); + MATX_ASSERT_COMPATIBLE_OP_SIZES(x); + MATX_ASSERT_COMPATIBLE_OP_SIZES(y); + MATX_ASSERT_COMPATIBLE_OP_SIZES(z); } template - __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(Is... indices) const + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(Is... 
indices) const { auto x = get_value(x_, indices...); auto y = get_value(y_, indices...); @@ -78,7 +78,7 @@ namespace matx } else { // r return _internal_sqrt(x * x + y * y + z * z); } - } + } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { @@ -90,7 +90,7 @@ namespace matx index_t size1 = get_expanded_size(x_, dim); index_t size2 = get_expanded_size(y_, dim); index_t size3 = get_expanded_size(z_, dim); - return detail::matx_max(size1, size2, size3); + return detail::matx_max(size1, size2, size3); } template @@ -123,7 +123,7 @@ namespace matx if constexpr (is_matx_op()) { z_.PostRun(std::forward(shape), std::forward(ex)); } - } + } }; } /** @@ -139,7 +139,7 @@ namespace matx * * @param x * Operator defining x - * + * * @param y * Operator defining y * @@ -152,7 +152,7 @@ namespace matx template auto __MATX_INLINE__ cart2sph(const T1 &x, const T2 &y, const T3 &z) { - return cuda::std::tuple{ + return cuda::std::tuple{ detail::Cart2SphOp(x, y, z), detail::Cart2SphOp(x, y, z), detail::Cart2SphOp(x, y, z)}; diff --git a/include/matx/operators/clone.h b/include/matx/operators/clone.h index 85c614d86..b99bd83af 100644 --- a/include/matx/operators/clone.h +++ b/include/matx/operators/clone.h @@ -58,7 +58,7 @@ namespace matx static_assert(T::Rank() < CRank, "Cloning rank must be higher than input operator rank"); const index_t num_keep = std::count_if(shape.begin(), shape.end(), [](index_t i) { return i == matxKeepDim; }); - MATX_ASSERT_STR(num_keep == T::Rank(), matxInvalidParameter, + MATX_ASSERT_STR(num_keep == T::Rank(), matxInvalidParameter, "Number of matxKeepDim in a clone must match input operator rank"); // create gather list @@ -69,9 +69,9 @@ namespace matx sizes_[i] = op_.Size(d); // gcc incorrectly shows an invalid access to array element [1] in a unit test here. This is not // possible based on runtime checks we have. Disable the warning temporarily. -IGNORE_WARNING_PUSH_GCC("-Warray-bounds") +MATX_IGNORE_WARNING_PUSH_GCC("-Warray-bounds") dims_[d++] = i; -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } else { sizes_[i] = shape[i]; } @@ -87,11 +87,11 @@ IGNORE_WARNING_POP_GCC template static __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) get_impl(Op&& op, const Dims &dims, Is... indices) - { -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") + { +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") cuda::std::array sind{indices...}; cuda::std::array gind; -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC // gather indices for(int i = 0; i < T::Rank(); i++) { diff --git a/include/matx/operators/diag.h b/include/matx/operators/diag.h index 08c737643..6eab9669d 100644 --- a/include/matx/operators/diag.h +++ b/include/matx/operators/diag.h @@ -59,11 +59,11 @@ namespace matx using value_type = typename T1::value_type; __MATX_INLINE__ std::string str() const { return "diag(" + op_.str() + ")"; } - + __MATX_INLINE__ DiagOp(const T1 &op, index_t k) : op_(op), k_(k) { } template - __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... 
indices) const { static_assert(RANK != 0, "Cannot make get diagonals from 0D tensor"); using tt = cuda::std::tuple_element_t<0, cuda::std::tuple>; @@ -79,26 +79,26 @@ namespace matx } else { static_assert(sizeof...(Is) == RANK - 1, "Diagonal operator must have one fewer op() index than rank of operator"); - + // Offset either the rows or columns by k_, depending on if it's negative if (k_ < 0) { cuda::std::array tmp{indices...}; tmp[RANK - 1] = pp_get(indices...); //cuda::std::get(tup) = pp_get(indices...) ; -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") tmp[RANK - 2] -= k_; //cuda::std::get(tup) = cuda::std::get(tup) - k_; -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC return get_value(op_, tmp); } else { cuda::std::array tmp{indices...}; //auto tup = cuda::std::make_tuple(indices..., static_cast(0)); -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") tmp[RANK - 1] = pp_get(indices...) + k_; //cuda::std::get(tup) = pp_get(indices...) + k_; -IGNORE_WARNING_POP_GCC - return get_value(op_, tmp); +MATX_IGNORE_WARNING_POP_GCC + return get_value(op_, tmp); } } } @@ -166,13 +166,13 @@ IGNORE_WARNING_POP_GCC * Diagonal to pull (0 is the main diagonal). Only used for 2D tensors and above */ #ifdef DOXYGEN_ONLY - auto __MATX_INLINE__ diag(const T1 &t, index_t k = 0) { + auto __MATX_INLINE__ diag(const T1 &t, index_t k = 0) { #else template (), bool> = true> - auto __MATX_INLINE__ diag(T1 t, index_t k = 0) { -#endif - MATX_ASSERT_STR(T1::Rank() != 1 || k == 0, matxInvalidParameter, + auto __MATX_INLINE__ diag(T1 t, index_t k = 0) { +#endif + MATX_ASSERT_STR(T1::Rank() != 1 || k == 0, matxInvalidParameter, "k parameter in diag() can only be used for 2D tensors and above"); - return detail::DiagOp(t, k); - } + return detail::DiagOp(t, k); + } } // end namespace matx diff --git a/include/matx/operators/if.h b/include/matx/operators/if.h index 08c923aeb..f0b858c48 100644 --- a/include/matx/operators/if.h +++ b/include/matx/operators/if.h @@ -63,10 +63,10 @@ namespace matx __MATX_INLINE__ std::string str() const { return "if(" + cond_.str() + ") then {" + op_.str() + "}"; } /** * @brief Constructor for an IF statement - * + * * @param cond Condition to perform the IF/ELSE on * @param op Operator if conditional branch is true - */ + */ __MATX_INLINE__ IFOP(const T1 &cond, const T2 &op) : cond_(cond), op_(op) { static_assert((!is_tensor_view_v), @@ -83,17 +83,17 @@ namespace matx { index_t size1 = detail::get_expanded_size(cond_, i); index_t size2 = detail::get_expanded_size(op_, i); - size_[i] = detail::matx_max(size1, size2); + size_[i] = detail::matx_max(size1, size2); } - - ASSERT_COMPATIBLE_OP_SIZES(op_); - ASSERT_COMPATIBLE_OP_SIZES(cond_); + + MATX_ASSERT_COMPATIBLE_OP_SIZES(op_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(cond_); } } /** * @brief Operator() for getting values of an if operator - * + * * @tparam Is Index types * @param indices Index values */ @@ -102,13 +102,13 @@ namespace matx if (get_value(cond_, indices...)) { get_value(op_, indices...); } - } + } /** * @brief Rank of IF operator - * + * * @return Rank - */ + */ static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { return detail::matx_max(detail::get_rank(), detail::get_rank()); @@ -116,10 +116,10 @@ namespace matx /** * @brief Size of dimension of operator - * + * * @param dim Dimension to get size of - * @return Size of dimension - */ + * @return Size of dimension + */ constexpr index_t 
__MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Size(int dim) const { return size_[dim]; @@ -135,7 +135,7 @@ namespace matx if constexpr (is_matx_op()) { cond_.PreRun(std::forward(shape), std::forward(ex)); - } + } } template @@ -143,12 +143,12 @@ namespace matx { if constexpr (is_matx_op()) { op_.PostRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { cond_.PostRun(std::forward(shape), std::forward(ex)); } - } + } }; /** @@ -159,7 +159,7 @@ namespace matx * operator must be defined for the particular type. For example, operator< on * two integers is okay, but the same operator on two complex numbers will give * a compiler error. - * + * * @param t1 op1 * * @param t2 op2 diff --git a/include/matx/operators/ifelse.h b/include/matx/operators/ifelse.h index 43d7f0466..649aa726c 100644 --- a/include/matx/operators/ifelse.h +++ b/include/matx/operators/ifelse.h @@ -56,24 +56,24 @@ namespace matx private: typename detail::base_type_t cond_; typename detail::base_type_t op1_; - typename detail::base_type_t op2_; + typename detail::base_type_t op2_; cuda::std::array(), detail::get_rank(), detail::get_rank())> size_; public: using value_type = void; ///< Scalar type for type extraction - __MATX_INLINE__ std::string str() const { - return "if(" + detail::get_type_str(cond_) + ") then {" + detail::get_type_str(op1_) + "} else {" + detail::get_type_str(op2_) + "}"; + __MATX_INLINE__ std::string str() const { + return "if(" + detail::get_type_str(cond_) + ") then {" + detail::get_type_str(op1_) + "} else {" + detail::get_type_str(op2_) + "}"; } /** * @brief Constructor for an IFELSE statement - * + * * @param cond Condition to perform the IF/ELSE on * @param op1 Operator if conditional branch is true * @param op2 Operator if conditional branch is false */ - __MATX_INLINE__ IFELSE(const C1 &cond, const T1 &op1, const T2 &op2) : + __MATX_INLINE__ IFELSE(const C1 &cond, const T1 &op1, const T2 &op2) : cond_(cond), op1_(op1), op2_(op2) { static_assert((!is_tensor_view_v && !is_tensor_view_v), @@ -97,14 +97,14 @@ namespace matx } } - ASSERT_COMPATIBLE_OP_SIZES(op1_); - ASSERT_COMPATIBLE_OP_SIZES(op2_); - ASSERT_COMPATIBLE_OP_SIZES(cond_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(op1_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(op2_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(cond_); } /** * @brief Operator() for getting values of an if/else - * + * * @tparam Is Index types * @param indices Index values */ @@ -116,11 +116,11 @@ namespace matx else { get_value(op2_, indices...); } - } + } /** * @brief Rank of IF/ELSE operator - * + * * @return Rank */ static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() @@ -137,11 +137,11 @@ namespace matx if constexpr (is_matx_op()) { op2_.PreRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { cond_.PreRun(std::forward(shape), std::forward(ex)); - } + } } template @@ -153,18 +153,18 @@ namespace matx if constexpr (is_matx_op()) { op2_.PostRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { cond_.PostRun(std::forward(shape), std::forward(ex)); } - } + } /** * @brief Size of dimension of operator - * + * * @param dim Dimension to get size of - * @return Size of dimension + * @return Size of dimension */ constexpr index_t __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ Size(int dim) const { diff --git a/include/matx/operators/isclose.h b/include/matx/operators/isclose.h index 53f8a3b75..c5f922c38 100644 --- a/include/matx/operators/isclose.h +++ b/include/matx/operators/isclose.h @@ -41,7 
+41,7 @@ namespace matx { namespace detail { - template + template class IsCloseOp : public BaseOp> { public: @@ -52,19 +52,19 @@ namespace matx __MATX_INLINE__ std::string str() const { return "isclose()"; } - __MATX_INLINE__ IsCloseOp(const Op1 &op1, const Op2 &op2, double rtol, double atol) : - op1_(op1), op2_(op2), rtol_(static_cast(rtol)), atol_(static_cast(atol)) + __MATX_INLINE__ IsCloseOp(const Op1 &op1, const Op2 &op2, double rtol, double atol) : + op1_(op1), op2_(op2), rtol_(static_cast(rtol)), atol_(static_cast(atol)) { static_assert(Op1::Rank() == Op2::Rank(), "Operator ranks must match in isclose()"); - ASSERT_COMPATIBLE_OP_SIZES(op1); - ASSERT_COMPATIBLE_OP_SIZES(op2); + MATX_ASSERT_COMPATIBLE_OP_SIZES(op1); + MATX_ASSERT_COMPATIBLE_OP_SIZES(op2); } template - __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ int operator()([[maybe_unused]] Is... indices) const + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ int operator()([[maybe_unused]] Is... indices) const { - return static_cast(detail::_internal_abs(get_value(op1_, indices...) - get_value(op2_, indices...)) <= + return static_cast(detail::_internal_abs(get_value(op1_, indices...) - get_value(op2_, indices...)) <= static_cast(atol_) + static_cast(rtol_) * detail::_internal_abs(get_value(op2_, indices...))); } @@ -89,7 +89,7 @@ namespace matx if constexpr (is_matx_op()) { op2_.PreRun(std::forward(shape), std::forward(ex)); - } + } } template @@ -102,7 +102,7 @@ namespace matx if constexpr (is_matx_op()) { op2_.PostRun(std::forward(shape), std::forward(ex)); } - } + } private: typename detail::base_type_t op1_; @@ -116,9 +116,9 @@ namespace matx /** * @brief Returns an integer tensor where an element is 1 if: * abs(op1 - op2) <= atol + rtol * abs(op2) - * + * * or 0 otherwise - * + * * @tparam Op1 First operator type * @tparam Op2 Second operator type * @param op1 First operator diff --git a/include/matx/operators/permute.h b/include/matx/operators/permute.h index cebb62dc3..0732b3aec 100644 --- a/include/matx/operators/permute.h +++ b/include/matx/operators/permute.h @@ -84,30 +84,30 @@ namespace matx // convert variadic type to tuple so we can read/update cuda::std::array inds{indices...}; -IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") +MATX_IGNORE_WARNING_PUSH_GCC("-Wmaybe-uninitialized") cuda::std::array ind; -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC #if 0 //This causes register spills but might be faster if Rank is large #pragma unroll - for(int32_t i = 0; i < Rank(); i++) { + for(int32_t i = 0; i < Rank(); i++) { ind[dims_[i]] = inds[i]; } #else #pragma unroll // use double loop to avoid register spills - for(int32_t i = 0; i < Rank(); i++) { + for(int32_t i = 0; i < Rank(); i++) { #pragma unroll - for(int32_t j = 0; j < Rank(); j++) { + for(int32_t j = 0; j < Rank(); j++) { if(dims[j] == i) { ind[i] = inds[j]; - } + } } } -#endif +#endif return get_value(cuda::std::forward(op), ind); - } + } template __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... 
indices) const diff --git a/include/matx/operators/set.h b/include/matx/operators/set.h index 898857061..d74d47a5b 100644 --- a/include/matx/operators/set.h +++ b/include/matx/operators/set.h @@ -101,7 +101,7 @@ class set : public BaseOp> { // set() is a placeholder when using mtie() for multiple return types, so we don't need to check compatible // sizes if constexpr (!is_mtie()) { - ASSERT_COMPATIBLE_OP_SIZES(op); + MATX_ASSERT_COMPATIBLE_OP_SIZES(op); } } @@ -152,18 +152,18 @@ class set : public BaseOp> { { if constexpr (is_matx_op()) { out_.PreRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { op_.PreRun(std::forward(shape), std::forward(ex)); } } template - __MATX_INLINE__ void PostRun(ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void PostRun(ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { out_.PostRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { op_.PostRun(std::forward(shape), std::forward(ex)); } diff --git a/include/matx/operators/shift.h b/include/matx/operators/shift.h index 4685e9e0d..cebab1acc 100644 --- a/include/matx/operators/shift.h +++ b/include/matx/operators/shift.h @@ -40,7 +40,7 @@ namespace matx { /** * Shifts the indexing of an operator to move the array forward or backward by the - * shift amount. + * shift amount. * * ShiftOp allows adjusting the relative view of a tensor to start at a * new offset. This may be useful to cut off part of a tensor that is @@ -68,27 +68,27 @@ namespace matx #pragma unroll for (int i = 0; i < Rank(); i++) { index_t size1 = detail::get_expanded_size(op_, i); - index_t size2 = detail::get_expanded_size(shift_, i); + index_t size2 = detail::get_expanded_size(shift_, i); sizes_[i] = detail::matx_max(size1,size2); } - ASSERT_COMPATIBLE_OP_SIZES(shift_); - ASSERT_COMPATIBLE_OP_SIZES(op_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(shift_); + MATX_ASSERT_COMPATIBLE_OP_SIZES(op_); } template static __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) get_impl( - Op&& op, - const Sizes &sizes, - ShiftType shiftin, + Op&& op, + const Sizes &sizes, + ShiftType shiftin, Is... indices) - { + { cuda::std::array idx{indices...}; index_t shift = -get_value(shiftin, indices...); shift = (shift + idx[DIM]) % sizes[DIM]; - if (shift < 0) { + if (shift < 0) { shift += sizes[DIM]; } @@ -96,12 +96,12 @@ namespace matx return get_value(cuda::std::forward(op), idx); } - + template - __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const { return get_impl(cuda::std::as_const(op_), sizes_, shift_, indices...); - } + } template __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... 
indices) @@ -123,7 +123,7 @@ namespace matx if constexpr (is_matx_op()) { op_.PostRun(std::forward(shape), std::forward(ex)); } - } + } static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { @@ -137,24 +137,24 @@ namespace matx ~ShiftOp() = default; ShiftOp(const ShiftOp &rhs) = default; - __MATX_INLINE__ auto operator=(const self_type &rhs) { - return set(*this, rhs); - } + __MATX_INLINE__ auto operator=(const self_type &rhs) { + return set(*this, rhs); + } - template - __MATX_INLINE__ auto operator=(const R &rhs) { + template + __MATX_INLINE__ auto operator=(const R &rhs) { if constexpr (is_matx_transform_op()) { return mtie(*this, rhs); } - else { - return set(*this, rhs); + else { + return set(*this, rhs); } } private: typename detail::base_type_t op_; cuda::std::array sizes_; - typename detail::base_type_t shift_; + typename detail::base_type_t shift_; }; } /** @@ -206,7 +206,7 @@ namespace matx * * @param s * Amount to shift forward - * + * * @param shifts * list of shift amounts * @returns diff --git a/include/matx/operators/sph2cart.h b/include/matx/operators/sph2cart.h index cd25c0139..bba6a41e4 100644 --- a/include/matx/operators/sph2cart.h +++ b/include/matx/operators/sph2cart.h @@ -54,18 +54,18 @@ namespace matx using matxop = bool; using value_type = typename T1::value_type; - __MATX_INLINE__ std::string str() const { return "sph2cart(" + get_type_str(theta_) + + __MATX_INLINE__ std::string str() const { return "sph2cart(" + get_type_str(theta_) + "," + get_type_str(phi_) + "," + get_type_str(r_) + ")"; } __MATX_INLINE__ Sph2CartOp(const T1 &theta, const T2 &phi, const T3 &r) : theta_(theta), phi_(phi), r_(r) { - ASSERT_COMPATIBLE_OP_SIZES(theta); - ASSERT_COMPATIBLE_OP_SIZES(phi); - ASSERT_COMPATIBLE_OP_SIZES(r); + MATX_ASSERT_COMPATIBLE_OP_SIZES(theta); + MATX_ASSERT_COMPATIBLE_OP_SIZES(phi); + MATX_ASSERT_COMPATIBLE_OP_SIZES(r); } template - __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(Is... indices) const + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(Is... 
indices) const { [[maybe_unused]] auto theta = get_value(theta_, indices...); [[maybe_unused]] auto phi = get_value(phi_, indices...); @@ -122,7 +122,7 @@ namespace matx index_t size1 = get_expanded_size(theta_, dim); index_t size2 = get_expanded_size(phi_, dim); index_t size3 = get_expanded_size(r_, dim); - return detail::matx_max(size1, size2, size3); + return detail::matx_max(size1, size2, size3); } }; } @@ -140,7 +140,7 @@ namespace matx * * @param theta * Operator defining theta - * + * * @param phi * Operator defining phi * @@ -153,7 +153,7 @@ namespace matx template auto __MATX_INLINE__ sph2cart(const T1 &theta, const T2 &phi, const T3 &r) { - return cuda::std::tuple{ + return cuda::std::tuple{ detail::Sph2CartOp(theta, phi, r), detail::Sph2CartOp(theta, phi, r), detail::Sph2CartOp(theta, phi, r)}; diff --git a/include/matx/operators/unary_operators.h b/include/matx/operators/unary_operators.h index e0a3d9c71..dbf1afdb0 100644 --- a/include/matx/operators/unary_operators.h +++ b/include/matx/operators/unary_operators.h @@ -37,15 +37,15 @@ #include "matx/operators/scalar_ops.h" #include "matx/operators/base_operator.h" -#define DEFINE_UNARY_OP(FUNCTION, TENSOR_OP) \ +#define MATX_DEFINE_UNARY_OP(FUNCTION, TENSOR_OP) \ template ()>> \ - [[nodiscard]] __MATX_INLINE__ auto FUNCTION(const I1 &i1) \ + [[nodiscard]] __MATX_INLINE__ auto FUNCTION(const I1 &i1) \ { \ - using I1Type = extract_value_type_t; \ + using I1Type = extract_value_type_t; \ using Op = TENSOR_OP; \ - const typename detail::base_type_t &base = i1; \ - return detail::matxUnaryOp(base, Op()); \ + const typename detail::base_type_t &base = i1; \ + return detail::matxUnaryOp(base, Op()); \ } @@ -83,8 +83,8 @@ namespace matx { return cuda::std::apply([&](auto &&...args) { return this->operator()(args...); - }, idx); - } + }, idx); + } template ...>, bool> = true> __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... 
indices) const @@ -110,7 +110,7 @@ namespace matx } template - __MATX_INLINE__ void PostRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept + __MATX_INLINE__ void PostRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept { in1_.PostRun(std::forward(shape), std::forward(ex)); } @@ -337,52 +337,52 @@ namespace matx * @param t * Input operator */ - Op real(Op t) {} + Op real(Op t) {} /** * Return imaginary components of an operator * @param t * Input operator */ - Op imag(Op t) {} + Op imag(Op t) {} /** * Returns a truth value if operator value is NaN * @param t * Input operator */ - Op isnan(Op t) {} + Op isnan(Op t) {} /** * Returns a truth value if operator value is infinite * @param x * Input operator */ - Op isinf( Op x) {} + Op isinf( Op x) {} /** * Returns values from the standard normal cumulative distribution function * @param x * Input operator */ - Op normcdf( Op x) {} + Op normcdf( Op x) {} #else - DEFINE_UNARY_OP(sqrt, detail::SqrtOp); - DEFINE_UNARY_OP(csqrt, detail::CsqrtOp); - DEFINE_UNARY_OP(rsqrt, detail::RSqrtOp); - DEFINE_UNARY_OP(exp, detail::ExpOp); - DEFINE_UNARY_OP(expj, detail::ExpjOp); - DEFINE_UNARY_OP(log10, detail::Log10Op); - DEFINE_UNARY_OP(log2, detail::Log2Op); - DEFINE_UNARY_OP(log, detail::LogOp); - DEFINE_UNARY_OP(loge, detail::LogOp); + MATX_DEFINE_UNARY_OP(sqrt, detail::SqrtOp); + MATX_DEFINE_UNARY_OP(csqrt, detail::CsqrtOp); + MATX_DEFINE_UNARY_OP(rsqrt, detail::RSqrtOp); + MATX_DEFINE_UNARY_OP(exp, detail::ExpOp); + MATX_DEFINE_UNARY_OP(expj, detail::ExpjOp); + MATX_DEFINE_UNARY_OP(log10, detail::Log10Op); + MATX_DEFINE_UNARY_OP(log2, detail::Log2Op); + MATX_DEFINE_UNARY_OP(log, detail::LogOp); + MATX_DEFINE_UNARY_OP(loge, detail::LogOp); #if 0 - DEFINE_UNARY_OP(conj, detail::ConjOp); + MATX_DEFINE_UNARY_OP(conj, detail::ConjOp); #else // implementing without a macro so we can optimize conj(real) - template ()>> + template ()>> [[nodiscard]] __MATX_INLINE__ auto conj(I1 i1) { using I1Type = extract_value_type_t; if constexpr (is_complex_v) { @@ -395,31 +395,31 @@ namespace matx } } #endif - DEFINE_UNARY_OP(abs, detail::AbsOp); - DEFINE_UNARY_OP(abs2, detail::Abs2Op); - DEFINE_UNARY_OP(sin, detail::SinOp); - DEFINE_UNARY_OP(cos, detail::CosOp); - DEFINE_UNARY_OP(tan, detail::TanOp); - DEFINE_UNARY_OP(asin, detail::AsinOp); - DEFINE_UNARY_OP(acos, detail::AcosOp); - DEFINE_UNARY_OP(atan, detail::AtanOp); - DEFINE_UNARY_OP(sinh, detail::SinhOp); - DEFINE_UNARY_OP(cosh, detail::CoshOp); - DEFINE_UNARY_OP(tanh, detail::TanhOp); - DEFINE_UNARY_OP(asinh, detail::AsinhOp); - DEFINE_UNARY_OP(acosh, detail::AcoshOp); - DEFINE_UNARY_OP(atanh, detail::AtanhOp); - DEFINE_UNARY_OP(angle, detail::AngleOp); - DEFINE_UNARY_OP(floor, detail::FloorOp); - DEFINE_UNARY_OP(ceil, detail::CeilOp); - DEFINE_UNARY_OP(round, detail::RoundOp); - DEFINE_UNARY_OP(normcdf, detail::NormCdfOp); + MATX_DEFINE_UNARY_OP(abs, detail::AbsOp); + MATX_DEFINE_UNARY_OP(abs2, detail::Abs2Op); + MATX_DEFINE_UNARY_OP(sin, detail::SinOp); + MATX_DEFINE_UNARY_OP(cos, detail::CosOp); + MATX_DEFINE_UNARY_OP(tan, detail::TanOp); + MATX_DEFINE_UNARY_OP(asin, detail::AsinOp); + MATX_DEFINE_UNARY_OP(acos, detail::AcosOp); + MATX_DEFINE_UNARY_OP(atan, detail::AtanOp); + MATX_DEFINE_UNARY_OP(sinh, detail::SinhOp); + MATX_DEFINE_UNARY_OP(cosh, detail::CoshOp); + MATX_DEFINE_UNARY_OP(tanh, detail::TanhOp); + MATX_DEFINE_UNARY_OP(asinh, detail::AsinhOp); + MATX_DEFINE_UNARY_OP(acosh, detail::AcoshOp); + MATX_DEFINE_UNARY_OP(atanh, 
detail::AtanhOp); + MATX_DEFINE_UNARY_OP(angle, detail::AngleOp); + MATX_DEFINE_UNARY_OP(floor, detail::FloorOp); + MATX_DEFINE_UNARY_OP(ceil, detail::CeilOp); + MATX_DEFINE_UNARY_OP(round, detail::RoundOp); + MATX_DEFINE_UNARY_OP(normcdf, detail::NormCdfOp); #if 0 - DEFINE_UNARY_OP(real, detail::RealOp); + MATX_DEFINE_UNARY_OP(real, detail::RealOp); #else // implementing without a macro so we can optimize away real on a real operator - template ()>> + template ()>> [[nodiscard]] __MATX_INLINE__ auto real(I1 i1) { using I1Type = extract_value_type_t; if constexpr (is_complex_v) { @@ -432,10 +432,10 @@ namespace matx } } #endif - DEFINE_UNARY_OP(imag, detail::ImagOp); - DEFINE_UNARY_OP(operator-, detail::SubNegOp ); - DEFINE_UNARY_OP(isnan, detail::IsNanOp); - DEFINE_UNARY_OP(isinf, detail::IsInfOp); + MATX_DEFINE_UNARY_OP(imag, detail::ImagOp); + MATX_DEFINE_UNARY_OP(operator-, detail::SubNegOp ); + MATX_DEFINE_UNARY_OP(isnan, detail::IsNanOp); + MATX_DEFINE_UNARY_OP(isinf, detail::IsInfOp); #endif } // end namespace matx diff --git a/include/matx/transforms/eig/eig_lapack.h b/include/matx/transforms/eig/eig_lapack.h index 0b56948ab..0340e43ba 100644 --- a/include/matx/transforms/eig/eig_lapack.h +++ b/include/matx/transforms/eig/eig_lapack.h @@ -138,7 +138,7 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t { syevd_dispatch("V", ¶ms.uplo, ¶ms.n, nullptr, ¶ms.n, nullptr, &work_query, &this->lwork, &rwork_query, &this->lrwork, &iwork_query, &this->liwork, &info); - + MATX_ASSERT_STR_EXP(info, 0, matxSolverError, ("Parameter " + std::to_string(-info) + " had an illegal value in LAPACK syevd workspace query").c_str()); @@ -205,7 +205,7 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t { reinterpret_cast(this->rwork), &this->lrwork, reinterpret_cast(this->iwork), &this->liwork, &info); - MATX_ASSERT_STR_EXP(info, 0, matxSolverError, + MATX_ASSERT_STR_EXP(info, 0, matxSolverError, (std::to_string(info) + " off-diagonal elements of an intermediate tridiagonal form did not converge to zero in LAPACK syevd").c_str()); } } @@ -231,7 +231,7 @@ class matxDnEigHostPlan_t : matxDnHostSolver_t { const lapack_int_t* liwork_in, lapack_int_t* info) { // TODO: remove warning suppression once syevd is optimized in NVPL LAPACK -IGNORE_WARNING_PUSH_GCC("-Wdeprecated-declarations") +MATX_IGNORE_WARNING_PUSH_GCC("-Wdeprecated-declarations") if constexpr (std::is_same_v) { LAPACK_CALL(ssyevd)(jobz, uplo, n, a, lda, w, work_in, lwork_in, iwork_in, liwork_in, info); } else if constexpr (std::is_same_v) { @@ -241,7 +241,7 @@ IGNORE_WARNING_PUSH_GCC("-Wdeprecated-declarations") } else if constexpr (std::is_same_v>) { LAPACK_CALL(zheevd)(jobz, uplo, n, a, lda, w, work_in, lwork_in, rwork_in, lrwork_in, iwork_in, liwork_in, info); } -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } std::vector batch_w_ptrs; diff --git a/include/matx/transforms/matmul/matmul_cblas.h b/include/matx/transforms/matmul/matmul_cblas.h index e8e9a6aa9..1a661a908 100644 --- a/include/matx/transforms/matmul/matmul_cblas.h +++ b/include/matx/transforms/matmul/matmul_cblas.h @@ -354,7 +354,7 @@ __MATX_INLINE__ void matmul_exec(TensorTypeC &c, #elif defined(MATX_EN_BLIS) bli_thread_set_num_threads(exec.GetNumThreads()); #endif - + total_iter *= params.batch; for (size_t iter = 0; iter < total_iter; iter++) { // Get pointers into A/B/C for this round @@ -397,7 +397,7 @@ __MATX_INLINE__ void matmul_exec(TensorTypeC &c, } template -__MATX_INLINE__ void matmul_dispatch(TensorTypeC &c, +__MATX_INLINE__ void matmul_dispatch(TensorTypeC &c, const 
TensorTypeA &a, const TensorTypeB &b, const float alpha, @@ -452,7 +452,7 @@ __MATX_INLINE__ auto getCBLASSupportedTensor( const Op &in) { return true; } }; - + return GetSupportedTensor(in, support_func, MATX_HOST_MALLOC_MEMORY); } diff --git a/include/matx/transforms/solver_common.h b/include/matx/transforms/solver_common.h index 3a78225aa..51599d1a3 100644 --- a/include/matx/transforms/solver_common.h +++ b/include/matx/transforms/solver_common.h @@ -55,7 +55,7 @@ namespace matx { #define LAPACK_CALL(fn) LAPACK_##fn #else using lapack_int_t = index_t; -#endif +#endif /* Parameter enums */ @@ -159,18 +159,18 @@ enum class BatchType { /** * @brief Sets batch pointers for a batched tensor of arbitrary rank. - * + * * Clears the given batch pointers vector and then populates it * with pointers to the data of the tensor for batched operations. * Handles both batched matrices and vectors. - * + * * @tparam BTYPE * Whether the input is a batch of matrices or vectors * @tparam TensorType * Type of input tensor a * @tparam PointerType * Tensor value type - * + * * @param a * The tensor for which batch pointers are to be set. * @param batch_ptrs @@ -182,7 +182,7 @@ __MATX_INLINE__ void SetBatchPointers(const TensorType &a, std::vector 0) { matxAlloc(&h_workspace, batches * hspace, MATX_HOST_MEMORY); - } + } } } @@ -303,15 +303,15 @@ class matxDnCUDASolver_t { /** * Dense LAPACK base class that all dense host solver types inherit common methods * and structures from. Depending on the decomposition, it may require different - * types of workspace arrays. + * types of workspace arrays. * * @tparam ValueType * Input tensor type - * + * */ template class matxDnHostSolver_t { - + public: matxDnHostSolver_t() { diff --git a/include/matx/transforms/svd/svd_cuda.h b/include/matx/transforms/svd/svd_cuda.h index 3c7b017a0..720249e8f 100644 --- a/include/matx/transforms/svd/svd_cuda.h +++ b/include/matx/transforms/svd/svd_cuda.h @@ -75,7 +75,7 @@ inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) { auto RShape = A.Shape(); RShape[RANK-1] = d; - RShape[RANK-2] = d; + RShape[RANK-2] = d; cuda::std::array l2NormShape; for(int i=0;i(RShape, MATX_ASYNC_DEVICE_MEMORY, stream); auto Z = make_tensor(QShape, MATX_ASYNC_DEVICE_MEMORY, stream); auto l2Norm = make_tensor(l2NormShape, MATX_ASYNC_DEVICE_MEMORY, stream); - auto converged = make_tensor({}, MATX_ASYNC_DEVICE_MEMORY, stream); + auto converged = make_tensor({}, MATX_ASYNC_DEVICE_MEMORY, stream); return cuda::std::tuple(AT, Q, Qold, R, Z, l2Norm, converged); } @@ -124,7 +124,7 @@ inline auto svdbpi_impl_workspace(const AType &A, cudaStream_t stream) { * Input tensor or operator signaling the initial guess for x0 at each power iteration. A * Random tensor of size batches x min(n,m) is suggested. * @param iterations - * The number of power iterations to perform for each singular value. + * The number of power iterations to perform for each singular value. 
* @param exec * CUDA executor * @param k @@ -146,7 +146,7 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat auto m = A.Size(RANK-2); // rows auto n = A.Size(RANK-1); // cols auto d = cuda::std::min(n,m); // dim for AAT or ATA - + // if sentinal found get all singularvalues if( k == -1 ) k = (int) d; @@ -156,7 +156,7 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat MATX_ASSERT_STR(VT.Size(i) == A.Size(i), matxInvalidDim, "svdpi: VT and A must have the same batch sizes"); MATX_ASSERT_STR(S.Size(i) == A.Size(i), matxInvalidDim, "svdpi: S and A must have the same batch sizes"); } - + MATX_ASSERT_STR(U.Size(RANK-2) == m, matxInvalidDim, "svdpi: U must have Size(RANK-2) == m"); MATX_ASSERT_STR(U.Size(RANK-1) == k, matxInvalidDim, "svdpi: U must have Size(RANK-1) == k"); MATX_ASSERT_STR(VT.Size(RANK-2) == k, matxInvalidDim, "svdpi: VT must have Size(RANK-2) == k"); @@ -199,7 +199,7 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat auto uv = make_tensor(AShape, MATX_ASYNC_DEVICE_MEMORY, stream); auto AT = make_tensor(ATShape, MATX_ASYNC_DEVICE_MEMORY, stream); auto xm = make_tensor(xmShape, MATX_ASYNC_DEVICE_MEMORY, stream); - + // we shouldn't need sums but cub doesn't support strided tensors so we cannot write directly at this time. auto sums = make_tensor(sumsShape, MATX_ASYNC_DEVICE_MEMORY, stream); auto x = slice(xm, xSliceB, xSliceE); @@ -341,7 +341,7 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat (u = u / clone(s, sCloneShape)).run(stream); } // end ufirst - + // Remove current singular vectors from matrix if(i < k - 1) { @@ -384,7 +384,7 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat * @param A * Input tensor or operator for tensor A input with size "batches by m by n" * @param max_iters - * The approximate maximum number of QR iterations to perform. + * The approximate maximum number of QR iterations to perform. * @param tol * The termination tolerance for the QR iteration. Setting this to 0 will skip the tolerance check. * @param exec @@ -466,7 +466,7 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int max_ //compute L2(Q-Qold) // sqrt folded into next operation - (l2Norm = sum(abs2(Q-Qold))).run(stream); + (l2Norm = sum(abs2(Q-Qold))).run(stream); // compute if all batches have converged if constexpr (RANK > 2) { @@ -474,14 +474,14 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int max_ } else { (converged = as_int(sqrt(l2Norm) < tol)).run(stream); } - + // event to record when converged is ready in stream cudaEventRecord(event, stream); // wait for d2h transfer until converged is ready cudaStreamWaitEvent(d2h, event); - // copy convergence criteria to host. - // This is in unpinned memory and cannot on most systems run asynchronously. + // copy convergence criteria to host. + // This is in unpinned memory and cannot on most systems run asynchronously. // We do this here to hide the copy/sync behind prior launch latency/execution of next iteration. 
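      // (General CUDA note elaborating the comment above, not part of this change:
      // with pageable host memory the runtime may stage/serialize the copy, which
      // is why it is issued early and hidden behind the next iteration's launches.
      // A copy that can run fully asynchronously would need a pinned destination,
      // e.g. — illustrative names only:
      //   int *converged_host_pinned;
      //   cudaMallocHost(&converged_host_pinned, sizeof(int));
      //   cudaMemcpyAsync(converged_host_pinned, converged.Data(), sizeof(int),
      //                   cudaMemcpyDeviceToHost, d2h);
      // The cudaStreamWaitEvent above still orders the copy after the flag write
      // without stalling work queued later on the compute stream.)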
cudaMemcpyAsync(&converged_host, converged.Data(), sizeof(int), cudaMemcpyDeviceToHost, d2h); } @@ -498,7 +498,7 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int max_ DShape[RANK-2] = m; auto D = clone(S, DShape); - // normalize U by singular values + // normalize U by singular values // IF required to avoid nans when singular value is 0 (IF(D != STypeS(0), U = U / D)).run(stream); @@ -548,7 +548,7 @@ struct DnSVDCUDAParams_t { template static __MATX_INLINE__ SVDMethod GetCUDASVDMethod(const ATensor &a) { - static constexpr int RANK = ATensor::Rank(); + static constexpr int RANK = ATensor::Rank(); index_t m = a.Size(RANK - 2); index_t n = a.Size(RANK - 1); @@ -556,16 +556,16 @@ static __MATX_INLINE__ SVDMethod GetCUDASVDMethod(const ATensor &a) { SVDMethod method = detail::SVDMethod::GESVD; if (a.Rank() != 2) { - if (a.Size(RANK-2) <= 32 && + if (a.Size(RANK-2) <= 32 && a.Size(RANK-1) <= 32) { if constexpr (is_tensor_view_v) { - #if !defined(INDEX_32_BIT) + #if !defined(MATX_INDEX_32_BIT) if (a.Stride(0) < std::numeric_limits::max()) { method = detail::SVDMethod::GESVDJ_BATCHED; } #else method = detail::SVDMethod::GESVDJ_BATCHED; - #endif + #endif } else { method = detail::SVDMethod::GESVDJ_BATCHED; @@ -651,7 +651,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnCreateGesvdjInfo"); ret = cusolverDnXgesvdjSetTolerance(batch_params, 1e-9); - MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnXgesvdjSetTolerance"); + MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnXgesvdjSetTolerance"); ret = cusolverDnXgesvdjSetMaxSweeps(batch_params, 15); MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "Failure in cusolverDnXgesvdjSetMaxSweeps"); @@ -666,7 +666,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { cusolverStatus_t ret; // Use all mode for a larger workspace size that works for all modes - if (!batched) { + if (params.method == SVDMethod::GESVD) { ret = cusolverDnXgesvd_bufferSize( this->handle, this->dn_params, 'A', 'A', params.m, params.n, MatXTypeToCudaType(), params.A, params.m, @@ -675,44 +675,45 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { MatXTypeToCudaType(), &this->dspace, &this->hspace); } else { + int i_dspace; + if constexpr (std::is_same_v) { ret = cusolverDnSgesvdjBatched_bufferSize( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(&this->dspace), batch_params, static_cast(params.batch_size)); + &i_dspace, batch_params, static_cast(params.batch_size)); } else if constexpr (std::is_same_v) { ret = cusolverDnDgesvdjBatched_bufferSize( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(&this->dspace), batch_params, static_cast(params.batch_size)); + &i_dspace, batch_params, static_cast(params.batch_size)); } else if constexpr (std::is_same_v, T1>) { 
ret = cusolverDnCgesvdjBatched_bufferSize( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(&this->dspace), batch_params, static_cast(params.batch_size)); + &i_dspace, batch_params, static_cast(params.batch_size)); } else if constexpr (std::is_same_v, T1>) { ret = cusolverDnZgesvdjBatched_bufferSize( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(&this->dspace), batch_params, static_cast(params.batch_size)); + &i_dspace, batch_params, static_cast(params.batch_size)); } else { MATX_THROW(matxInvalidType, "Invalid data type passed to svd()"); - } + } - int dspace_tmp = *reinterpret_cast(&this->dspace); - this->dspace = dspace_tmp; + this->dspace = i_dspace; this->hspace = 0; } @@ -722,7 +723,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { static DnSVDCUDAParams_t GetSVDParams(UTensor &u, STensor &s, - VtTensor &vt, const ATensor &a, + VtTensor &vt, const ATensor &a, const char jobz, const cudaExecutor &exec) { DnSVDCUDAParams_t params; @@ -773,7 +774,7 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { SetBatchPointers(vt, this->batch_vt_ptrs); SetBatchPointers(s, this->batch_s_ptrs); - const int64_t ldvt = vt.Size(RANK-2); + const int64_t ldvt = vt.Size(RANK-2); if (params.method == SVDMethod::GESVD) { // At this time cuSolver does not have a batched 64-bit SVD interface. 
Change @@ -797,9 +798,9 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { ret = cusolverDnSgesvdjBatched( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(this->d_workspace), static_cast(this->dspace), + reinterpret_cast(this->d_workspace), static_cast(this->dspace), this->d_info, batch_params, static_cast(params.batch_size)); } else if constexpr (std::is_same_v) { @@ -808,30 +809,30 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { reinterpret_cast(params.A), static_cast(params.m), reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(this->d_workspace), static_cast(this->dspace), + reinterpret_cast(this->d_workspace), static_cast(this->dspace), this->d_info, batch_params, static_cast(params.batch_size)); } else if constexpr (std::is_same_v, T1>) { ret = cusolverDnCgesvdjBatched( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(this->d_workspace), static_cast(this->dspace), + reinterpret_cast(this->d_workspace), static_cast(this->dspace), this->d_info, batch_params, static_cast(params.batch_size)); } else if constexpr (std::is_same_v, T1>) { ret = cusolverDnZgesvdjBatched( this->handle, CUSOLVER_EIG_MODE_VECTOR, static_cast(params.m), static_cast(params.n), reinterpret_cast(params.A), static_cast(params.m), - reinterpret_cast(params.S), reinterpret_cast(params.U), + reinterpret_cast(params.S), reinterpret_cast(params.U), static_cast(params.m), reinterpret_cast(params.VT), static_cast(params.n), - reinterpret_cast(this->d_workspace), static_cast(this->dspace), + reinterpret_cast(this->d_workspace), static_cast(this->dspace), this->d_info, batch_params, static_cast(params.batch_size)); } else { MATX_THROW(matxInvalidType, "Invalid data type passed to svd()"); - } + } MATX_ASSERT_STR_EXP(ret, CUSOLVER_STATUS_SUCCESS, matxSolverError, "cusolverDnSgesvdjBatched failed"); } @@ -850,10 +851,10 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { MATX_ASSERT_STR_EXP(info, 0, matxSolverError, ("Parameter " + std::to_string(-info) + " had an illegal value in cuSolver Xgesvd").c_str()); } else { - MATX_ASSERT_STR_EXP(info, 0, matxSolverError, + MATX_ASSERT_STR_EXP(info, 0, matxSolverError, (std::to_string(info) + " superdiagonals of an intermediate bidiagonal form did not converge to zero in cuSolver Xgesvd").c_str()); } - } + } } /** @@ -871,7 +872,6 @@ class matxDnSVDCUDAPlan_t : matxDnCUDASolver_t { std::vector batch_vt_ptrs; gesvdjInfo_t batch_params = nullptr; DnSVDCUDAParams_t params; - bool batched; }; /** @@ -893,10 +893,10 @@ struct DnSVDCUDAParamsKeyHash { struct DnSVDCUDAParamsKeyEq { bool operator()(const DnSVDCUDAParams_t &l, const DnSVDCUDAParams_t &t) const noexcept { - return l.n == t.n && - l.m == t.m && - l.batch_size == t.batch_size && - l.dtype == t.dtype && + return l.n == t.n && + l.m == t.m && + l.batch_size == t.batch_size && + l.dtype == t.dtype && l.exec.getStream() == t.exec.getStream(); } 
}; @@ -961,7 +961,7 @@ void svd_impl(UTensor &&u, STensor &&s, const char job_cusolver = detail::SVDModeToChar(jobz); const bool m_leq_n = a.Size(RANK-2) <= a.Size(RANK-1); - const auto method = GetCUDASVDMethod(a); + const auto method = GetCUDASVDMethod(a); // The power iteration method is a custom kernel so we don't need to test the types if (method == detail::SVDMethod::POW_ITER) { @@ -973,8 +973,8 @@ void svd_impl(UTensor &&u, STensor &&s, T1 *tp; auto a_shape = a.Shape(); auto a_total_size = std::accumulate(a_shape.begin(), a_shape.begin() + ATensor::Rank(), 1, std::multiplies()); - matxAlloc(reinterpret_cast(&tp), sizeof(T1) * a_total_size, MATX_ASYNC_DEVICE_MEMORY, stream); - + matxAlloc(reinterpret_cast(&tp), sizeof(T1) * a_total_size, MATX_ASYNC_DEVICE_MEMORY, stream); + if (m_leq_n) { // get col-major AT auto a_new = make_tensor(tp, a_shape); @@ -987,7 +987,7 @@ void svd_impl(UTensor &&u, STensor &&s, auto vt_new = getSolverSupportedTensor(vt, exec); // swap U and VT. svd(AT) = V*S*UT - // svd(AT) = V*S*UT + // svd(AT) = V*S*UT // Need the tensors to appear like V and UT since that is expected based on AT view // inputted, although the results when read as row-major are VT and U. auto u_in = transpose_matrix(vt_new); diff --git a/include/matx/transforms/svd/svd_lapack.h b/include/matx/transforms/svd/svd_lapack.h index 0621bd1d4..14d04d7f2 100644 --- a/include/matx/transforms/svd/svd_lapack.h +++ b/include/matx/transforms/svd/svd_lapack.h @@ -184,7 +184,7 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t { // the real part of the first elem of work holds the optimal lwork. if constexpr (is_complex_v) { this->lwork = static_cast(work_query.real()); - + lapack_int_t mnthr = (mn * 5) / 3; if (mx >= mnthr) { // mx >> mn condition in LAPACK this->lrwork = 5*mn*mn + 5*mn; @@ -263,7 +263,7 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t { reinterpret_cast(this->rwork), &info); - MATX_ASSERT_STR_EXP(info, 0, matxSolverError, + MATX_ASSERT_STR_EXP(info, 0, matxSolverError, (std::to_string(info) + " superdiagonals of an intermediate bidiagonal form did not converge to zero in LAPACK").c_str()); } } else if (params.algo == SVDHostAlgo::DC) { @@ -277,7 +277,7 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t { reinterpret_cast(this->rwork), reinterpret_cast(this->iwork), &info); - MATX_ASSERT_STR_EXP(info, 0, matxSolverError, "gesdd error in LAPACK"); + MATX_ASSERT_STR_EXP(info, 0, matxSolverError, "gesdd error in LAPACK"); } } } @@ -300,7 +300,7 @@ class matxDnSVDHostPlan_t : matxDnHostSolver_t { const lapack_int_t *lwork_in, [[maybe_unused]] T3 *rwork_in, lapack_int_t *info) { // TODO: remove warning suppression once gesvd is optimized in NVPL LAPACK -IGNORE_WARNING_PUSH_GCC("-Wdeprecated-declarations") +MATX_IGNORE_WARNING_PUSH_GCC("-Wdeprecated-declarations") if constexpr (std::is_same_v) { LAPACK_CALL(sgesvd)(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, info); } else if constexpr (std::is_same_v) { @@ -310,7 +310,7 @@ IGNORE_WARNING_PUSH_GCC("-Wdeprecated-declarations") } else if constexpr (std::is_same_v>) { LAPACK_CALL(zgesvd)(jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, rwork_in, info); } -IGNORE_WARNING_POP_GCC +MATX_IGNORE_WARNING_POP_GCC } /** @@ -332,7 +332,7 @@ IGNORE_WARNING_POP_GCC LAPACK_CALL(zgesdd)(jobz, m, n, a, lda, s, u, ldu, vt, ldvt, work_in, lwork_in, rwork_in, iwork_in, info); } } - + std::vector batch_u_ptrs; std::vector batch_s_ptrs; std::vector batch_vt_ptrs; @@ -427,7 +427,7 @@ void svd_impl([[maybe_unused]] UTensor 
&&u, equivalent to col-major AT, and swapping the inputs U and VT. */ - // LAPACK destroys the input, so we need to make a copy of A regardless + // LAPACK destroys the input, so we need to make a copy of A regardless auto a_copy = make_tensor(a.Shape(), MATX_HOST_MALLOC_MEMORY); (a_copy = a).run(exec); auto at_col_maj = transpose_matrix(a_copy); diff --git a/test/00_io/PrintTests.cu b/test/00_io/PrintTests.cu index 3a9e66215..b07bb8e63 100644 --- a/test/00_io/PrintTests.cu +++ b/test/00_io/PrintTests.cu @@ -97,8 +97,9 @@ TEST_F(PrintTest, DefaultTest1) auto pft = get_print_format_type(); ASSERT_EQ(MATX_PRINT_FORMAT_DEFAULT, pft); + A1.set_name("A1 Matrix"); print_checker(A1, - "Tensor{complex} Rank: 1, Sizes:[16], Strides:[1]\n" + "A1 Matrix: Tensor{complex} Rank: 1, Sizes:[16], Strides:[1]\n" "000000: -9.2466e-01+9.9114e-01j \n" "000001: -4.2534e-01+1.0676e+00j \n" "000002: -2.6438e+00-6.2723e-01j \n" @@ -192,11 +193,11 @@ TEST_F(PrintTest, DefaultTest5) MATX_ENTER_HANDLER(); auto pft = get_print_format_type(); ASSERT_EQ(MATX_PRINT_FORMAT_DEFAULT, pft); - + auto testSlice = matx::slice<0>(A1, {0}, {matx::matxDropDim}); print_checker(testSlice, - "Tensor{complex} Rank: 0, Sizes:[], Strides:[]\n" + "tensor_0_f64c: Tensor{complex} Rank: 0, Sizes:[], Strides:[]\n" "-9.2466e-01+9.9114e-01j \n"); MATX_EXIT_HANDLER(); @@ -210,7 +211,7 @@ TEST_F(PrintTest, MlabTest1) ASSERT_EQ(MATX_PRINT_FORMAT_MLAB, pft); print_checker(A1, - "Tensor{complex} Rank: 1, Sizes:[16], Strides:[1]\n" + "tensor_1_f64c: Tensor{complex} Rank: 1, Sizes:[16], Strides:[1]\n" "[-9.2466e-01+9.9114e-01j ,\n" " -4.2534e-01+1.0676e+00j ,\n" " -2.6438e+00-6.2723e-01j ,\n" @@ -306,7 +307,7 @@ TEST_F(PrintTest, MlabTest5) auto testSlice = matx::slice<0>(A1, {0}, {matx::matxDropDim}); print_checker(testSlice, - "Tensor{complex} Rank: 0, Sizes:[], Strides:[]\n" + "tensor_0_f64c: Tensor{complex} Rank: 0, Sizes:[], Strides:[]\n" "-9.2466e-01+9.9114e-01j \n"); MATX_EXIT_HANDLER(); @@ -320,7 +321,7 @@ TEST_F(PrintTest, PythonTest1) ASSERT_EQ(MATX_PRINT_FORMAT_PYTHON, pft); print_checker(A1, - "Tensor{complex} Rank: 1, Sizes:[16], Strides:[1]\n" + "tensor_1_f64c: Tensor{complex} Rank: 1, Sizes:[16], Strides:[1]\n" "[-9.2466e-01+9.9114e-01j ,\n" " -4.2534e-01+1.0676e+00j ,\n" " -2.6438e+00-6.2723e-01j ,\n" @@ -412,7 +413,7 @@ TEST_F(PrintTest, PythonTest5) auto testSlice = matx::slice<0>(A1, {0}, {matx::matxDropDim}); print_checker(testSlice, - "Tensor{complex} Rank: 0, Sizes:[], Strides:[]\n" + "tensor_0_f64c: Tensor{complex} Rank: 0, Sizes:[], Strides:[]\n" "-9.2466e-01+9.9114e-01j \n"); MATX_EXIT_HANDLER();
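The updated expectations above include a tensor name in the print header, either the automatic tensor_<rank>_<type> name or one set explicitly via set_name(). A minimal usage sketch of that behavior — make_tensor, SetVals, set_name, and print are the calls these tests exercise indirectly; the exact header text is whatever the expectations above assert:

    #include "matx.h"

    int main() {
      auto A = matx::make_tensor<float>({4});
      A.SetVals({1.f, 2.f, 3.f, 4.f});
      A.set_name("A Vector");   // the name now leads the printed header
      matx::print(A);           // e.g. "A Vector: Tensor{...} Rank: 1, Sizes:[4], Strides:[1]" followed by the values
      return 0;
    }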