diff --git a/src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp b/src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp index ba30512..bf80607 100644 --- a/src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp +++ b/src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp @@ -25,7 +25,7 @@ template class xSpMdV: public clsparseFunc { public: - xSpMdV( PFCLSPARSETIMER sparseGetTimer, size_t profileCount, cl_device_type devType ): clsparseFunc( devType, CL_QUEUE_PROFILING_ENABLE ), gpuTimer( nullptr ), cpuTimer( nullptr ) + xSpMdV( PFCLSPARSETIMER sparseGetTimer, size_t profileCount, cl_bool extended_precision, cl_device_type devType ): clsparseFunc( devType, CL_QUEUE_PROFILING_ENABLE ), gpuTimer( nullptr ), cpuTimer( nullptr ) { // Create and initialize our timer class, if the external timer shared library loaded if( sparseGetTimer ) @@ -42,6 +42,7 @@ class xSpMdV: public clsparseFunc cpuTimerID = cpuTimer->getUniqueID( "CPU xSpMdV", 0 ); } + clsparseEnableExtendedPrecision( control, extended_precision ); clsparseEnableAsync( control, false ); } diff --git a/src/benchmarks/clsparse-bench/src/main.cpp b/src/benchmarks/clsparse-bench/src/main.cpp index b73b73b..62efa6a 100644 --- a/src/benchmarks/clsparse-bench/src/main.cpp +++ b/src/benchmarks/clsparse-bench/src/main.cpp @@ -120,14 +120,15 @@ int main( int argc, char *argv[ ] ) desc.add_options( ) ( "help,h", "produces this help message" ) ( "dirpath,d", po::value( &root_dir ), "Matrix directory" ) - ( "alpha", po::value( &alpha )->default_value( 1.0f ), "specifies the scalar alpha" ) - ( "beta", po::value( &beta )->default_value( 0.0f ), "specifies the scalar beta" ) + ( "alpha,a", po::value( &alpha )->default_value( 1.0f ), "specifies the scalar alpha" ) + ( "beta,b", po::value( &beta )->default_value( 0.0f ), "specifies the scalar beta" ) ( "rows", po::value( &rows )->default_value( 16 ), "specifies the number of rows for matrix data" ) ( "columns", po::value( &columns )->default_value( 16 ), "specifies the number of columns for matrix data" ) ( "function,f", po::value( &function )->default_value( "SpMdV" ), "Sparse functions to test. 
Options: " "SpMdV, SpMdM, CG, BiCGStab, Csr2Dense, Dense2Csr, Csr2Coo, Coo2Csr" ) ( "precision,r", po::value( &precision )->default_value( "s" ), "Options: s,d,c,z" ) - ( "profile,p", po::value( &profileCount )->default_value( 20 ), "Time and report the kernel speed (default: profiling off)" ) + ( "profile,p", po::value( &profileCount )->default_value( 20 ), "Number of times to run the desired test function" ) + ( "extended,e", po::bool_switch()->default_value(false), "Use compensated summation to improve accuracy by emulating extended precision" ) ; po::variables_map vm; @@ -160,6 +161,9 @@ int main( int argc, char *argv[ ] ) std::cerr << "Could not find the external timing library; timings disabled" << std::endl; } + cl_bool extended_precision = false; + if (vm["extended"].as()) + extended_precision = true; // Timer module discovered and loaded successfully // Initialize function pointers to call into the shared module @@ -170,9 +174,9 @@ int main( int argc, char *argv[ ] ) if( boost::iequals( function, "SpMdV" ) ) { if( precision == "s" ) - my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU ) ); + my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< float >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU ) ); else if( precision == "d" ) - my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU ) ); + my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< double >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU ) ); else { std::cerr << "Unknown spmdv precision" << std::endl; diff --git a/src/include/clSPARSE.h b/src/include/clSPARSE.h index 3d5c86a..38c2dd1 100644 --- a/src/include/clSPARSE.h +++ b/src/include/clSPARSE.h @@ -90,6 +90,9 @@ extern "C" { CLSPARSE_EXPORT clsparseStatus clsparseEnableAsync( clsparseControl control, cl_bool async ); + //enable/disable the use of compensated summation + CLSPARSE_EXPORT clsparseStatus + clsparseEnableExtendedPrecision( clsparseControl control, cl_bool async ); //setup events to sync //TODO:: NOT WORKING! NDRange throws Failure diff --git a/src/library/blas2/clsparse-csrmv.hpp b/src/library/blas2/clsparse-csrmv.hpp index 9a379cc..17dbd11 100644 --- a/src/library/blas2/clsparse-csrmv.hpp +++ b/src/library/blas2/clsparse-csrmv.hpp @@ -45,15 +45,11 @@ csrmv (const clsparseScalarPrivate *pAlpha, return clsparseStructInvalid; } - // We have problems with failing test cases with csrmv_adaptive on double precision - // fall back to csrmv_vector - if( typeid( T ) == typeid( cl_double ) ) - { - return csrmv_vector( pAlpha, pCsrMatx, pX, pBeta, pY, control ); - } + // Use this for csrmv_general instead of adaptive. + //return csrmv_vector( pAlpha, pCsrMatx, pX, pBeta, pY, control ); - // Call adaptive CSR kernels - return csrmv_adaptive( pAlpha, pCsrMatx, pX, pBeta, pY, control ); + // Call adaptive CSR kernels + return csrmv_adaptive( pAlpha, pCsrMatx, pX, pBeta, pY, control ); } } @@ -83,12 +79,8 @@ csrmv (const clsparse::array_base& pAlpha, return clsparseStructInvalid; } - // We have problems with failing test cases with csrmv_adaptive on double precision - // fall back to csrmv_vector - if( typeid( T ) == typeid( cl_double ) ) - { - return csrmv_vector( pAlpha, pCsrMatx, pX, pBeta, pY, control ); - } + // Use this for csrmv_general instead of adaptive. 
+ //return csrmv_vector( pAlpha, pCsrMatx, pX, pBeta, pY, control ); // Call adaptive CSR kernels return csrmv_adaptive( pAlpha, pCsrMatx, pX, pBeta, pY, control ); diff --git a/src/library/blas2/csrmv-adaptive.hpp b/src/library/blas2/csrmv-adaptive.hpp index d043b3c..95ff105 100644 --- a/src/library/blas2/csrmv-adaptive.hpp +++ b/src/library/blas2/csrmv-adaptive.hpp @@ -39,19 +39,33 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha, const cl_uint group_size = 256; std::string params = std::string( ) + + " -DINDEX_TYPE=uint" + " -DROWBITS=" + std::to_string( ROW_BITS ) + " -DWGBITS=" + std::to_string( WG_BITS ) - + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ); -#ifdef DOUBLE - buildFlags += " -DDOUBLE"; -#endif + + " -DWG_SIZE=" + std::to_string( group_size ) + + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ) + + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER ) + + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR ); + std::string options; if(typeid(T) == typeid(cl_double)) - { - std::string options = std::string() + " -DDOUBLE"; - params.append(options); - } - + options = std::string() + " -DVALUE_TYPE=double -DDOUBLE"; + else if(typeid(T) == typeid(cl_float)) + options = std::string() + " -DVALUE_TYPE=float"; + else if(typeid(T) == typeid(cl_uint)) + options = std::string() + " -DVALUE_TYPE=uint"; + else if(typeid(T) == typeid(cl_int)) + options = std::string() + " -DVALUE_TYPE=int"; + else if(typeid(T) == typeid(cl_ulong)) + options = std::string() + " -DVALUE_TYPE=ulong -DLONG"; + else if(typeid(T) == typeid(cl_long)) + options = std::string() + " -DVALUE_TYPE=long -DLONG"; + else + return clsparseInvalidKernelArgs; + + if(control->extended_precision) + options += " -DEXTENDED_PRECISION"; + params.append(options); cl::Kernel kernel = KernelCache::get( control->queue, "csrmv_adaptive", @@ -70,7 +84,10 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha, // if NVIDIA is used it does not allow to run the group size // which is not a multiplication of group_size. Don't know if that // have an impact on performance - cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size; + // Setting global work size to half the row block size because we are only + // using half the row blocks buffer for actual work. + // The other half is used for the extended precision reduction. + cl_uint global_work_size = ( (pCsrMatx->rowBlockSize/2) - 1 ) * group_size; cl::NDRange local( group_size ); cl::NDRange global( global_work_size > local[ 0 ] ? 
global_work_size : local[ 0 ] ); @@ -102,15 +119,33 @@ csrmv_adaptive( const clsparse::array_base& pAlpha, const cl_uint group_size = 256; std::string params = std::string( ) + + " -DINDEX_TYPE=uint" + " -DROWBITS=" + std::to_string( ROW_BITS ) + " -DWGBITS=" + std::to_string( WG_BITS ) - + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ); + + " -DWG_SIZE=" + std::to_string( group_size ) + + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ) + + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER ) + + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR ); + std::string options; if(typeid(T) == typeid(cl_double)) - { - std::string options = std::string() + " -DDOUBLE"; - params.append(options); - } + options = std::string() + " -DVALUE_TYPE=double -DDOUBLE"; + else if(typeid(T) == typeid(cl_float)) + options = std::string() + " -DVALUE_TYPE=float"; + else if(typeid(T) == typeid(cl_uint)) + options = std::string() + " -DVALUE_TYPE=uint"; + else if(typeid(T) == typeid(cl_int)) + options = std::string() + " -DVALUE_TYPE=int"; + else if(typeid(T) == typeid(cl_ulong)) + options = std::string() + " -DVALUE_TYPE=ulong -DLONG"; + else if(typeid(T) == typeid(cl_long)) + options = std::string() + " -DVALUE_TYPE=long -DLONG"; + else + return clsparseInvalidKernelArgs; + + if(control->extended_precision) + options += " -DEXTENDED_PRECISION"; + params.append(options); cl::Kernel kernel = KernelCache::get( control->queue, "csrmv_adaptive", @@ -129,7 +164,10 @@ csrmv_adaptive( const clsparse::array_base& pAlpha, // if NVIDIA is used it does not allow to run the group size // which is not a multiplication of group_size. Don't know if that // have an impact on performance - cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size; + // Setting global work size to half the row block size because we are only + // using half the row blocks buffer for actual work. + // The other half is used for the extended precision reduction. + cl_uint global_work_size = ( (pCsrMatx->rowBlockSize/2) - 1 ) * group_size; cl::NDRange local( group_size ); cl::NDRange global( global_work_size > local[ 0 ] ? 
global_work_size : local[ 0 ] ); diff --git a/src/library/blas2/csrmv-vector.hpp b/src/library/blas2/csrmv-vector.hpp index 2b8b637..406933f 100644 --- a/src/library/blas2/csrmv-vector.hpp +++ b/src/library/blas2/csrmv-vector.hpp @@ -48,7 +48,7 @@ csrmv_vector(const clsparseScalarPrivate* pAlpha, if (nnz_per_row < 8) { subwave_size = 4; } if (nnz_per_row < 4) { subwave_size = 2; } - const std::string params = std::string() + + std::string params = std::string() + "-DINDEX_TYPE=" + OclTypeTraits::type + " -DVALUE_TYPE=" + OclTypeTraits::type + " -DSIZE_TYPE=" + OclTypeTraits::type @@ -56,6 +56,8 @@ csrmv_vector(const clsparseScalarPrivate* pAlpha, + " -DWAVE_SIZE=" + std::to_string(wave_size) + " -DSUBWAVE_SIZE=" + std::to_string(subwave_size); + if(control->extended_precision) + params += " -DEXTENDED_PRECISION"; cl::Kernel kernel = KernelCache::get(control->queue, "csrmv_general", @@ -124,7 +126,7 @@ csrmv_vector(const clsparse::array_base& pAlpha, if (nnz_per_row < 8) { subwave_size = 4; } if (nnz_per_row < 4) { subwave_size = 2; } - const std::string params = std::string() + + std::string params = std::string() + "-DINDEX_TYPE=" + OclTypeTraits::type + " -DVALUE_TYPE=" + OclTypeTraits::type + " -DSIZE_TYPE=" + OclTypeTraits::type @@ -132,6 +134,8 @@ csrmv_vector(const clsparse::array_base& pAlpha, + " -DWAVE_SIZE=" + std::to_string(wave_size) + " -DSUBWAVE_SIZE=" + std::to_string(subwave_size); + if(control->extended_precision) + params += " -DEXTENDED_PRECISION"; cl::Kernel kernel = KernelCache::get(control->queue, "csrmv_general", diff --git a/src/library/include/clSPARSE-private.hpp b/src/library/include/clSPARSE-private.hpp index 73f71ef..89df7cc 100644 --- a/src/library/include/clSPARSE-private.hpp +++ b/src/library/include/clSPARSE-private.hpp @@ -42,5 +42,7 @@ const cl_uint WG_BITS = 24; const cl_uint ROW_BITS = 32; const cl_uint BLKSIZE = 1024; +const cl_uint BLOCK_MULTIPLIER = 3; +const cl_uint ROWS_FOR_VECTOR = 1; #endif diff --git a/src/library/internal/clsparse-control.cpp b/src/library/internal/clsparse-control.cpp index 36234b1..e3a6a25 100644 --- a/src/library/internal/clsparse-control.cpp +++ b/src/library/internal/clsparse-control.cpp @@ -104,6 +104,7 @@ clsparseCreateControl( cl_command_queue queue, clsparseStatus *status ) control->wavefront_size = 0; control->max_wg_size = 0; control->async = false; + control->extended_precision = false; collectEnvParams( control ); @@ -144,6 +145,18 @@ clsparseEnableAsync( clsparseControl control, cl_bool async ) return clsparseSuccess; } +clsparseStatus +clsparseEnableExtendedPrecision( clsparseControl control, cl_bool extended_precision ) +{ + if( control == NULL ) + { + return clsparseInvalidControlObject; + } + + control->extended_precision = extended_precision; + return clsparseSuccess; +} + clsparseStatus clsparseReleaseControl( clsparseControl control ) { diff --git a/src/library/internal/clsparse-control.hpp b/src/library/internal/clsparse-control.hpp index 4b44602..5a722c7 100644 --- a/src/library/internal/clsparse-control.hpp +++ b/src/library/internal/clsparse-control.hpp @@ -54,6 +54,9 @@ struct _clsparseControl size_t wavefront_size; size_t max_wg_size; + // Should we attempt to perform compensated summation? 
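Both csrmv paths now build their OpenCL programs from the same kind of -D option string, with -DEXTENDED_PRECISION appended when the control object requests it. A condensed, hypothetical helper showing the shape of that logic (the function name is invented for illustration; the integer types are elided):

    // Editorial sketch: how the -D compile options are assembled for the CSR kernels.
    // extendedPrecisionEnabled stands in for control->extended_precision.
    #include <string>
    #include <typeinfo>
    #include <CL/cl.h>

    template< typename T >
    std::string csrmvBuildOptions( bool extendedPrecisionEnabled )
    {
        std::string options;
        if( typeid( T ) == typeid( cl_double ) )
            options = " -DVALUE_TYPE=double -DDOUBLE";
        else if( typeid( T ) == typeid( cl_float ) )
            options = " -DVALUE_TYPE=float";
        else
            options = " -DVALUE_TYPE=uint";   // integer variants elided for brevity

        if( extendedPrecisionEnabled )
            options += " -DEXTENDED_PRECISION";
        return options;
    }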
+ cl_bool extended_precision; + // current device max compute units; cl_uint max_compute_units; diff --git a/src/library/internal/computeRowBlocks.hpp b/src/library/internal/computeRowBlocks.hpp index eeb8bf4..6271c1a 100644 --- a/src/library/internal/computeRowBlocks.hpp +++ b/src/library/internal/computeRowBlocks.hpp @@ -21,12 +21,56 @@ #include #include +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +static inline unsigned int flp2(unsigned int x) +{ + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); + return x - (x >> 1); +} + +// Short rows in CSR-Adaptive are batched together into a single row block. +// If there are a relatively small number of these, then we choose to do +// a horizontal reduction (groups of threads all reduce the same row). +// If there are many threads (e.g. more threads than the maximum size +// of our workgroup) then we choose to have each thread serially reduce +// the row. +// This function calculates the number of threads that could team up +// to reduce these groups of rows. For instance, if you have a +// workgroup size of 256 and 4 rows, you could have 64 threads +// working on each row. If you have 5 rows, only 32 threads could +// reliably work on each row because our reduction assumes power-of-2. +template< typename rowBlockType > +static inline rowBlockType numThreadsForReduction(rowBlockType num_rows) +{ +#if defined(__INTEL_COMPILER) + return 256 >> (_bit_scan_reverse(num_rows-1)+1); +#elif (defined(__clang__) && __has_builtin(__builtin_clz)) || \ + !defined(__clang) && \ + defined(__GNUG__) && ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 30202) + return (256 >> (8*sizeof(int)-__builtin_clz(num_rows-1))); +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) + int bit_returned; + _BitScanReverse(&bit_returned, (num_rows-1)); + return 256 >> (bit_returned+1); +#else + return flp2(256/num_rows); +#endif +} + // The row blocks buffer holds a packed set of information used to inform each // workgroup about how to do its work: // // |6666 5555 5555 5544 4444 4444 3333 3333|3322 2222|2222 1111 1111 1100 0000 0000| // |3210 9876 5432 1098 7654 3210 9876 5432|1098 7654|3210 9876 5432 1098 7654 3210| -// |------------Row Information------------|----flag^|---WG ID within a long row---| +// |------------Row Information------------|--------^|---WG ID within a long row---| +// | | flag/|or # reduce threads for short| // // The upper 32 bits of each rowBlock entry tell the workgroup the ID of the first // row it will be working on. When one workgroup calculates multiple rows, this @@ -34,6 +78,9 @@ // The lower 24 bits are used whenever multiple workgroups calculate a single long // row. This tells each workgroup its ID within that row, so it knows which // part of the row to operate on. +// Alternately, on "short" row blocks, the lower bits are used to communicate +// the number of threads that should be used for the reduction. Pre-calculating +// this on the CPU-side results in a noticable performance uplift on many matrices. // Bit 24 is a flag bit used so that the multiple WGs calculating a long row can // know when the first workgroup for that row has finished initializing the output // value. 
While this bit is the same as the first workgroup's flag bit, this @@ -41,12 +88,17 @@ // rowBlockType is currently instantiated as ulong template< typename rowBlockType > -void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* rowDelimiters, int nRows, int blkSize ) +void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* rowDelimiters, int nRows, int blkSize, int blkMultiplier, int rows_for_vector, bool allocate_row_blocks = true ) { - rowBlockType* rowBlocksBase = rowBlocks; + rowBlockType* rowBlocksBase; + int total_row_blocks = 1; // Start at one because of rowBlock[0] - *rowBlocks = 0; - rowBlocks++; + if (allocate_row_blocks) + { + rowBlocksBase = rowBlocks; + *rowBlocks = 0; + rowBlocks++; + } rowBlockType sum = 0; rowBlockType i, last_i = 0; @@ -57,64 +109,151 @@ void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* return; } + int consecutive_long_rows = 0; for( i = 1; i <= nRows; i++ ) { - sum += ( rowDelimiters[ i ] - rowDelimiters[ i - 1 ] ); + int row_length = ( rowDelimiters[ i ] - rowDelimiters[ i - 1 ] ); + sum += row_length; - // more than one row results in non-zero elements to be greater than blockSize - // This is csr-stream case; bottom WG_BITS == 0 - if( ( i - last_i > 1 ) && sum > blkSize ) + // The following section of code calculates whether you're moving between + // a series of "short" rows and a series of "long" rows. + // This is because the reduction in CSR-Adaptive likes things to be + // roughly the same length. Long rows can be reduced horizontally. + // Short rows can be reduced one-thread-per-row. Try not to mix them. + if ( row_length > 128 ) + consecutive_long_rows++; + else if ( consecutive_long_rows > 0 ) { - *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); - rowBlocks++; - i--; - last_i = i; - sum = 0; + // If it turns out we WERE in a long-row region, cut if off now. + if (row_length < 32) // Now we're in a short-row region + consecutive_long_rows = -1; + else + consecutive_long_rows++; } + // If you just entered into a "long" row from a series of short rows, + // then we need to make sure we cut off those short rows. Put them in + // their own workgroup. + if ( consecutive_long_rows == 1 ) + { + // Assuming there *was* a previous workgroup. If not, nothing to do here. + if( i - last_i > 1 ) + { + if (allocate_row_blocks) + { + *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); + // If this row fits into CSR-Stream, calculate how many rows + // can be used to do a parallel reduction. + // Fill in the low-order bits with the numThreadsForRed + if (((i-1) - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction((i - 1) - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i-1; + sum = row_length; + } + } + else if (consecutive_long_rows == -1) + { + // We see the first short row after some long ones that + // didn't previously fill up a row block. + if (allocate_row_blocks) + { + *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); + if (((i-1) - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction((i - 1) - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i-1; + sum = row_length; + consecutive_long_rows = 0; + } + + // Now, what's up with this row? What did it do? 
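To make the encoding described in the comment block above concrete, the sketch below packs and unpacks a rowBlocks entry using the documented layout (upper ROW_BITS = 32 bits for the first row, bit 24 as the flag, low WG_BITS = 24 bits for either the workgroup-within-row ID or the reduction-thread count) and includes a portable stand-in for numThreadsForReduction. Editorial illustration only; the helper names are not library code.

    // Editorial sketch: the rowBlocks entry layout and the reduction-thread count,
    // modelled on the constants in clSPARSE-private.hpp (ROW_BITS=32, WG_BITS=24).
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    static const unsigned ROW_BITS = 32;
    static const unsigned WG_BITS  = 24;

    // Portable equivalent of numThreadsForReduction(): largest power of two <= 256/num_rows.
    static unsigned numThreadsForReduction( unsigned num_rows )
    {
        unsigned x = 256 / num_rows;
        x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16;
        return x - (x >> 1);
    }

    static uint64_t packRowBlock( uint64_t first_row, uint64_t low_bits )
    {
        return ( first_row << (64 - ROW_BITS) ) | low_bits;
    }

    static uint64_t rowOf( uint64_t rb )     { return rb >> (64 - ROW_BITS); }
    static uint64_t lowBitsOf( uint64_t rb ) { return rb & ((1ULL << WG_BITS) - 1ULL); }
    static bool     flagOf( uint64_t rb )    { return (rb >> WG_BITS) & 1ULL; }

    int main( )
    {
        // 256 work items sharing 4 short rows -> 64 threads team up per row;
        // 5 rows -> 32 threads, because the reduction needs a power of two.
        assert( numThreadsForReduction( 4 ) == 64 );
        assert( numThreadsForReduction( 5 ) == 32 );

        // A CSR-Stream block starting at row 100 that batches 5 short rows:
        const uint64_t rb = packRowBlock( 100, numThreadsForReduction( 5 ) );
        std::printf( "row=%llu lowbits=%llu flag=%d\n",
                     (unsigned long long)rowOf( rb ),
                     (unsigned long long)lowBitsOf( rb ),
                     (int)flagOf( rb ) );
        return 0;
    }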
+ // exactly one row results in non-zero elements to be greater than blockSize // This is csr-vector case; bottom WG_BITS == workgroup ID - else if( ( i - last_i == 1 ) && sum > blkSize ) + if( ( i - last_i == 1 ) && sum > blkSize ) { - int numWGReq = static_cast< int >( ceil( (double)sum / blkSize ) ); + int numWGReq = static_cast< int >( ceil( (double)row_length / (blkMultiplier*blkSize) ) ); // Check to ensure #workgroups can fit in WG_BITS bits, if not // then the last workgroup will do all the remaining work numWGReq = ( numWGReq < (int)pow( 2, WG_BITS ) ) ? numWGReq : (int)pow( 2, WG_BITS ); - for( int w = 1; w < numWGReq; w++ ) + if (allocate_row_blocks) { - *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); - *rowBlocks |= static_cast< rowBlockType >( w ); + for( int w = 1; w < numWGReq; w++ ) + { + *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); + *rowBlocks |= static_cast< rowBlockType >( w ); + rowBlocks++; + } + *rowBlocks = ( i << (64 - ROW_BITS) ); rowBlocks++; } - - *rowBlocks = ( i << (64 -ROW_BITS) ); - rowBlocks++; - + total_row_blocks += numWGReq; last_i = i; sum = 0; + consecutive_long_rows = 0; } - // sum of non-zero elements is exactly equal to blockSize - // This is csr-stream case; bottom WG_BITS == 0 + // more than one row results in non-zero elements to be greater than blockSize + // This is csr-stream case; bottom WG_BITS = number of parallel reduction threads + else if( ( i - last_i > 1 ) && sum > blkSize ) + { + i--; // This row won't fit, so back off one. + if (allocate_row_blocks) + { + *rowBlocks = ( i << (64 - ROW_BITS) ); + if ((i - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i; + sum = 0; + consecutive_long_rows = 0; + } + // This is csr-stream case; bottom WG_BITS = number of parallel reduction threads else if( sum == blkSize ) { - *rowBlocks = ( i << (64 - ROW_BITS) ); - rowBlocks++; + if (allocate_row_blocks) + { + *rowBlocks = ( i << (64 - ROW_BITS) ); + if ((i - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; last_i = i; sum = 0; + consecutive_long_rows = 0; } - } - *rowBlocks = ( static_cast< rowBlockType >( nRows ) << (64 - ROW_BITS) ); - rowBlocks++; - - size_t dist = std::distance( rowBlocksBase, rowBlocks ); - assert( dist <= rowBlockSize ); + // If we didn't fill a row block with the last row, make sure we don't lose it. + if ( allocate_row_blocks && (*(rowBlocks-1) >> (64 - ROW_BITS)) != static_cast< rowBlockType>(nRows) ) + { + *rowBlocks = ( static_cast< rowBlockType >( nRows ) << (64 - ROW_BITS) ); + if ((nRows - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; - // Update the size of rowBlocks to reflect the actual amount of memory used - rowBlockSize = dist; + if (allocate_row_blocks) + { + size_t dist = std::distance( rowBlocksBase, rowBlocks ); + assert( (2 * dist) <= rowBlockSize ); + // Update the size of rowBlocks to reflect the actual amount of memory used + // We're multiplying the size by two because the extended precision form of + // CSR-Adaptive requires more space for the final global reduction. 
+ rowBlockSize = 2 * dist; + } + else + rowBlockSize = 2 * total_row_blocks; } #endif diff --git a/src/library/io/mm-reader.cpp b/src/library/io/mm-reader.cpp index ae78dae..ea4cbeb 100644 --- a/src/library/io/mm-reader.cpp +++ b/src/library/io/mm-reader.cpp @@ -596,9 +596,10 @@ clsparseSCsrMatrixfromFile(clsparseCsrMatrix* csrMatx, const char* filePath, cls if( pCsrMatx->rowBlockSize ) { clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); + ComputeRowBlocks( (cl_ulong*)NULL, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, false ); cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); - ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE ); + ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, true ); } return clsparseSuccess; @@ -666,9 +667,10 @@ clsparseDCsrMatrixfromFile(clsparseCsrMatrix* csrMatx, const char* filePath, cls if( pCsrMatx->rowBlockSize ) { clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); + ComputeRowBlocks( (cl_ulong*)NULL, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, false ); cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); - ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE ); + ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, true ); } return clsparseSuccess; @@ -734,9 +736,10 @@ clsparseDCsrMatrixfromFile(clsparseCsrMatrix* csrMatx, const char* filePath, cls // if( pCsrMatx->rowBlockSize ) // { // clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); +// ComputeRowBlocks( (cl_ulong*)NULL, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, false ); // cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); -// ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE ); +// ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, true ); // } // return clsparseSuccess; diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index b5c6935..2a1f04e 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -15,43 +15,117 @@ R"( * limitations under the License. * ************************************************************************ */ -#define WGSIZE 256 - -#ifdef DOUBLE - #define FPTYPE double - +// No reason to include these beyond version 1.2, where double is not an extension. +#if defined(DOUBLE) && __OPENCL_VERSION__ < CL_VERSION_1_2 #ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable + #pragma OPENCL EXTENSION cl_khr_fp64 : enable #elif defined(cl_amd_fp64) - #pragma OPENCL EXTENSION cl_amd_fp64 : enable + #pragma OPENCL EXTENSION cl_amd_fp64 : enable #else - #error "Double precision floating point not supported by OpenCL implementation." 
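The mm-reader changes above use ComputeRowBlocks in two passes: a sizing pass with allocate_row_blocks = false (and a NULL pointer), then a fill pass into the mapped buffer; the size that comes back is already doubled to leave room for the extended-precision error slots. A hedged host-side sketch of that protocol against a plain std::vector, assuming the library's internal headers (computeRowBlocks.hpp and the constants it depends on) are on the include path:

    // Editorial sketch: sizing pass followed by a fill pass, as mm-reader.cpp now does.
    // The numeric arguments mirror clSPARSE-private.hpp
    // (BLKSIZE=1024, BLOCK_MULTIPLIER=3, ROWS_FOR_VECTOR=1).
    #include <vector>
    #include <cstddef>
    #include "computeRowBlocks.hpp"

    void buildRowBlocks( const int* rowOffsets, int nRows,
                         std::vector<unsigned long long>& rowBlocks )
    {
        size_t rowBlockSize = 0;

        // Pass 1: only count. The returned size already includes the second half
        // of the buffer reserved for the extended-precision global reduction.
        ComputeRowBlocks( (unsigned long long*)nullptr, rowBlockSize, rowOffsets,
                          nRows, 1024, 3, 1, false );

        rowBlocks.assign( rowBlockSize, 0 );

        // Pass 2: actually write the packed row-block entries.
        ComputeRowBlocks( rowBlocks.data( ), rowBlockSize, rowOffsets,
                          nRows, 1024, 3, 1, true );
    }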
+ #error "Double precision floating point not supported by OpenCL implementation." #endif +#endif + +#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) + #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable + #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable + #define ATOM64 +#endif + +#if __OPENCL_VERSION__ > CL_VERSION_1_0 + #define ATOM32 +#elif defined(cl_khr_global_int32_base_atomics) && defined(cl_khr_global_int32_extended_atomics) + #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : require + #pragma OPENCL_EXTENSION cl_khr_global_int32_extended_atomics : require + #define ATOM32 #else - #define FPTYPE float + #error "Required integer atomics not supported by this OpenCL implemenation." #endif -#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable -#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable +#ifndef INDEX_TYPE +#error "INDEX_TYPE undefined!" +#endif -int lowerPowerOf2(int num) -{ - num --; +#ifndef VALUE_TYPE +#error "VALUE_TYPE undefined!" +#endif + +#ifndef ROWBITS +#error "ROWBITS undefined!" +#endif + +#ifndef WGBITS +#error "WGBITS undefined!" +#endif - num |= num >> 1; - num |= num >> 2; - num |= num >> 4; - num |= num >> 8; - num |= num >> 16; +#ifndef WG_SIZE +#error "WG_SIZE undefined!" +#endif + +#ifndef BLOCKSIZE +#error "BLOCKSIZE undefined!" +#endif - num ++; +#ifndef BLOCK_MULTIPLIER +#error "BLOCK_MULTIPLIER undefined!" +#endif - num >>= 1; +#ifndef ROWS_FOR_VECTOR +#error "ROWS_FOR_VECTOR undefined!" +#endif + +// Internal functions to wrap atomics, depending on if we support 64-bit +// atomics or not. Helps keep the code clean in the other parts of the code. +// All of the 32-bit atomics are built assuming we're on a little endian architecture. +inline unsigned long clsparse_atomic_xor(__global unsigned long * restrict const ptr, + const unsigned long xor_val) +{ +#ifdef ATOM64 + return atom_xor(ptr, xor_val); +#else + return atomic_max((__global unsigned int*)ptr, (unsigned int)xor_val); +#endif +} - return num; +inline unsigned long clsparse_atomic_max(__global unsigned long * restrict const ptr, + const unsigned long compare) +{ +#ifdef ATOM64 + return atom_max(ptr, compare); +#else + return atomic_max((__global unsigned int*)ptr, (unsigned int)compare); +#endif } -void atomic_add_float (global FPTYPE *ptr, FPTYPE temp) +inline unsigned long clsparse_atomic_inc(__global unsigned long * restrict const inc_this) +{ +#ifdef ATOM64 + return atom_inc(inc_this); +#else + return atomic_inc((__global unsigned int *)inc_this); +#endif +} + +inline unsigned long clsparse_atomic_cmpxchg(__global unsigned long * restrict const ptr, + const unsigned long compare, + const unsigned long val) +{ +#ifdef DOUBLE + #ifdef ATOM64 + return atom_cmpxchg(ptr, compare, val); + #else + // Should never run this. Don't use a path that requires cmpxchg for doubles + // if you don't support 64-bit atomics. 
+ return compare; + #endif +#else + return atomic_cmpxchg((__global unsigned int*)ptr, compare, val); +#endif +} + +VALUE_TYPE atomic_add_float_extended( global VALUE_TYPE * restrict const ptr, + const VALUE_TYPE temp, + VALUE_TYPE * restrict const old_sum ) { #ifdef DOUBLE unsigned long newVal; @@ -60,8 +134,10 @@ void atomic_add_float (global FPTYPE *ptr, FPTYPE temp) { prevVal = as_ulong(*ptr); newVal = as_ulong(temp + *ptr); - } while (atom_cmpxchg((global unsigned long *)ptr, prevVal, newVal) != prevVal); - + } while (clsparse_atomic_cmpxchg((__global unsigned long *)ptr, prevVal, newVal) != prevVal); + if (old_sum != 0) + *old_sum = as_double(prevVal); + return as_double(newVal); #else unsigned int newVal; unsigned int prevVal; @@ -69,32 +145,191 @@ void atomic_add_float (global FPTYPE *ptr, FPTYPE temp) { prevVal = as_uint(*ptr); newVal = as_uint(temp + *ptr); - } while (atomic_cmpxchg((global unsigned int *)ptr, prevVal, newVal) != prevVal); + } while (clsparse_atomic_cmpxchg((__global unsigned long *)ptr, prevVal, newVal) != prevVal); + if (old_sum != 0) + *old_sum = as_float(prevVal); + return as_float(newVal); #endif } +void atomic_add_float( global void * const ptr, const VALUE_TYPE temp ) +{ + atomic_add_float_extended(ptr, temp, 0); +} + +// Knuth's Two-Sum algorithm, which allows us to add together two floating +// point numbers and exactly tranform the answer into a sum and a +// rounding error. +// Inputs: x and y, the two inputs to be aded together. +// In/Out: *sumk_err, which is incremented (by reference) -- holds the +// error value as a result of the 2sum calculation. +// Returns: The non-corrected sum of inputs x and y. +VALUE_TYPE two_sum( VALUE_TYPE x, + VALUE_TYPE y, + VALUE_TYPE * restrict const sumk_err ) +{ + const VALUE_TYPE sumk_s = x + y; +#ifdef EXTENDED_PRECISION + // We use this 2Sum algorithm to perform a compensated summation, + // which can reduce the cummulative rounding errors in our SpMV summation. + // Our compensated sumation is based on the SumK algorithm (with K==2) from + // Ogita, Rump, and Oishi, "Accurate Sum and Dot Product" in + // SIAM J. on Scientific Computing 26(6) pp 1955-1988, Jun. 2005. + + // 2Sum can be done in 6 FLOPs without a branch. However, calculating + // double precision is slower than single precision on every existing GPU. + // As such, replacing 2Sum with Fast2Sum when using DPFP results in better + // performance (~5% higher total). This is true even though we must ensure + // that |a| > |b|. Branch divergence is better than the DPFP slowdown. + // Thus, for DPFP, our compensated summation algorithm is actually described + // by both Pichat and Neumaier in "Correction d'une somme en arithmetique + // a virgule flottante" (J. Numerische Mathematik 19(5) pp. 400-406, 1972) + // and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher + // Summen (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51, + // 1974), respectively. + if (fabs(x) < fabs(y)) + { + const VALUE_TYPE swap = x; + x = y; + y = swap; + } + (*sumk_err) += (y - (sumk_s - x)); + // Original 6 FLOP 2Sum algorithm. + //const VALUE_TYPE bp = sumk_s - x; + //(*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); +#endif + return sumk_s; +} + +// Performs (x_vals * x_vec) + y using an FMA. +// Ideally, we would perform an error-free transformation here and return the +// appropriate error. However, the EFT of an FMA is very expensive. 
As such, +// if we are in EXTENDED_PRECISION mode, this function devolves into two_sum +// with x_vals and x_vec inputs multiplied separately from the compensated add. +VALUE_TYPE two_fma( const VALUE_TYPE x_vals, + const VALUE_TYPE x_vec, + VALUE_TYPE y, + VALUE_TYPE * restrict const sumk_err ) +{ +#ifdef EXTENDED_PRECISION + VALUE_TYPE x = x_vals * x_vec; + const VALUE_TYPE sumk_s = x + y; + if (fabs(x) < fabs(y)) + { + const VALUE_TYPE swap = x; + x = y; + y = swap; + } + (*sumk_err) += (y - (sumk_s - x)); + // 2Sum in the FMA case. Poor performance on low-DPFP GPUs. + //const VALUE_TYPE bp = fma(-x_vals, x_vec, sumk_s); + //(*sumk_err) += (fma(x_vals, x_vec, -(sumk_s - bp)) + (y - bp)); + return sumk_s; +#else + return fma(x_vals, x_vec, y); +#endif +} + +VALUE_TYPE atomic_two_sum_float( global VALUE_TYPE * restrict const x_ptr, + VALUE_TYPE y, + VALUE_TYPE * restrict const sumk_err ) +{ + // Have to wait until the return from the atomic op to know what X was. + VALUE_TYPE sumk_s = 0.; +#ifdef EXTENDED_PRECISION + VALUE_TYPE x; + sumk_s = atomic_add_float_extended(x_ptr, y, &x); + if (fabs(x) < fabs(y)) + { + const VALUE_TYPE swap = x; + x = y; + y = swap; + } + (*sumk_err) += (y - (sumk_s - x)); +#else + atomic_add_float(x_ptr, y); +#endif + return sumk_s; +} + +// A method of doing the final reduction in CSR-Vector without having to copy +// and paste it a bunch of times. +// The EXTENDED_PRECISION section is done as part of a compensated summation +// meant to reduce cummulative rounding errors. This can become a problem on +// GPUs because the reduction order is different than what would be used on a +// CPU. It is based on the PSumK algorithm (with K==2) from Yamanaka, Ogita, +// Rump, and Oishi, "A Parallel Algorithm of Accurate Dot Product," in +// J. Parallel Computing, 34(6-8), pp. 392-410, Jul. 2008. +// A version of this method is also used in csrmv_general. +// Inputs: cur_sum: the input from which our sum starts +// err: the current running cascade error for this final summation +// partial: the local memory which holds the values to sum +// (we eventually use it to pass down temp. err vals as well) +// lid: local ID of the work item calling this function. +// thread_lane: The lane within this thread's row. +// max_size: This parallel summation method operates in multiple rounds +// to do a parallel reduction. This is the length of each row. +// reduc_size: As you reduce data down, this tells you how far away +// you will grab a value and add to your own local sum value. +VALUE_TYPE sum2_reduce( VALUE_TYPE cur_sum, + VALUE_TYPE * restrict const err, + __local VALUE_TYPE * restrict const partial, + const unsigned int lid, + const unsigned int thread_lane, + const unsigned int max_size, + const unsigned int reduc_size ) +{ + if ( max_size > reduc_size ) + { +#ifdef EXTENDED_PRECISION + const unsigned int partial_dest = lid + reduc_size; + if (thread_lane < reduc_size) + cur_sum = two_sum(cur_sum, partial[partial_dest], err); + // We reuse the LDS entries to move the error values down into lower + // threads. This saves LDS space, allowing higher occupancy, but requires + // more barriers, which can reduce performance. + barrier(CLK_LOCAL_MEM_FENCE); + // Have all of those upper threads pass their temporary errors + // into a location that the lower threads can read. + partial[lid] = (thread_lane >= reduc_size) ? *err : partial[lid]; + barrier(CLK_LOCAL_MEM_FENCE); + if (thread_lane < reduc_size) // Add those errors in. 
+ { + *err += partial[partial_dest]; + partial[lid] = cur_sum; + } +#else + cur_sum += partial[lid + reduc_size]; + barrier( CLK_LOCAL_MEM_FENCE ); + partial[lid] = cur_sum; +#endif + } + return cur_sum; +} + __kernel void -csrmv_adaptive(__global const FPTYPE * restrict vals, - __global const int * restrict cols, - __global const int * restrict rowPtrs, - __global const FPTYPE * restrict vec, - __global FPTYPE * restrict out, - __global unsigned long * restrict rowBlocks, - __global FPTYPE * pAlpha, - __global FPTYPE * pBeta) +csrmv_adaptive(__global const VALUE_TYPE * restrict const vals, + __global const unsigned int * restrict const cols, + __global const unsigned int * restrict const rowPtrs, + __global const VALUE_TYPE * restrict const vec, + __global VALUE_TYPE * restrict const out, + __global unsigned long * restrict const rowBlocks, + __global const VALUE_TYPE * restrict const pAlpha, + __global const VALUE_TYPE * restrict const pBeta) { - __local FPTYPE partialSums[BLOCKSIZE]; - size_t gid = get_group_id(0); - size_t lid = get_local_id(0); - const FPTYPE alpha = *pAlpha; - const FPTYPE beta = *pBeta; + __local VALUE_TYPE partialSums[BLOCKSIZE]; + const unsigned int gid = get_group_id(0); + const unsigned int lid = get_local_id(0); + const VALUE_TYPE alpha = *pAlpha; + const VALUE_TYPE beta = *pBeta; // The row blocks buffer holds a packed set of information used to inform each // workgroup about how to do its work: // // |6666 5555 5555 5544 4444 4444 3333 3333|3322 2222|2222 1111 1111 1100 0000 0000| // |3210 9876 5432 1098 7654 3210 9876 5432|1098 7654|3210 9876 5432 1098 7654 3210| - // |------------Row Information------------|----flag^|---WG ID within a long row---| + // |------------Row Information------------|--------^|---WG ID within a long row---| + // | | flag/|or # reduce threads for short| // // The upper 32 bits of each rowBlock entry tell the workgroup the ID of the first // row it will be working on. When one workgroup calculates multiple rows, this @@ -102,19 +337,47 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // The lower 24 bits are used whenever multiple workgroups calculate a single long // row. This tells each workgroup its ID within that row, so it knows which // part of the row to operate on. + // Alternately, on "short" row blocks, the lower bits are used to communicate + // the number of threads that should be used for the reduction. Pre-calculating + // this on the CPU-side results in a noticable performance uplift on many matrices. // Bit 24 is a flag bit used so that the multiple WGs calculating a long row can // know when the first workgroup for that row has finished initializing the output // value. While this bit is the same as the first workgroup's flag bit, this // workgroup will spin-loop. unsigned int row = ((rowBlocks[gid] >> (64-ROWBITS)) & ((1UL << ROWBITS) - 1UL)); unsigned int stop_row = ((rowBlocks[gid + 1] >> (64-ROWBITS)) & ((1UL << ROWBITS) - 1UL)); + unsigned int num_rows = stop_row - row; + + // Get the "workgroup within this long row" ID out of the bottom bits of the row block. + unsigned int wg = rowBlocks[gid] & ((1 << WGBITS) - 1); + + // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. + // If there are more items in this row, we assign more workgroups. + unsigned int vecStart = mad24(wg, (unsigned int)(BLOCK_MULTIPLIER*BLOCKSIZE), rowPtrs[row]); + unsigned int vecEnd = (rowPtrs[row + 1] > vecStart + BLOCK_MULTIPLIER*BLOCKSIZE) ? 
vecStart + BLOCK_MULTIPLIER*BLOCKSIZE : rowPtrs[row + 1]; + +#if (defined(DOUBLE) || defined(LONG)) && !defined(ATOM64) + // In here because we don't support 64-bit atomics while working on 64-bit data. + // As such, we can't use CSR-LongRows. Time to do a fixup -- first WG does the + // entire row with CSR-Vector. Other rows immediately exit. + if (num_rows == 0 || (num_rows == 1 && wg)) // CSR-LongRows case + { + num_rows = ROWS_FOR_VECTOR; + stop_row = wg ? row : (row + 1); + wg = 0; + } +#endif + + VALUE_TYPE temp_sum = 0.; + VALUE_TYPE sumk_e = 0.; + VALUE_TYPE new_error = 0.; // If the next row block starts more than 2 rows away, then we choose CSR-Stream. // If this is zero (long rows) or one (final workgroup in a long row, or a single - // row in a row block), we want to use the CSR-Vector algorithm. + // row in a row block), we want to use the CSR-Vector algorithm(s). // We have found, through experimentation, that CSR-Vector is generally faster // when working on 2 rows, due to its simplicity and better reduction method. - if (stop_row - row > 2) + if (num_rows > ROWS_FOR_VECTOR) { // CSR-Stream case. See Sections III.A and III.B in the SC'14 paper: // "Efficient Sparse Matrix-Vector Multiplication on GPUs using the CSR Storage Format" @@ -131,24 +394,22 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // long) rows, it's actually better to perform a tree-style reduction where // multiple threads team up to reduce the same row. - // The calculations below tell you how many threads this workgroup can allocate + // The calculation below tells you how many threads this workgroup can allocate // to each row, assuming that every row gets the same number of threads. - // We want the closest lower-power-of-2 to this number -- that is how many - // threads can work in each row's reduction using our algorithm. - int possibleThreadsRed = get_local_size(0)/(stop_row - row); - int numThreadsForRed = lowerPowerOf2(possibleThreadsRed); - - unsigned int local_row = row + lid; + // We want the closest lower (or equal) power-of-2 to this number -- + // that is how many threads can work in each row's reduction using our algorithm. + // For instance, with workgroup size 256, 2 rows = 128 threads, 3 rows = 64 + // threads, 4 rows = 64 threads, 5 rows = 32 threads, etc. + //int numThreadsForRed = get_local_size(0) >> ((CHAR_BIT*sizeof(unsigned int))-clz(num_rows-1)); + const unsigned int numThreadsForRed = wg; // Same calculation as above, done on host. // Stream all of this row block's matrix values into local memory. // Perform the matvec in parallel with this work. - int col = rowPtrs[row] + lid; + const unsigned int col = rowPtrs[row] + lid; if (gid != (get_num_groups(0) - 1)) { - for(int i = 0; i < BLOCKSIZE; i += 256) - { - partialSums[lid + i] = vals[col + i] * vec[cols[col + i]]; - } + for(int i = 0; i < BLOCKSIZE; i += WG_SIZE) + partialSums[lid + i] = alpha * vals[col + i] * vec[cols[col + i]]; } else { @@ -159,11 +420,9 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // However, this may change in the future (e.g. with shared virtual memory.) // This causes a minor performance loss because this is the last workgroup // to be launched, and this loop can't be unrolled. 
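The two_sum/two_fma helpers defined earlier are the whole of the compensated-summation machinery. As a plain-C++ illustration of what the carried error buys, the sketch below accumulates many values that are individually too small to register in a float sum: the naive total never moves, while the compensated total recovers them. It mirrors the branch-based 2Sum used under EXTENDED_PRECISION, but it is host code, not kernel code.

    // Editorial sketch: branch-based 2Sum (as used under EXTENDED_PRECISION) in plain C++.
    #include <cmath>
    #include <cstdio>

    // Returns the rounded sum of x and y and accumulates the rounding error in *err.
    static float two_sum( float x, float y, float* err )
    {
        const float s = x + y;
        if ( std::fabs( x ) < std::fabs( y ) ) { const float t = x; x = y; y = t; }
        *err += ( y - ( s - x ) );          // Fast2Sum error term, valid since |x| >= |y|
        return s;
    }

    int main( )
    {
        // 1.0f followed by many terms too small to register individually in float.
        float naive = 1.0f, comp = 1.0f, err = 0.0f;
        for ( int i = 0; i < 10000; ++i )
        {
            naive += 1e-8f;
            comp   = two_sum( comp, 1e-8f, &err );
        }
        std::printf( "naive       = %.7f\n", naive );        // stays at 1.0000000
        std::printf( "compensated = %.7f\n", comp + err );   // ~1.0001
        return 0;
    }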
- int max_to_load = rowPtrs[stop_row] - rowPtrs[row]; - for(int i = 0; i < max_to_load; i += 256) - { - partialSums[lid + i] = vals[col + i] * vec[cols[col + i]]; - } + const unsigned int max_to_load = rowPtrs[stop_row] - rowPtrs[row]; + for(int i = 0; i < max_to_load; i += WG_SIZE) + partialSums[lid + i] = alpha * vals[col + i] * vec[cols[col + i]]; } barrier(CLK_LOCAL_MEM_FENCE); @@ -172,58 +431,56 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // In this case, we want to have the workgroup perform a tree-style reduction // of each row. {numThreadsForRed} adjacent threads team up to linearly reduce // a row into {numThreadsForRed} locations in local memory. - // After that, a single thread from each of those teams linearly walks through - // the local memory values for that row and reduces to the final output value. - FPTYPE temp = 0.; - - // {numThreadsForRed} adjacent threads all work on the same row, so their - // start and end values are the same. - size_t st = lid/numThreadsForRed; - int local_first_val = (rowPtrs[row + st] - rowPtrs[row]); - int local_last_val = rowPtrs[row + st + 1] - rowPtrs[row]; - int workForEachThread = (local_last_val - local_first_val)/numThreadsForRed; - size_t threadInBlock = lid & (numThreadsForRed - 1); - - // Not all row blocks are full -- they may have an odd number of rows. As such, - // we need to ensure that adjacent-groups only work on real data for this rowBlock. - if(st < (stop_row - row)) - { - // only works when numThreadsForRed is a power of 2 - for(int i = 0; i < workForEachThread; i++) - { - temp += partialSums[local_first_val + i*numThreadsForRed + threadInBlock]; - } - - // The last few values (the remainder of this row) also need to be aded in. - int local_cur_val = local_first_val + numThreadsForRed*workForEachThread; - if(threadInBlock < local_last_val - local_cur_val) - { - temp += partialSums[local_cur_val + threadInBlock]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - partialSums[lid] = temp; - - // Step one of this two-stage reduction is done. Now each row has {numThreadsForRed} - // values sitting in the local memory. This next step takes the first thread from - // each of the adjacent-groups and uses it to walk through those values and reduce - // them into a final output value for the row. - temp = 0.; - if(lid < (stop_row - row)) - { -#pragma unroll 4 - for(int i = 0; i < numThreadsForRed; i++) - { - temp += partialSums[lid*numThreadsForRed + i]; - } - // All of our write-outs check to see if the output vector should first be zeroed. - // If so, just do a write rather than a read-write. Measured to be a slight (~5%) - // performance improvement. - if (beta != 0.) - out[local_row] = (beta * out[local_row]) + (alpha * temp); - else - out[local_row] = (alpha * temp); - } + // After that, the entire workgroup does a parallel reduction, and each + // row ends up with an individual answer. + + // {numThreadsForRed} adjacent threads all work on the same row, so their + // start and end values are the same. + // numThreadsForRed guaranteed to be a power of two, so the clz code below + // avoids an integer divide. ~2% perf gain in EXTRA_PRECISION. 
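The remark above about avoiding an integer divide rests on numThreadsForRed being a power of two: dividing by 2^k is a right shift by k, and 31 - clz(2^k) recovers k. A tiny host-side check of that identity, using GCC/Clang's __builtin_clz as a stand-in for OpenCL's clz (editorial only):

    // Editorial sketch: lid / n == lid >> (31 - clz(n)) whenever n is a power of two.
    #include <cassert>

    int main( )
    {
        for ( unsigned n = 1; n <= 256; n <<= 1 )            // power-of-two team sizes
            for ( unsigned lid = 0; lid < 256; ++lid )
                assert( lid / n == ( lid >> ( 31 - __builtin_clz( n ) ) ) );
        return 0;
    }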
+ //size_t st = lid/numThreadsForRed; + const unsigned int local_row = row + (lid >> (31 - clz(numThreadsForRed))); + const unsigned int local_first_val = rowPtrs[local_row] - rowPtrs[row]; + const unsigned int local_last_val = rowPtrs[local_row + 1] - rowPtrs[row]; + const unsigned int threadInBlock = lid & (numThreadsForRed - 1); + + // Not all row blocks are full -- they may have an odd number of rows. As such, + // we need to ensure that adjacent-groups only work on real data for this rowBlock. + if(local_row < stop_row) + { + // This is dangerous -- will infinite loop if your last value is within + // numThreadsForRed of MAX_UINT. Noticable performance gain to avoid a + // long induction variable here, though. + for(unsigned int local_cur_val = local_first_val + threadInBlock; + local_cur_val < local_last_val; + local_cur_val += numThreadsForRed) + temp_sum = two_sum(partialSums[local_cur_val], temp_sum, &sumk_e); + } + barrier(CLK_LOCAL_MEM_FENCE); + + temp_sum = two_sum(temp_sum, sumk_e, &new_error); + partialSums[lid] = temp_sum; + + // Step one of this two-stage reduction is done. Now each row has {numThreadsForRed} + // values sitting in the local memory. This means that, roughly, the beginning of + // LDS is full up to {workgroup size} entries. + // Now we perform a parallel reduction that sums together the answers for each + // row in parallel, leaving us an answer in 'temp_sum' for each row. + for (int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + barrier( CLK_LOCAL_MEM_FENCE ); + temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, i); + } + + if (threadInBlock == 0 && local_row < stop_row) + { + // All of our write-outs check to see if the output vector should first be zeroed. + // If so, just do a write rather than a read-write. Measured to be a slight (~5%) + // performance improvement. + if (beta != 0.) + temp_sum = two_fma(beta, out[local_row], temp_sum, &new_error); + out[local_row] = temp_sum + new_error; + } } else { @@ -232,30 +489,77 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // However, this reduction is also much faster than CSR-Scalar, because local memory // is designed for scatter-gather operations. // We need a while loop because there may be more rows than threads in the WG. - while(local_row < stop_row) - { - int local_first_val = (rowPtrs[local_row] - rowPtrs[row]); - int local_last_val = rowPtrs[local_row + 1] - rowPtrs[row]; - FPTYPE temp = 0; - for (int local_cur_val = local_first_val; local_cur_val < local_last_val; local_cur_val++) - { - temp += partialSums[local_cur_val]; - } - - // After you've done the reduction into the temp register, - // put that into the output for each row. - if (beta != 0.) - out[local_row] = (beta * out[local_row]) + (alpha * temp); - else - out[local_row] = (alpha * temp); - local_row += get_local_size(0); - } + unsigned int local_row = row + lid; + while(local_row < stop_row) + { + int local_first_val = (rowPtrs[local_row] - rowPtrs[row]); + int local_last_val = rowPtrs[local_row + 1] - rowPtrs[row]; + temp_sum = 0.; + sumk_e = 0.; + for (int local_cur_val = local_first_val; local_cur_val < local_last_val; local_cur_val++) + temp_sum = two_sum(partialSums[local_cur_val], temp_sum, &sumk_e); + + // After you've done the reduction into the temp_sum register, + // put that into the output for each row. + if (beta != 0.) 
+ temp_sum = two_fma(beta, out[local_row], temp_sum, &sumk_e); + out[local_row] = temp_sum + sumk_e; + local_row += WG_SIZE; + } } } + else if (num_rows >= 1 && !wg) // CSR-Vector case. + { + // ^^ The above check says that if this workgroup is supposed to work on <= ROWS_VECTOR + // number of rows then we should do the CSR-Vector algorithm. If we want this row to be + // done with CSR-LongRows, then all of its workgroups (except the last one) will have the + // same stop_row and row. The final workgroup in a LongRow will have stop_row and row + // different, but the internal "wg" number will be non-zero. + + // If this workgroup is operating on multiple rows (because CSR-Stream is poor for small + // numbers of rows), then it needs to iterate until it reaches the stop_row. + // We don't check <= stop_row because of the potential for unsigned overflow. + while (row < stop_row) + { + // Any workgroup only calculates, at most, BLOCKSIZE items in this row. + // If there are more items in this row, we use CSR-LongRows. + temp_sum = 0.; + sumk_e = 0.; + new_error = 0.; + vecStart = rowPtrs[row]; + vecEnd = rowPtrs[row+1]; + + // Load in a bunch of partial results into your register space, rather than LDS (no contention) + // Then dump the partially reduced answers into the LDS for inter-work-item reduction. + // Using a long induction variable to make sure unsigned int overflow doesn't break things. + for (long j = vecStart + lid; j < vecEnd; j+=WG_SIZE) + { + const unsigned int col = cols[(unsigned int)j]; + temp_sum = two_fma(alpha*vals[(unsigned int)j], vec[col], temp_sum, &sumk_e); + } + + temp_sum = two_sum(temp_sum, sumk_e, &new_error); + partialSums[lid] = temp_sum; + + // Reduce partial sums + for (int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + barrier( CLK_LOCAL_MEM_FENCE); + temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, lid, WG_SIZE, i); + } + + if (lid == 0UL) + { + if (beta != 0.) + temp_sum = two_fma(beta, out[row], temp_sum, &new_error); + out[row] = temp_sum + new_error; + } + row++; + } + } else { - // In CSR-Vector, we may have more than one workgroup calculating this row - // or one workgroup calculating two rows. + // In CSR-LongRows, we have more than one workgroup calculating this row. // The output values for those types of rows are stored using atomic_add, because // more than one parallel workgroup's value makes up the final answer. // Unfortunately, this makes it difficult to do y=Ax, rather than y=Ax+y, because @@ -268,93 +572,113 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // First, figure out which workgroup you are in the row. Bottom 24 bits. // You can use that to find the global ID for the first workgroup calculating // this long row. - size_t first_wg_in_row = gid - (rowBlocks[gid] & ((1UL << WGBITS) - 1UL)); - unsigned int compare_value = rowBlocks[gid] & (1UL << WGBITS); + const unsigned int first_wg_in_row = gid - (rowBlocks[gid] & ((1UL << WGBITS) - 1UL)); + const unsigned int compare_value = rowBlocks[gid] & (1UL << WGBITS); // Bit 24 in the first workgroup is the flag that everyone waits on. - if(gid == first_wg_in_row && lid == 0) - { - // The first workgroup handles the output initialization. - if (beta != 0.) - out[row] *= beta; - else - out[row] = 0.; - // We currently have, at most, two rows in a CSR-Vector calculation. - // If we have two, we need to initialize the second output as well. - if(stop_row - row == 2) - { - if (beta != 0.) 
- out[row+1] *= beta; - else - out[row+1] = 0.; - } - atom_xor(&rowBlocks[first_wg_in_row], (1UL << WGBITS)); // Release other workgroups. + if(gid == first_wg_in_row && lid == 0UL) + { + // The first workgroup handles the output initialization. + volatile VALUE_TYPE out_val = out[row]; + temp_sum = (beta - 1.) * out_val; +#ifdef EXTENDED_PRECISION + rowBlocks[get_num_groups(0) + gid + 1] = 0UL; +#endif + clsparse_atomic_xor(&rowBlocks[first_wg_in_row], (1UL << WGBITS)); // Release other workgroups. } // For every other workgroup, bit 24 holds the value they wait on. // If your bit 24 == first_wg's bit 24, you spin loop. // The first workgroup will eventually flip this bit, and you can move forward. barrier(CLK_GLOBAL_MEM_FENCE); - while(gid != first_wg_in_row && - lid==0 && - ((atom_max(&rowBlocks[first_wg_in_row], 0UL) & (1UL << WGBITS)) == compare_value)); + while(gid != first_wg_in_row && + lid == 0U && + ((clsparse_atomic_max(&rowBlocks[first_wg_in_row], 0UL) & (1UL << WGBITS)) == compare_value)); barrier(CLK_GLOBAL_MEM_FENCE); // After you've passed the barrier, update your local flag to make sure that // the next time through, you know what to wait on. - if (gid != first_wg_in_row && lid == 0) + if (gid != first_wg_in_row && lid == 0UL) rowBlocks[gid] ^= (1UL << WGBITS); - unsigned int myRow = row; // Adding another variable for row as it is getting updated below - char iteration = 0; - - // CSR-Vector will always do at least one iteration. // All but the final workgroup in a long-row collaboration have the same start_row // and stop_row. They only run for one iteration. - // If this workgroup is operating on multiple rows (because CSR-Stream is poor for small - // #s of rows), then it is (currently) not part of a collaboration on a long - // row. As such, it needs to iterate until it reaches the stop_row. - // We don't check <= stop_row because of the potential for unsigned overflow. - while (iteration == 0 || myRow < stop_row) - { - // Get the "workgroup within this long row" ID out of the bottom bits of the row block. - // If this is the only workgroup working on this row, this will be zero, so still correct. - unsigned int wg = rowBlocks[gid] & ((1 << WGBITS) - 1); - - // Any workgroup only calculates, at most, BLOCKSIZE items in this row. - // If there are more items in this row, we assign more workgroups. - unsigned int vecStart = rowPtrs[myRow] + (wg * BLOCKSIZE); - unsigned int vecEnd = (rowPtrs[myRow + 1] > vecStart + BLOCKSIZE) ? vecStart + BLOCKSIZE : rowPtrs[myRow+1]; - - // Load in a bunch of partial results into your register space, rather than LDS (no contention) - // Then dump the partially reduced answers into the LDS for inter-work-item reduction. - // Using a long induction variable to make sure unsigned int overflow doesn't break things. - FPTYPE mySum = 0.; - for (long j = vecStart + lid; j < vecEnd; j+=WGSIZE) - { - unsigned int col = cols[(unsigned int)j]; - mySum += vals[(unsigned int)j] * vec[col]; - } - partialSums[lid] = mySum; - barrier(CLK_LOCAL_MEM_FENCE); - - // Reduce partial sums - // Needs to be modified if there is a change in the # of work-items per workgroup. 
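The (beta - 1.) * out_val seeding above is worth spelling out: out[row] still holds its old value y0 when the atomic adds arrive (the flag handshake ensures the first workgroup has captured y0, and under EXTENDED_PRECISION has zeroed its error slot, before any partial sums are added), so the final value is y0 + (beta - 1)*y0 + alpha*(Ax)_row = beta*y0 + alpha*(Ax)_row, exactly the y = alpha*A*x + beta*y the kernel owes. A small arithmetic check of that identity, editorial only:

    // Editorial sketch: why seeding the first workgroup with (beta - 1) * y0 gives
    // the same result as scaling the output by beta up front.
    #include <cassert>
    #include <cmath>

    int main( )
    {
        const double y0 = 2.5, beta = 0.75;
        const double partials[ 3 ] = { 0.5, 1.25, -0.25 };   // alpha*A*x pieces from 3 workgroups

        // Reference: y = beta*y0 + alpha*(Ax)_row, alpha already folded into the pieces.
        double reference = beta * y0;
        for ( double p : partials ) reference += p;

        // Kernel scheme: out[row] keeps y0; WG 0 adds (beta-1)*y0 plus its piece,
        // the other workgroups just add their pieces (atomically, on the device).
        double out = y0;
        out += ( beta - 1.0 ) * y0 + partials[ 0 ];
        out += partials[ 1 ];
        out += partials[ 2 ];

        assert( std::fabs( out - reference ) < 1e-12 );
        return 0;
    }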
- if(lid < 64) partialSums[lid] += partialSums[lid + 64] + partialSums[lid + 128] + partialSums[lid + 192]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 16) partialSums[lid] += partialSums[lid + 16] + partialSums[lid + 32] + partialSums[lid + 48]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 4) partialSums[lid] += partialSums[lid + 4] + partialSums[lid + 8] + partialSums[lid + 12]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 1) partialSums[lid] += partialSums[lid + 1] + partialSums[lid + 2] + partialSums[lid + 3]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid == 0) - atomic_add_float(&out[myRow], (alpha * partialSums[0])); - - // CSR-VECTOR on workgroups for two rows which are inefficient for CSR-Stream - myRow++; - iteration++; - } + // Load in a bunch of partial results into your register space, rather than LDS (no contention) + // Then dump the partially reduced answers into the LDS for inter-work-item reduction. + const unsigned int col = vecStart + lid; + if (row == stop_row) // inner thread, we can hardcode/unroll this loop + { + // Don't put BLOCK_MULTIPLIER*BLOCKSIZE as the stop point, because + // some GPU compilers will *aggressively* unroll this loop. + // That increases register pressure and reduces occupancy. + for (int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) + { + temp_sum = two_fma(alpha*vals[col + j], vec[cols[col + j]], temp_sum, &sumk_e); +#if 2*WG_SIZE <= BLOCK_MULTIPLIER*BLOCKSIZE + // If you can, unroll this loop once. It somewhat helps performance. + j += WG_SIZE; + temp_sum = two_fma(alpha*vals[col + j], vec[cols[col + j]], temp_sum, &sumk_e); +#endif + } + } + else + { + for(int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) + temp_sum = two_fma(alpha*vals[col + j], vec[cols[col + j]], temp_sum, &sumk_e); + } + + temp_sum = two_sum(temp_sum, sumk_e, &new_error); + partialSums[lid] = temp_sum; + + // Reduce partial sums + for (int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + barrier( CLK_LOCAL_MEM_FENCE); + temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, lid, WG_SIZE, i); + } + + if (lid == 0UL) + { + atomic_two_sum_float(&out[row], temp_sum, &new_error); + +#ifdef EXTENDED_PRECISION + unsigned int error_loc = get_num_groups(0) + first_wg_in_row + 1; + // The last half of the rowBlocks buffer is used to hold errors. + atomic_add_float(&(rowBlocks[error_loc]), new_error); + // Coordinate across all of the workgroups in this coop in order to have + // the last workgroup fix up the error values. + // If this workgroup's row is different than the next workgroup's row + // then this is the last workgroup -- it's this workgroup's job to add + // the error values into the final sum. + if (row != stop_row) + { + // Go forward once your ID is the same as the low order bits of the + // coop's first workgroup. That value will be used to store the number + // of threads that have completed so far. Once all the previous threads + // are done, it's time to send out the errors! + while((clsparse_atomic_max(&rowBlocks[first_wg_in_row], 0UL) & ((1UL << WGBITS) - 1)) != wg); + +#ifdef DOUBLE + new_error = as_double(rowBlocks[error_loc]); +#else + new_error = as_float((int)rowBlocks[error_loc]); +#endif + // Don't need to work atomically here, because this is the only workgroup + // left working on this row. + out[row] += new_error; + rowBlocks[error_loc] = 0UL; + + // Reset the rowBlocks low order bits for next time. + rowBlocks[first_wg_in_row] = rowBlocks[gid] - wg; + } + else + { + // Otherwise, increment the low order bits of the first thread in this + // coop. 
We're using this to tell how many workgroups in a coop are done.
+ // Do this with an atomic, since other threads may be doing this too.
+ const unsigned long no_warn = clsparse_atomic_inc(&rowBlocks[first_wg_in_row]);
+ }
+#endif
+ }
}
}
)"
diff --git a/src/library/kernels/csrmv_general.cl b/src/library/kernels/csrmv_general.cl
index 7d0c68f..59d36c9 100644
--- a/src/library/kernels/csrmv_general.cl
+++ b/src/library/kernels/csrmv_general.cl
@@ -51,6 +51,132 @@ R"(
# error SUBWAVE_SIZE is not a power of two!
#endif
+// Knuth's Two-Sum algorithm, which allows us to add together two floating
+// point numbers and exactly transform the answer into a sum and a
+// rounding error.
+// Inputs: x and y, the two inputs to be added together.
+// In/Out: *sumk_err, which is incremented (by reference) -- holds the
+// error value as a result of the 2sum calculation.
+// Returns: The non-corrected sum of inputs x and y.
+VALUE_TYPE two_sum( VALUE_TYPE x,
+ VALUE_TYPE y,
+ VALUE_TYPE * restrict const sumk_err)
+{
+ const VALUE_TYPE sumk_s = x + y;
+#ifdef EXTENDED_PRECISION
+ // We use this 2Sum algorithm to perform a compensated summation,
+ // which can reduce the cumulative rounding errors in our SpMV summation.
+ // Our compensated summation is based on the SumK algorithm (with K==2) from
+ // Ogita, Rump, and Oishi, "Accurate Sum and Dot Product" in
+ // SIAM J. on Scientific Computing 26(6) pp 1955-1988, Jun. 2005.
+
+ // 2Sum can be done in 6 FLOPs without a branch. However, calculating in
+ // double precision is slower than single precision on every existing GPU.
+ // As such, replacing 2Sum with Fast2Sum when using DPFP results in slightly
+ // better performance. This is especially true on non-workstation GPUs with
+ // low DPFP rates. Fast2Sum is faster even though we must ensure that
+ // |a| > |b|. Branch divergence is better than the DPFP slowdown.
+ // Thus, for DPFP, our compensated summation algorithm is actually described
+ // by both Pichat and Neumaier in "Correction d'une somme en arithmetique
+ // a virgule flottante" (J. Numerische Mathematik 19(5) pp. 400-406, 1972)
+ // and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher
+ // Summen" (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51,
+ // 1974), respectively.
+ if (fabs(x) < fabs(y))
+ {
+ const VALUE_TYPE swap = x;
+ x = y;
+ y = swap;
+ }
+ (*sumk_err) += (y - (sumk_s - x));
+ // Original 6 FLOP 2Sum algorithm.
+ //VALUE_TYPE bp = sumk_s - x;
+ //(*sumk_err) += ((x - (sumk_s - bp)) + (y - bp));
+#endif
+ return sumk_s;
+}
+
+// Performs (x_vals * x_vec) + y using an FMA.
+// Ideally, we would perform an error-free transformation here and return the
+// appropriate error. However, the EFT of an FMA is very expensive. As such,
+// if we are in EXTENDED_PRECISION mode, this function devolves into two_sum
+// with x_vals and x_vec inputs multiplied separately from the compensated add.
+VALUE_TYPE two_fma( const VALUE_TYPE x_vals,
+ const VALUE_TYPE x_vec,
+ VALUE_TYPE y,
+ VALUE_TYPE * restrict const sumk_err )
+{
+#ifdef EXTENDED_PRECISION
+ VALUE_TYPE x = x_vals * x_vec;
+ const VALUE_TYPE sumk_s = x + y;
+ if (fabs(x) < fabs(y))
+ {
+ const VALUE_TYPE swap = x;
+ x = y;
+ y = swap;
+ }
+ (*sumk_err) += (y - (sumk_s - x));
+ // 2Sum in the FMA case. Poor performance on low-DPFP GPUs.
+ //const VALUE_TYPE bp = fma(-x_vals, x_vec, sumk_s);
+ //(*sumk_err) += (fma(x_vals, x_vec, -(sumk_s - bp)) + (y - bp));
+ return sumk_s;
+#else
+ return fma(x_vals, x_vec, y);
+#endif
+}
+
+// A method of doing the final reduction without having to copy and paste
+// it a bunch of times.
+// The EXTENDED_PRECISION section is done as part of the PSum2 addition,
+// where we take temporary sums and errors for multiple threads and combine
+// them together using the same 2Sum method.
+// Inputs: cur_sum: the input from which our sum starts
+// err: the current running cascade error for this final summation
+// partial: the local memory which holds the values to sum
+// (we eventually use it to pass down temp. err vals as well)
+// lid: local ID of the work item calling this function.
+// thread_lane: The lane within this SUBWAVE for reduction.
+// round: This parallel summation method operates in multiple rounds
+// to do a parallel reduction. See the below comment for usage.
+VALUE_TYPE sum2_reduce( VALUE_TYPE cur_sum,
+ VALUE_TYPE * restrict const err,
+ volatile __local VALUE_TYPE * restrict const partial,
+ const INDEX_TYPE lid,
+ const INDEX_TYPE thread_lane,
+ const INDEX_TYPE round)
+{
+ if (SUBWAVE_SIZE > round)
+ {
+#ifdef EXTENDED_PRECISION
+ const unsigned int partial_dest = lid + round;
+ if (thread_lane < round)
+ cur_sum = two_sum(cur_sum, partial[partial_dest], err);
+ // We reuse the LDS entries to move the error values down into lower
+ // threads. This saves LDS space, allowing higher occupancy, but requires
+ // more barriers, which can reduce performance.
+ barrier(CLK_LOCAL_MEM_FENCE);
+ // Have all of those upper threads pass their temporary errors
+ // into a location that the lower threads can read.
+ if (thread_lane >= round)
+ partial[lid] = *err;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (thread_lane < round) { // Add those errors in.
+ *err += partial[partial_dest];
+ partial[lid] = cur_sum;
+ }
+#else
+ // This is the more traditional reduction algorithm. It is up to
+ // 25% faster (about 10% on average -- potentially worse on devices
+ // with low double-precision calculation rates), but can result in
+ // numerical inaccuracies, especially in single precision.
+ cur_sum += partial[lid + round];
+ barrier( CLK_LOCAL_MEM_FENCE );
+ partial[lid] = cur_sum;
+#endif
+ }
+ return cur_sum;
+}
+
// Uses macro constants:
// WAVE_SIZE - "warp size", typically 64 (AMD) or 32 (NV)
// WG_SIZE - workgroup ("block") size, 1D representation assumed
@@ -75,55 +201,56 @@ void csrmv_general ( const INDEX_TYPE num_rows,
local volatile VALUE_TYPE sdata [WG_SIZE + SUBWAVE_SIZE / 2];
//const int vectors_per_block = WG_SIZE/SUBWAVE_SIZE;
- const int global_id = get_global_id(0); // global workitem id
- const int local_id = get_local_id(0); // local workitem id
- const int thread_lane = local_id & (SUBWAVE_SIZE - 1);
- const int vector_id = global_id / SUBWAVE_SIZE; // global vector id
+ const INDEX_TYPE global_id = get_global_id(0); // global workitem id
+ const INDEX_TYPE local_id = get_local_id(0); // local workitem id
+ const INDEX_TYPE thread_lane = local_id & (SUBWAVE_SIZE - 1);
+ const INDEX_TYPE vector_id = global_id / SUBWAVE_SIZE; // global vector id
//const int vector_lane = local_id / SUBWAVE_SIZE; // vector id within the workgroup
- const int num_vectors = get_global_size(0) / SUBWAVE_SIZE;
+ const INDEX_TYPE num_vectors = get_global_size(0) / SUBWAVE_SIZE;
const VALUE_TYPE _alpha = alpha[off_alpha];
const VALUE_TYPE _beta = beta[off_beta];
for(INDEX_TYPE row = vector_id; row < num_rows; row += num_vectors)
{
- const int row_start = row_offset[row];
- const int row_end = row_offset[row+1];
- VALUE_TYPE sum = (VALUE_TYPE) 0;
+ const INDEX_TYPE row_start = row_offset[row];
+ const INDEX_TYPE row_end = row_offset[row+1];
+ VALUE_TYPE sum = 0.;
- for(int j = row_start + thread_lane; j < row_end; j += SUBWAVE_SIZE)
- {
- if (_alpha == 1)
- sum = fma(val[j], x[off_x + col[j]], sum);
- else if (_alpha == 0)
- sum = 0;
- else
- sum = fma(_alpha * val[j], x[off_x + col[j]], sum);//sum += val[j] * x[col[j]];
- }
+ VALUE_TYPE sumk_e = 0.;
+ // It is about 5% faster to always multiply by alpha, rather than to
+ // check whether alpha is 0, 1, or other and do different code paths.
+ for(INDEX_TYPE j = row_start + thread_lane; j < row_end; j += SUBWAVE_SIZE)
+ sum = two_fma(_alpha * val[j], x[off_x + col[j]], sum, &sumk_e);
+ VALUE_TYPE new_error = 0.;
+ sum = two_sum(sum, sumk_e, &new_error);
- //parllel reduction in shared memory
+ // Parallel reduction in shared memory.
sdata[local_id] = sum;
- barrier( CLK_LOCAL_MEM_FENCE );
- if (SUBWAVE_SIZE > 32) sdata[local_id] = sum += sdata[local_id + 32];
- barrier( CLK_LOCAL_MEM_FENCE );
- if (SUBWAVE_SIZE > 16) sdata[local_id] = sum += sdata[local_id + 16];
- barrier( CLK_LOCAL_MEM_FENCE );
- if (SUBWAVE_SIZE > 8) sdata[local_id] = sum += sdata[local_id + 8];
- barrier( CLK_LOCAL_MEM_FENCE );
- if (SUBWAVE_SIZE > 4) sdata[local_id] = sum += sdata[local_id + 4];
- barrier( CLK_LOCAL_MEM_FENCE );
- if (SUBWAVE_SIZE > 2) sdata[local_id] = sum += sdata[local_id + 2];
- barrier( CLK_LOCAL_MEM_FENCE );
- if (SUBWAVE_SIZE > 1) sum += sdata[local_id + 1];
+
+ // This compensated summation reduces cumulative rounding errors,
+ // which can become a problem on GPUs because our reduction order is
+ // different than what would be used on a CPU.
+ // It is based on the PSumK algorithm (with K==2) from
+ // Yamanaka, Ogita, Rump, and Oishi, "A Parallel Algorithm for
+ // Accurate Dot Product," in Parallel Computing,
+ // 34(6-8), pp. 392-410, Jul. 2008.
+ #pragma unroll
+ for (int i = (WG_SIZE >> 1); i > 0; i >>= 1)
+ {
+ barrier( CLK_LOCAL_MEM_FENCE );
+ sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, i);
+ }
if (thread_lane == 0)
{
- if (_beta == 1)
- y[off_y + row] = sum + y[off_y + row];
- else if (_beta == 0)
- y[off_y + row] = sum;
+ if (_beta == 0)
+ y[off_y + row] = sum + new_error;
else
- y[off_y + row] = sum + _beta * y[off_y + row];
+ {
+ sum = two_fma(_beta, y[off_y + row], sum, &new_error);
+ y[off_y + row] = sum + new_error;
+ }
}
}
}
diff --git a/src/library/transform/clsparse-coo2csr.cpp b/src/library/transform/clsparse-coo2csr.cpp
index ed6c84c..6a36d8f 100644
--- a/src/library/transform/clsparse-coo2csr.cpp
+++ b/src/library/transform/clsparse-coo2csr.cpp
@@ -77,7 +77,9 @@ clsparseCsrMetaSize( clsparseCsrMatrix* csrMatx, clsparseControl control )
// This allocates up front the maximum size of rowBlocks at start; likely not all the memory is used but
// this is the fastest
- pCsrMatx->rowBlockSize = 3 * ( pCsrMatx->num_nonzeros / BLKSIZE ) + 2;
+ // The formula is 3 * (NNZ / block size) + 2, but we double this because CSR-Adaptive uses the
+ // second half of the rowBlocks buffer for global reductions.
+ pCsrMatx->rowBlockSize = 6 * ( pCsrMatx->num_nonzeros / BLKSIZE ) + 4;
return clsparseSuccess;
}
@@ -100,7 +102,7 @@ clsparseCsrMetaCompute( clsparseCsrMatrix* csrMatx, clsparseControl control )
clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks );
cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize );
- ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, rowDelimiters, pCsrMatx->num_rows, BLKSIZE );
+ ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, rowDelimiters, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, true );
return clsparseSuccess;
}
diff --git a/src/tests/resources/csr_matrix_environment.h b/src/tests/resources/csr_matrix_environment.h
index f5df9fa..74fc5d5 100644
--- a/src/tests/resources/csr_matrix_environment.h
+++ b/src/tests/resources/csr_matrix_environment.h
@@ -73,7 +73,7 @@ class CSREnvironment: public ::testing::Environment
csrSMatrix.rowOffsets = ::clCreateBuffer( context, CL_MEM_READ_ONLY,
( csrSMatrix.num_rows + 1 ) * sizeof( cl_int ), NULL, &status );
- csrSMatrix.rowBlocks = ::clCreateBuffer( context, CL_MEM_READ_ONLY,
+ csrSMatrix.rowBlocks = ::clCreateBuffer( context, CL_MEM_READ_WRITE,
csrSMatrix.rowBlockSize * sizeof( cl_ulong ), NULL, &status );
clsparseStatus fileError = clsparseSCsrMatrixfromFile( &csrSMatrix, file_name.c_str( ), CLSE::control );
diff --git a/src/tests/test-blas2.cpp b/src/tests/test-blas2.cpp
index d72b5c0..25c12c3 100644
--- a/src/tests/test-blas2.cpp
+++ b/src/tests/test-blas2.cpp
@@ -26,12 +26,14 @@
#include
#include
+// ULP calculation
+#include
clsparseControl ClSparseEnvironment::control = NULL;
cl_command_queue ClSparseEnvironment::queue = NULL;
cl_context ClSparseEnvironment::context = NULL;
//cl_uint ClSparseEnvironment::N = 1024;
-
+static cl_bool extended_precision = false;
namespace po = boost::program_options;
namespace uBLAS = boost::numeric::ublas;
@@ -104,11 +106,35 @@ class Blas2 : public ::testing::Test
}
+ // Knuth's Two-Sum algorithm, which allows us to add together two floating
+ // point numbers and exactly transform the answer into a sum and a
+ // rounding error.
+ // Inputs: x and y, the two inputs to be added together.
+ // In/Out: *sumk_err, which is incremented (by reference) -- holds the
+ // error value as a result of the 2sum calculation.
+ // Returns: The non-corrected sum of inputs x and y.
+ T two_sum(T x, T y, T *sumk_err)
+ {
+ // We use this 2Sum algorithm to perform a compensated summation,
+ // which can reduce the cumulative rounding errors in our SpMV
+ // summation. Our compensated summation is based on the SumK algorithm
+ // (with K==2) from Ogita, Rump, and Oishi, "Accurate Sum and Dot
+ // Product" in SIAM J. on Scientific Computing 26(6) pp 1955-1988,
+ // Jun. 2005.
+ T sumk_s = x + y;
+ T bp = sumk_s - x;
+ (*sumk_err) += ((x - (sumk_s - bp)) + (y - bp));
+ return sumk_s;
+ }
+
void test_csrmv()
{
clsparseStatus status;
cl_int cl_status;
+ if (extended_precision)
+ clsparseEnableExtendedPrecision(CLSE::control, true);
+
if (typeid(T) == typeid(cl_float) )
{
status = clsparseScsrmv(&gAlpha, &CSRE::csrSMatrix, &gX,
@@ -121,21 +147,70 @@ class Blas2 : public ::testing::Test
int* cols = &CSRE::ublasSCsr.index2_data()[0];
for (int row = 0; row < CSRE::n_rows; row++)
{
+ // Summation done at a higher precision to decrease
+ // summation errors from rounding.
hY[row] *= hBeta;
int row_end = rows[row+1];
+ double temp_sum;
+ temp_sum = hY[row];
for (int i = rows[row]; i < rows[row+1]; i++)
- hY[row] += hAlpha * vals[i] * hX[cols[i]];
+ {
+ // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]];
+ temp_sum += hAlpha * vals[i] * hX[cols[i]];
+ }
+ hY[row] = temp_sum;
}
-
T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values,
CL_TRUE, CL_MAP_READ,
0, gY.num_values * sizeof(T),
0, nullptr, nullptr, &cl_status);
ASSERT_EQ(CL_SUCCESS, cl_status);
+ uint64_t max_ulps = 0;
+ uint64_t min_ulps = UINT64_MAX;
+ uint64_t total_ulps = 0;
for (int i = 0; i < hY.size(); i++)
- ASSERT_NEAR(hY[i], host_result[i], fabs(hY[i]*1e-3));
+ {
+ long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]);
+ intDiff = llabs(intDiff);
+ total_ulps += intDiff;
+ if (max_ulps < intDiff)
+ max_ulps = intDiff;
+ if (min_ulps > intDiff)
+ min_ulps = intDiff;
+ // Debug printouts.
+ //printf("Row %d Float Ulps: %lld\n", i, intDiff);
+ //printf("\tFloat hY[%d] = %.*e (0x%08" PRIx32 "), ", i, 9, hY[i], *(uint32_t *)&hY[i]);
+ //printf("host_result[%d] = %.*e (0x%08" PRIx32 ")\n", i, 9, host_result[i], *(uint32_t *)&host_result[i]);
+ }
+ if (extended_precision)
+ {
+ printf("Float Min ulps: %" PRIu64 "\n", min_ulps);
+ printf("Float Max ulps: %" PRIu64 "\n", max_ulps);
+ printf("Float Total ulps: %" PRIu64 "\n", total_ulps);
+ printf("Float Average ulps: %f (Size: %lu)\n", (double)total_ulps/(double)hY.size(), hY.size());
+ }
+
+ for (int i = 0; i < hY.size(); i++)
+ {
+ double compare_val = 0.;
+ if (extended_precision)
+ {
+ // The limit here is somewhat weak because some GPUs don't
+ // support correctly rounded denorms in SPFP mode.
+ if (boost::math::isnormal(hY[i]))
+ compare_val = fabs(hY[i]*1e-3);
+ }
+ else
+ {
+ if (boost::math::isnormal(hY[i]))
+ compare_val = fabs(hY[i]*0.1);
+ }
+ if (compare_val < 10*FLT_EPSILON)
+ compare_val = 10*FLT_EPSILON;
+ ASSERT_NEAR(hY[i], host_result[i], compare_val);
+ }
cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values,
host_result,
0, nullptr, nullptr);
@@ -154,10 +229,22 @@ class Blas2 : public ::testing::Test
int* cols = &CSRE::ublasDCsr.index2_data()[0];
for (int row = 0; row < CSRE::n_rows; row++)
{
+ // Summation done using a compensated summation to decrease
+ // summation errors from rounding.
This allows us to get + // smaller errors without requiring quad precision support. + // This method is like performing summation at quad precision and + // casting down to double in the end. hY[row] *= hBeta; int row_end = rows[row+1]; + double temp_sum; + temp_sum = hY[row]; + T sumk_err = 0.; for (int i = rows[row]; i < rows[row+1]; i++) - hY[row] += hAlpha * vals[i] * hX[cols[i]]; + { + // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]]; + temp_sum = two_sum(temp_sum, hAlpha*vals[i]*hX[cols[i]], &sumk_err); + } + hY[row] = temp_sum + sumk_err; } T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values, @@ -166,14 +253,64 @@ class Blas2 : public ::testing::Test 0, nullptr, nullptr, &cl_status); ASSERT_EQ(CL_SUCCESS, cl_status); + uint64_t max_ulps = 0; + uint64_t min_ulps = ULLONG_MAX; + uint64_t total_ulps = 0; for (int i = 0; i < hY.size(); i++) - ASSERT_NEAR(hY[i], host_result[i], fabs(hY[i]*1e-10)); + { + long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]); + intDiff = llabs(intDiff); + total_ulps += intDiff; + if (max_ulps < intDiff) + max_ulps = intDiff; + if (min_ulps > intDiff) + min_ulps = intDiff; + // Debug printouts. + //printf("Row %d Double Ulps: %lld\n", i, intDiff); + //printf("\tDouble hY[%d] = %.*e (0x%016" PRIx64 "), ", i, 17, hY[i], *(uint64_t *)&hY[i]); + //printf("host_result[%d] = %.*e (0x%016" PRIx64 ")\n", i, 17, host_result[i], *(uint64_t *)&host_result[i]); + } + if (extended_precision) + { + printf("Double Min ulps: %" PRIu64 "\n", min_ulps); + printf("Double Max ulps: %" PRIu64 "\n", max_ulps); + printf("Double Total ulps: %" PRIu64 "\n", total_ulps); + printf("Double Average ulps: %f (Size: %lu)\n", (double)total_ulps/(double)hY.size(), hY.size()); + + for (int i = 0; i < hY.size(); i++) + { + double compare_val = fabs(hY[i]*1e-14); + if (compare_val < 10*DBL_EPSILON) + compare_val = 10*DBL_EPSILON; + ASSERT_NEAR(hY[i], host_result[i], compare_val); + } + } + else + { + for (int i = 0; i < hY.size(); i++) + { + double compare_val = 0.; + if (boost::math::isnormal(hY[i])) + compare_val = fabs(hY[i]*0.1); + if (compare_val < 10*DBL_EPSILON) + compare_val = 10*DBL_EPSILON; + ASSERT_NEAR(hY[i], host_result[i], compare_val); + } + } cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values, host_result, 0, nullptr, nullptr); ASSERT_EQ(CL_SUCCESS, cl_status); } - + // Reset output buffer for next test. + ::clReleaseMemObject(gY.values); + clsparseInitVector(&gY); + gY.values = clCreateBuffer(CLSE::context, + CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, + hY.size() * sizeof(T), hY.data().begin(), + &cl_status); + gY.num_values = hY.size(); + ASSERT_EQ(CL_SUCCESS, cl_status); } uBLAS::vector hX; @@ -198,13 +335,7 @@ TYPED_TEST_CASE(Blas2, TYPES); TYPED_TEST(Blas2, csrmv_adaptive) { - if ( typeid(TypeParam) == typeid(cl_float) ) - this->test_csrmv(); - else - { -// std::cerr << "Adaptive version of csrmv might crash!" 
<< std::endl; - this->test_csrmv(); - } + this->test_csrmv(); } TYPED_TEST(Blas2, csrmv_vector) @@ -237,7 +368,7 @@ TYPED_TEST(Blas2, csrmv_vector) ASSERT_EQ(clsparseSuccess, status); CSRE::csrSMatrix.rowBlocks = - ::clCreateBuffer( CLSE::context, CL_MEM_READ_ONLY, + ::clCreateBuffer( CLSE::context, CL_MEM_READ_WRITE, CSRE::csrSMatrix.rowBlockSize * sizeof( cl_ulong ), NULL, &cl_status ); @@ -278,8 +409,8 @@ int main (int argc, char* argv[]) ("alpha,a", po::value(&alpha)->default_value(1.0), "Alpha parameter for eq: \n\ty = alpha * M * x + beta * y") ("beta,b", po::value(&beta)->default_value(1.0), - "Beta parameter for eq: \n\ty = alpha * M * x + beta * y"); - + "Beta parameter for eq: \n\ty = alpha * M * x + beta * y") + ("extended,e", po::bool_switch()->default_value(false), "Use compensated summation to improve accuracy by emulating extended precision."); po::variables_map vm; po::parsed_options parsed = @@ -323,6 +454,9 @@ int main (int argc, char* argv[]) } + if (vm["extended"].as()) + extended_precision = true; + ::testing::InitGoogleTest(&argc, argv); //order does matter! ::testing::AddGlobalTestEnvironment( new CLSE(pID, dID));
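
Editor's note (illustration only, not part of the patch): the EXTENDED_PRECISION path above is built on Knuth's 2Sum, i.e. a compensated summation (SumK with K==2) in which every addition also captures its own rounding error and the accumulated error is folded back into the result at the end. The short host-side C++ sketch below demonstrates the technique in isolation; the function and variable names here are illustrative and are not taken from clSPARSE.

#include <cstdio>
#include <vector>

// Knuth's 2Sum: returns round(x + y) and accumulates the exact rounding
// error of that addition into *err (6 FLOPs, no branch).
static double two_sum(double x, double y, double *err)
{
    const double s = x + y;
    const double bp = s - x;
    *err += (x - (s - bp)) + (y - bp);
    return s;
}

int main()
{
    // One large term followed by many terms that are individually too small
    // to change the running sum in plain double arithmetic.
    std::vector<double> v(1000, 1e-16);
    v.insert(v.begin(), 1.0);

    double plain = 0.0, comp = 0.0, err = 0.0;
    for (double x : v)
    {
        plain += x;                    // small terms are rounded away
        comp = two_sum(comp, x, &err); // their contribution is kept in err
    }
    comp += err; // fold the accumulated error back in, as the kernels do

    std::printf("plain       = %.17g\n", plain); // prints 1
    std::printf("compensated = %.17g\n", comp);  // prints ~1.0000000000001
    return 0;
}

At runtime the kernels' compensation is toggled through the clsparseEnableExtendedPrecision() control call, which the new -e/--extended command-line switch added in this patch turns on.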