Skip to content

Adding Compensated Summation to SpM-DV #123

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
ad7144c
Minor bug fixes. In particular, the change to computeRowBlocks preven…
jlgreathouse Jul 20, 2015
774d357
Reworked how CPU-side SpMV is performed in order to increase its fide…
jlgreathouse Jul 20, 2015
3ad5527
Added compensated summation to csrmv_general. Assuming there is no fl…
jlgreathouse Jul 20, 2015
7670486
Adding compensated summation support to CSR-Adaptive. This requires e…
jlgreathouse Jul 20, 2015
c4d4f98
Merge branch 'extended_precision' into newer_adaptive
jlgreathouse Jul 31, 2015
01a68bc
Minor code cleanup modifications. Moved CSR-Adaptive configuration pa…
jlgreathouse Jul 31, 2015
2475bc3
New modifications to CSR-Adaptive in order to allow a single workgrou…
jlgreathouse Jul 31, 2015
430a7aa
Modifications to the CSR-Stream case. In particular, a faster paralle…
jlgreathouse Jul 31, 2015
54108c7
Changed CSR-Adaptive to improve the CSR-Stream performance. Calculate…
jlgreathouse Jul 31, 2015
021d3d6
Removing a function that is no longer in use.
jlgreathouse Aug 1, 2015
25b5737
Change row block generation function to break row blocks that are a s…
jlgreathouse Aug 1, 2015
a52ad84
Merge remote-tracking branch 'upstream/develop' into newer_adaptive
jlgreathouse Aug 1, 2015
ebf357c
Code cleanup and modifications to improve readability and performance…
jlgreathouse Aug 3, 2015
b23c29c
Changing the csrmv_adaptive configuration parameter from sending 2 ro…
jlgreathouse Aug 3, 2015
60c30c1
Merge remote-tracking branch 'upstream/develop' into newer_adaptive
jlgreathouse Aug 3, 2015
5ae4468
Adding consts on a bunch of constant variables. Changing some data ty…
jlgreathouse Aug 11, 2015
7daac88
Adding floating point FMA into CSR-Adaptive. This results in minor pe…
jlgreathouse Aug 11, 2015
0e773e2
Changing around how CSR-Stream reduction is performed. Removing a lot…
jlgreathouse Aug 11, 2015
30f32f8
Made changes to preprocessor macros to align with csrmv_general. Basi…
jlgreathouse Aug 11, 2015
05cfc0d
Changes to allow us to use double precision even when the system does…
jlgreathouse Aug 12, 2015
dd85239
Adding FMAs to CSR-General and some consts and other type correctness.
jlgreathouse Aug 12, 2015
5bd0e0c
Merge remote-tracking branch 'upstream/develop' into newer_adaptive
jlgreathouse Aug 13, 2015
5084268
Making extended precision (compensated summation) part of the clSpars…
jlgreathouse Aug 13, 2015
6900254
Extending clsparse-bench to allow command-line control of compensated…
jlgreathouse Aug 13, 2015
ee78e57
Minor modifications to test-blas2 so that it works OK even though GPU…
jlgreathouse Aug 13, 2015
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ template <typename T>
class xSpMdV: public clsparseFunc
{
public:
xSpMdV( PFCLSPARSETIMER sparseGetTimer, size_t profileCount, cl_device_type devType ): clsparseFunc( devType, CL_QUEUE_PROFILING_ENABLE ), gpuTimer( nullptr ), cpuTimer( nullptr )
xSpMdV( PFCLSPARSETIMER sparseGetTimer, size_t profileCount, cl_bool extended_precision, cl_device_type devType ): clsparseFunc( devType, CL_QUEUE_PROFILING_ENABLE ), gpuTimer( nullptr ), cpuTimer( nullptr )
{
// Create and initialize our timer class, if the external timer shared library loaded
if( sparseGetTimer )
Expand All @@ -42,6 +42,7 @@ class xSpMdV: public clsparseFunc
cpuTimerID = cpuTimer->getUniqueID( "CPU xSpMdV", 0 );
}

clsparseEnableExtendedPrecision( control, extended_precision );

clsparseEnableAsync( control, false );
}
Expand Down
14 changes: 9 additions & 5 deletions src/benchmarks/clsparse-bench/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,14 +120,15 @@ int main( int argc, char *argv[ ] )
desc.add_options( )
( "help,h", "produces this help message" )
( "dirpath,d", po::value( &root_dir ), "Matrix directory" )
( "alpha", po::value<cl_double>( &alpha )->default_value( 1.0f ), "specifies the scalar alpha" )
( "beta", po::value<cl_double>( &beta )->default_value( 0.0f ), "specifies the scalar beta" )
( "alpha,a", po::value<cl_double>( &alpha )->default_value( 1.0f ), "specifies the scalar alpha" )
( "beta,b", po::value<cl_double>( &beta )->default_value( 0.0f ), "specifies the scalar beta" )
( "rows", po::value<size_t>( &rows )->default_value( 16 ), "specifies the number of rows for matrix data" )
( "columns", po::value<size_t>( &columns )->default_value( 16 ), "specifies the number of columns for matrix data" )
( "function,f", po::value<std::string>( &function )->default_value( "SpMdV" ), "Sparse functions to test. Options: "
"SpMdV, SpMdM, CG, BiCGStab, Csr2Dense, Dense2Csr, Csr2Coo, Coo2Csr" )
( "precision,r", po::value<std::string>( &precision )->default_value( "s" ), "Options: s,d,c,z" )
( "profile,p", po::value<size_t>( &profileCount )->default_value( 20 ), "Time and report the kernel speed (default: profiling off)" )
( "profile,p", po::value<size_t>( &profileCount )->default_value( 20 ), "Number of times to run the desired test function" )
( "extended,e", po::bool_switch()->default_value(false), "Use compensated summation to improve accuracy by emulating extended precision" )
;

po::variables_map vm;
Expand Down Expand Up @@ -160,6 +161,9 @@ int main( int argc, char *argv[ ] )
std::cerr << "Could not find the external timing library; timings disabled" << std::endl;
}

cl_bool extended_precision = false;
if (vm["extended"].as<bool>())
extended_precision = true;

// Timer module discovered and loaded successfully
// Initialize function pointers to call into the shared module
Expand All @@ -170,9 +174,9 @@ int main( int argc, char *argv[ ] )
if( boost::iequals( function, "SpMdV" ) )
{
if( precision == "s" )
my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU ) );
my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< float >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU ) );
else if( precision == "d" )
my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU ) );
my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< double >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU ) );
else
{
std::cerr << "Unknown spmdv precision" << std::endl;
Expand Down
3 changes: 3 additions & 0 deletions src/include/clSPARSE.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ extern "C" {
CLSPARSE_EXPORT clsparseStatus
clsparseEnableAsync( clsparseControl control, cl_bool async );

//enable/disable the use of compensated summation (emulated extended precision)
CLSPARSE_EXPORT clsparseStatus
clsparseEnableExtendedPrecision( clsparseControl control, cl_bool extended_precision );

//setup events to sync
//TODO:: NOT WORKING! NDRange throws Failure
Expand Down
20 changes: 6 additions & 14 deletions src/library/blas2/clsparse-csrmv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,11 @@ csrmv (const clsparseScalarPrivate *pAlpha,
return clsparseStructInvalid;
}

// We have problems with failing test cases with csrmv_adaptive on double precision
// fall back to csrmv_vector
if( typeid( T ) == typeid( cl_double ) )
{
return csrmv_vector<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );
}
// Use this for csrmv_general instead of adaptive.
//return csrmv_vector<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );

// Call adaptive CSR kernels
return csrmv_adaptive<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );
// Call adaptive CSR kernels
return csrmv_adaptive<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );

}
}
Expand Down Expand Up @@ -83,12 +79,8 @@ csrmv (const clsparse::array_base<T>& pAlpha,
return clsparseStructInvalid;
}

// We have problems with failing test cases with csrmv_adaptive on double precision
// fall back to csrmv_vector
if( typeid( T ) == typeid( cl_double ) )
{
return csrmv_vector<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );
}
// Use this for csrmv_general instead of adaptive.
//return csrmv_vector<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );

// Call adaptive CSR kernels
return csrmv_adaptive<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );
Expand Down
70 changes: 54 additions & 16 deletions src/library/blas2/csrmv-adaptive.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,19 +39,33 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha,
const cl_uint group_size = 256;

std::string params = std::string( )
+ " -DINDEX_TYPE=uint"
+ " -DROWBITS=" + std::to_string( ROW_BITS )
+ " -DWGBITS=" + std::to_string( WG_BITS )
+ " -DBLOCKSIZE=" + std::to_string( BLKSIZE );
#ifdef DOUBLE
buildFlags += " -DDOUBLE";
#endif
+ " -DWG_SIZE=" + std::to_string( group_size )
+ " -DBLOCKSIZE=" + std::to_string( BLKSIZE )
+ " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER )
+ " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR );

std::string options;
if(typeid(T) == typeid(cl_double))
{
std::string options = std::string() + " -DDOUBLE";
params.append(options);
}

options = std::string() + " -DVALUE_TYPE=double -DDOUBLE";
else if(typeid(T) == typeid(cl_float))
options = std::string() + " -DVALUE_TYPE=float";
else if(typeid(T) == typeid(cl_uint))
options = std::string() + " -DVALUE_TYPE=uint";
else if(typeid(T) == typeid(cl_int))
options = std::string() + " -DVALUE_TYPE=int";
else if(typeid(T) == typeid(cl_ulong))
options = std::string() + " -DVALUE_TYPE=ulong -DLONG";
else if(typeid(T) == typeid(cl_long))
options = std::string() + " -DVALUE_TYPE=long -DLONG";
else
return clsparseInvalidKernelArgs;

if(control->extended_precision)
options += " -DEXTENDED_PRECISION";
params.append(options);

cl::Kernel kernel = KernelCache::get( control->queue,
"csrmv_adaptive",
Expand All @@ -70,7 +84,10 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha,
// if NVIDIA is used it does not allow to run the group size
// which is not a multiplication of group_size. Don't know if that
// have an impact on performance
cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size;
// Setting global work size to half the row block size because we are only
// using half the row blocks buffer for actual work.
// The other half is used for the extended precision reduction.
cl_uint global_work_size = ( (pCsrMatx->rowBlockSize/2) - 1 ) * group_size;
cl::NDRange local( group_size );
cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] );

Expand Down Expand Up @@ -102,15 +119,33 @@ csrmv_adaptive( const clsparse::array_base<T>& pAlpha,
const cl_uint group_size = 256;

std::string params = std::string( )
+ " -DINDEX_TYPE=uint"
+ " -DROWBITS=" + std::to_string( ROW_BITS )
+ " -DWGBITS=" + std::to_string( WG_BITS )
+ " -DBLOCKSIZE=" + std::to_string( BLKSIZE );
+ " -DWG_SIZE=" + std::to_string( group_size )
+ " -DBLOCKSIZE=" + std::to_string( BLKSIZE )
+ " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER )
+ " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR );

std::string options;
if(typeid(T) == typeid(cl_double))
{
std::string options = std::string() + " -DDOUBLE";
params.append(options);
}
options = std::string() + " -DVALUE_TYPE=double -DDOUBLE";
else if(typeid(T) == typeid(cl_float))
options = std::string() + " -DVALUE_TYPE=float";
else if(typeid(T) == typeid(cl_uint))
options = std::string() + " -DVALUE_TYPE=uint";
else if(typeid(T) == typeid(cl_int))
options = std::string() + " -DVALUE_TYPE=int";
else if(typeid(T) == typeid(cl_ulong))
options = std::string() + " -DVALUE_TYPE=ulong -DLONG";
else if(typeid(T) == typeid(cl_long))
options = std::string() + " -DVALUE_TYPE=long -DLONG";
else
return clsparseInvalidKernelArgs;

if(control->extended_precision)
options += " -DEXTENDED_PRECISION";
params.append(options);

cl::Kernel kernel = KernelCache::get( control->queue,
"csrmv_adaptive",
Expand All @@ -129,7 +164,10 @@ csrmv_adaptive( const clsparse::array_base<T>& pAlpha,
// if NVIDIA is used it does not allow to run the group size
// which is not a multiplication of group_size. Don't know if that
// have an impact on performance
cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size;
// Setting global work size to half the row block size because we are only
// using half the row blocks buffer for actual work.
// The other half is used for the extended precision reduction.
cl_uint global_work_size = ( (pCsrMatx->rowBlockSize/2) - 1 ) * group_size;
cl::NDRange local( group_size );
cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] );

Expand Down
8 changes: 6 additions & 2 deletions src/library/blas2/csrmv-vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,16 @@ csrmv_vector(const clsparseScalarPrivate* pAlpha,
if (nnz_per_row < 8) { subwave_size = 4; }
if (nnz_per_row < 4) { subwave_size = 2; }

const std::string params = std::string() +
std::string params = std::string() +
"-DINDEX_TYPE=" + OclTypeTraits<cl_int>::type
+ " -DVALUE_TYPE=" + OclTypeTraits<T>::type
+ " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
+ " -DWG_SIZE=" + std::to_string(group_size)
+ " -DWAVE_SIZE=" + std::to_string(wave_size)
+ " -DSUBWAVE_SIZE=" + std::to_string(subwave_size);

if(control->extended_precision)
params += " -DEXTENDED_PRECISION";

cl::Kernel kernel = KernelCache::get(control->queue,
"csrmv_general",
Expand Down Expand Up @@ -124,14 +126,16 @@ csrmv_vector(const clsparse::array_base<T>& pAlpha,
if (nnz_per_row < 8) { subwave_size = 4; }
if (nnz_per_row < 4) { subwave_size = 2; }

const std::string params = std::string() +
std::string params = std::string() +
"-DINDEX_TYPE=" + OclTypeTraits<cl_int>::type
+ " -DVALUE_TYPE=" + OclTypeTraits<T>::type
+ " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
+ " -DWG_SIZE=" + std::to_string(group_size)
+ " -DWAVE_SIZE=" + std::to_string(wave_size)
+ " -DSUBWAVE_SIZE=" + std::to_string(subwave_size);

if(control->extended_precision)
params += " -DEXTENDED_PRECISION";

cl::Kernel kernel = KernelCache::get(control->queue,
"csrmv_general",
Expand Down
2 changes: 2 additions & 0 deletions src/library/include/clSPARSE-private.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,7 @@
const cl_uint WG_BITS = 24;
const cl_uint ROW_BITS = 32;
const cl_uint BLKSIZE = 1024;
const cl_uint BLOCK_MULTIPLIER = 3;
const cl_uint ROWS_FOR_VECTOR = 1;

#endif
13 changes: 13 additions & 0 deletions src/library/internal/clsparse-control.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ clsparseCreateControl( cl_command_queue queue, clsparseStatus *status )
control->wavefront_size = 0;
control->max_wg_size = 0;
control->async = false;
control->extended_precision = false;

collectEnvParams( control );

Expand Down Expand Up @@ -144,6 +145,18 @@ clsparseEnableAsync( clsparseControl control, cl_bool async )
return clsparseSuccess;
}

clsparseStatus
clsparseEnableExtendedPrecision( clsparseControl control, cl_bool extended_precision )
{
    // A valid control object is required before any state can be changed.
    if( control == NULL )
        return clsparseInvalidControlObject;

    // Record whether SpM-dV kernels should emulate extended precision
    // via compensated summation; the kernel builders read this flag.
    control->extended_precision = extended_precision;

    return clsparseSuccess;
}

clsparseStatus
clsparseReleaseControl( clsparseControl control )
{
Expand Down
3 changes: 3 additions & 0 deletions src/library/internal/clsparse-control.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ struct _clsparseControl
size_t wavefront_size;
size_t max_wg_size;

// Should we attempt to perform compensated summation?
cl_bool extended_precision;

// current device max compute units;
cl_uint max_compute_units;

Expand Down
Loading