Skip to content

Adding Compensated Summation to SpM-DV #123

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
ad7144c
Minor bug fixes. In particular, the change to computeRowBlocks preven…
jlgreathouse Jul 20, 2015
774d357
Reworked how CPU-side SpMV is performed in order to increase its fide…
jlgreathouse Jul 20, 2015
3ad5527
Added compensated summation to csrmv_general. Assuming there is no fl…
jlgreathouse Jul 20, 2015
7670486
Adding compensated summation support to CSR-Adaptive. This requires e…
jlgreathouse Jul 20, 2015
c4d4f98
Merge branch 'extended_precision' into newer_adaptive
jlgreathouse Jul 31, 2015
01a68bc
Minor code cleanup modifications. Moved CSR-Adaptive configuration pa…
jlgreathouse Jul 31, 2015
2475bc3
New modifications to CSR-Adaptive in order to allow a single workgrou…
jlgreathouse Jul 31, 2015
430a7aa
Modifications to the CSR-Stream case. In particular, a faster paralle…
jlgreathouse Jul 31, 2015
54108c7
Changed CSR-Adaptive to improve the CSR-Stream performance. Calculate…
jlgreathouse Jul 31, 2015
021d3d6
Removing a function that is no longer in use.
jlgreathouse Aug 1, 2015
25b5737
Change row block generation function to break row blocks that are a s…
jlgreathouse Aug 1, 2015
a52ad84
Merge remote-tracking branch 'upstream/develop' into newer_adaptive
jlgreathouse Aug 1, 2015
ebf357c
Code cleanup and modifications to improve readability and performance…
jlgreathouse Aug 3, 2015
b23c29c
Changing the csrmv_adaptive configuration parameter from sending 2 ro…
jlgreathouse Aug 3, 2015
60c30c1
Merge remote-tracking branch 'upstream/develop' into newer_adaptive
jlgreathouse Aug 3, 2015
5ae4468
Adding consts on a bunch of constant variables. Changing some data ty…
jlgreathouse Aug 11, 2015
7daac88
Adding floating point FMA into CSR-Adaptive. This results in minor pe…
jlgreathouse Aug 11, 2015
0e773e2
Changing around how CSR-Stream reduction is performed. Removing a lot…
jlgreathouse Aug 11, 2015
30f32f8
Made changes to preprocessor macros to align with csrmv_general. Basi…
jlgreathouse Aug 11, 2015
05cfc0d
Changes to allow us to use double precision even when the system does…
jlgreathouse Aug 12, 2015
dd85239
Adding FMAs to CSR-General and some consts and other type correctness.
jlgreathouse Aug 12, 2015
5bd0e0c
Merge remote-tracking branch 'upstream/develop' into newer_adaptive
jlgreathouse Aug 13, 2015
5084268
Making extended precision (compensated summation) part of the clSpars…
jlgreathouse Aug 13, 2015
6900254
Extending clsparse-bench to allow command-line control of compensated…
jlgreathouse Aug 13, 2015
ee78e57
Minor modifications to test-blas2 so that it works OK even though GPU…
jlgreathouse Aug 13, 2015
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ template <typename T>
class xSpMdV: public clsparseFunc
{
public:
xSpMdV( PFCLSPARSETIMER sparseGetTimer, size_t profileCount, cl_device_type devType ): clsparseFunc( devType, CL_QUEUE_PROFILING_ENABLE ), gpuTimer( nullptr ), cpuTimer( nullptr )
xSpMdV( PFCLSPARSETIMER sparseGetTimer, size_t profileCount, cl_bool extended_precision, cl_device_type devType ): clsparseFunc( devType, CL_QUEUE_PROFILING_ENABLE ), gpuTimer( nullptr ), cpuTimer( nullptr )
{
// Create and initialize our timer class, if the external timer shared library loaded
if( sparseGetTimer )
Expand All @@ -42,6 +42,7 @@ class xSpMdV: public clsparseFunc
cpuTimerID = cpuTimer->getUniqueID( "CPU xSpMdV", 0 );
}

clsparseEnableExtendedPrecision( control, extended_precision );

clsparseEnableAsync( control, false );
}
Expand Down
14 changes: 9 additions & 5 deletions src/benchmarks/clsparse-bench/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,14 +120,15 @@ int main( int argc, char *argv[ ] )
desc.add_options( )
( "help,h", "produces this help message" )
( "dirpath,d", po::value( &root_dir ), "Matrix directory" )
( "alpha", po::value<cl_double>( &alpha )->default_value( 1.0f ), "specifies the scalar alpha" )
( "beta", po::value<cl_double>( &beta )->default_value( 0.0f ), "specifies the scalar beta" )
( "alpha,a", po::value<cl_double>( &alpha )->default_value( 1.0f ), "specifies the scalar alpha" )
( "beta,b", po::value<cl_double>( &beta )->default_value( 0.0f ), "specifies the scalar beta" )
( "rows", po::value<size_t>( &rows )->default_value( 16 ), "specifies the number of rows for matrix data" )
( "columns", po::value<size_t>( &columns )->default_value( 16 ), "specifies the number of columns for matrix data" )
( "function,f", po::value<std::string>( &function )->default_value( "SpMdV" ), "Sparse functions to test. Options: "
"SpMdV, SpMdM, CG, BiCGStab, Csr2Dense, Dense2Csr, Csr2Coo, Coo2Csr" )
( "precision,r", po::value<std::string>( &precision )->default_value( "s" ), "Options: s,d,c,z" )
( "profile,p", po::value<size_t>( &profileCount )->default_value( 20 ), "Time and report the kernel speed (default: profiling off)" )
( "profile,p", po::value<size_t>( &profileCount )->default_value( 20 ), "Number of times to run the desired test function" )
( "extended,e", po::bool_switch()->default_value(false), "Use compensated summation to improve accuracy by emulating extended precision" )
;

po::variables_map vm;
Expand Down Expand Up @@ -160,6 +161,9 @@ int main( int argc, char *argv[ ] )
std::cerr << "Could not find the external timing library; timings disabled" << std::endl;
}

cl_bool extended_precision = false;
if (vm["extended"].as<bool>())
extended_precision = true;

// Timer module discovered and loaded successfully
// Initialize function pointers to call into the shared module
Expand All @@ -170,9 +174,9 @@ int main( int argc, char *argv[ ] )
if( boost::iequals( function, "SpMdV" ) )
{
if( precision == "s" )
my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU ) );
my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< float >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU ) );
else if( precision == "d" )
my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU ) );
my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< double >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU ) );
else
{
std::cerr << "Unknown spmdv precision" << std::endl;
Expand Down
3 changes: 3 additions & 0 deletions src/include/clSPARSE.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ extern "C" {
CLSPARSE_EXPORT clsparseStatus
clsparseEnableAsync( clsparseControl control, cl_bool async );

//enable/disable the use of compensated summation (emulated extended precision)
CLSPARSE_EXPORT clsparseStatus
clsparseEnableExtendedPrecision( clsparseControl control, cl_bool extended_precision );

//setup events to sync
//TODO:: NOT WORKING! NDRange throws Failure
Expand Down
20 changes: 6 additions & 14 deletions src/library/blas2/clsparse-csrmv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,11 @@ csrmv (const clsparseScalarPrivate *pAlpha,
return clsparseStructInvalid;
}

// We have problems with failing test cases with csrmv_adaptive on double precision
// fall back to csrmv_vector
if( typeid( T ) == typeid( cl_double ) )
{
return csrmv_vector<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );
}
// Use this for csrmv_general instead of adaptive.
//return csrmv_vector<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );

// Call adaptive CSR kernels
return csrmv_adaptive<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );
// Call adaptive CSR kernels
return csrmv_adaptive<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );

}
}
Expand Down Expand Up @@ -83,12 +79,8 @@ csrmv (const clsparse::array_base<T>& pAlpha,
return clsparseStructInvalid;
}

// We have problems with failing test cases with csrmv_adaptive on double precision
// fall back to csrmv_vector
if( typeid( T ) == typeid( cl_double ) )
{
return csrmv_vector<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );
}
// Use this for csrmv_general instead of adaptive.
//return csrmv_vector<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );

// Call adaptive CSR kernels
return csrmv_adaptive<T>( pAlpha, pCsrMatx, pX, pBeta, pY, control );
Expand Down
70 changes: 54 additions & 16 deletions src/library/blas2/csrmv-adaptive.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,19 +39,33 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha,
const cl_uint group_size = 256;

std::string params = std::string( )
+ " -DINDEX_TYPE=uint"
+ " -DROWBITS=" + std::to_string( ROW_BITS )
+ " -DWGBITS=" + std::to_string( WG_BITS )
+ " -DBLOCKSIZE=" + std::to_string( BLKSIZE );
#ifdef DOUBLE
buildFlags += " -DDOUBLE";
#endif
+ " -DWG_SIZE=" + std::to_string( group_size )
+ " -DBLOCKSIZE=" + std::to_string( BLKSIZE )
+ " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER )
+ " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR );

std::string options;
if(typeid(T) == typeid(cl_double))
{
std::string options = std::string() + " -DDOUBLE";
params.append(options);
}

options = std::string() + " -DVALUE_TYPE=double -DDOUBLE";
else if(typeid(T) == typeid(cl_float))
options = std::string() + " -DVALUE_TYPE=float";
else if(typeid(T) == typeid(cl_uint))
options = std::string() + " -DVALUE_TYPE=uint";
else if(typeid(T) == typeid(cl_int))
options = std::string() + " -DVALUE_TYPE=int";
else if(typeid(T) == typeid(cl_ulong))
options = std::string() + " -DVALUE_TYPE=ulong -DLONG";
else if(typeid(T) == typeid(cl_long))
options = std::string() + " -DVALUE_TYPE=long -DLONG";
else
return clsparseInvalidKernelArgs;

if(control->extended_precision)
options += " -DEXTENDED_PRECISION";
params.append(options);

cl::Kernel kernel = KernelCache::get( control->queue,
"csrmv_adaptive",
Expand All @@ -70,7 +84,10 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha,
// if NVIDIA is used it does not allow to run the group size
// which is not a multiplication of group_size. Don't know if that
// have an impact on performance
cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size;
// Setting global work size to half the row block size because we are only
// using half the row blocks buffer for actual work.
// The other half is used for the extended precision reduction.
cl_uint global_work_size = ( (pCsrMatx->rowBlockSize/2) - 1 ) * group_size;
cl::NDRange local( group_size );
cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] );

Expand Down Expand Up @@ -102,15 +119,33 @@ csrmv_adaptive( const clsparse::array_base<T>& pAlpha,
const cl_uint group_size = 256;

std::string params = std::string( )
+ " -DINDEX_TYPE=uint"
+ " -DROWBITS=" + std::to_string( ROW_BITS )
+ " -DWGBITS=" + std::to_string( WG_BITS )
+ " -DBLOCKSIZE=" + std::to_string( BLKSIZE );
+ " -DWG_SIZE=" + std::to_string( group_size )
+ " -DBLOCKSIZE=" + std::to_string( BLKSIZE )
+ " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER )
+ " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR );

std::string options;
if(typeid(T) == typeid(cl_double))
{
std::string options = std::string() + " -DDOUBLE";
params.append(options);
}
options = std::string() + " -DVALUE_TYPE=double -DDOUBLE";
else if(typeid(T) == typeid(cl_float))
options = std::string() + " -DVALUE_TYPE=float";
else if(typeid(T) == typeid(cl_uint))
options = std::string() + " -DVALUE_TYPE=uint";
else if(typeid(T) == typeid(cl_int))
options = std::string() + " -DVALUE_TYPE=int";
else if(typeid(T) == typeid(cl_ulong))
options = std::string() + " -DVALUE_TYPE=ulong -DLONG";
else if(typeid(T) == typeid(cl_long))
options = std::string() + " -DVALUE_TYPE=long -DLONG";
else
return clsparseInvalidKernelArgs;

if(control->extended_precision)
options += " -DEXTENDED_PRECISION";
params.append(options);

cl::Kernel kernel = KernelCache::get( control->queue,
"csrmv_adaptive",
Expand All @@ -129,7 +164,10 @@ csrmv_adaptive( const clsparse::array_base<T>& pAlpha,
// if NVIDIA is used it does not allow to run the group size
// which is not a multiplication of group_size. Don't know if that
// have an impact on performance
cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size;
// Setting global work size to half the row block size because we are only
// using half the row blocks buffer for actual work.
// The other half is used for the extended precision reduction.
cl_uint global_work_size = ( (pCsrMatx->rowBlockSize/2) - 1 ) * group_size;
cl::NDRange local( group_size );
cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] );

Expand Down
8 changes: 6 additions & 2 deletions src/library/blas2/csrmv-vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,16 @@ csrmv_vector(const clsparseScalarPrivate* pAlpha,
if (nnz_per_row < 8) { subwave_size = 4; }
if (nnz_per_row < 4) { subwave_size = 2; }

const std::string params = std::string() +
std::string params = std::string() +
"-DINDEX_TYPE=" + OclTypeTraits<cl_int>::type
+ " -DVALUE_TYPE=" + OclTypeTraits<T>::type
+ " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
+ " -DWG_SIZE=" + std::to_string(group_size)
+ " -DWAVE_SIZE=" + std::to_string(wave_size)
+ " -DSUBWAVE_SIZE=" + std::to_string(subwave_size);

if(control->extended_precision)
params += " -DEXTENDED_PRECISION";

cl::Kernel kernel = KernelCache::get(control->queue,
"csrmv_general",
Expand Down Expand Up @@ -124,14 +126,16 @@ csrmv_vector(const clsparse::array_base<T>& pAlpha,
if (nnz_per_row < 8) { subwave_size = 4; }
if (nnz_per_row < 4) { subwave_size = 2; }

const std::string params = std::string() +
std::string params = std::string() +
"-DINDEX_TYPE=" + OclTypeTraits<cl_int>::type
+ " -DVALUE_TYPE=" + OclTypeTraits<T>::type
+ " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
+ " -DWG_SIZE=" + std::to_string(group_size)
+ " -DWAVE_SIZE=" + std::to_string(wave_size)
+ " -DSUBWAVE_SIZE=" + std::to_string(subwave_size);

if(control->extended_precision)
params += " -DEXTENDED_PRECISION";

cl::Kernel kernel = KernelCache::get(control->queue,
"csrmv_general",
Expand Down
2 changes: 2 additions & 0 deletions src/library/include/clSPARSE-private.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,7 @@
const cl_uint WG_BITS = 24;
const cl_uint ROW_BITS = 32;
const cl_uint BLKSIZE = 1024;
const cl_uint BLOCK_MULTIPLIER = 3;
const cl_uint ROWS_FOR_VECTOR = 1;

#endif
13 changes: 13 additions & 0 deletions src/library/internal/clsparse-control.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ clsparseCreateControl( cl_command_queue queue, clsparseStatus *status )
control->wavefront_size = 0;
control->max_wg_size = 0;
control->async = false;
control->extended_precision = false;

collectEnvParams( control );

Expand Down Expand Up @@ -144,6 +145,18 @@ clsparseEnableAsync( clsparseControl control, cl_bool async )
return clsparseSuccess;
}

clsparseStatus
clsparseEnableExtendedPrecision( clsparseControl control, cl_bool extended_precision )
{
    // A valid control object is required before any state can be changed.
    if( control == NULL )
        return clsparseInvalidControlObject;

    // Record whether SpM-dV kernels should emulate extended precision
    // via compensated summation; the kernel builders read this flag.
    control->extended_precision = extended_precision;

    return clsparseSuccess;
}

clsparseStatus
clsparseReleaseControl( clsparseControl control )
{
Expand Down
3 changes: 3 additions & 0 deletions src/library/internal/clsparse-control.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ struct _clsparseControl
size_t wavefront_size;
size_t max_wg_size;

// Should we attempt to perform compensated summation?
cl_bool extended_precision;

// current device max compute units;
cl_uint max_compute_units;

Expand Down
Loading