diff --git a/src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp b/src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp index ba30512..bf80607 100644 --- a/src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp +++ b/src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp @@ -25,7 +25,7 @@ template class xSpMdV: public clsparseFunc { public: - xSpMdV( PFCLSPARSETIMER sparseGetTimer, size_t profileCount, cl_device_type devType ): clsparseFunc( devType, CL_QUEUE_PROFILING_ENABLE ), gpuTimer( nullptr ), cpuTimer( nullptr ) + xSpMdV( PFCLSPARSETIMER sparseGetTimer, size_t profileCount, cl_bool extended_precision, cl_device_type devType ): clsparseFunc( devType, CL_QUEUE_PROFILING_ENABLE ), gpuTimer( nullptr ), cpuTimer( nullptr ) { // Create and initialize our timer class, if the external timer shared library loaded if( sparseGetTimer ) @@ -42,6 +42,7 @@ class xSpMdV: public clsparseFunc cpuTimerID = cpuTimer->getUniqueID( "CPU xSpMdV", 0 ); } + clsparseEnableExtendedPrecision( control, extended_precision ); clsparseEnableAsync( control, false ); } diff --git a/src/benchmarks/clsparse-bench/src/main.cpp b/src/benchmarks/clsparse-bench/src/main.cpp index b73b73b..62efa6a 100644 --- a/src/benchmarks/clsparse-bench/src/main.cpp +++ b/src/benchmarks/clsparse-bench/src/main.cpp @@ -120,14 +120,15 @@ int main( int argc, char *argv[ ] ) desc.add_options( ) ( "help,h", "produces this help message" ) ( "dirpath,d", po::value( &root_dir ), "Matrix directory" ) - ( "alpha", po::value( &alpha )->default_value( 1.0f ), "specifies the scalar alpha" ) - ( "beta", po::value( &beta )->default_value( 0.0f ), "specifies the scalar beta" ) + ( "alpha,a", po::value( &alpha )->default_value( 1.0f ), "specifies the scalar alpha" ) + ( "beta,b", po::value( &beta )->default_value( 0.0f ), "specifies the scalar beta" ) ( "rows", po::value( &rows )->default_value( 16 ), "specifies the number of rows for matrix data" ) ( "columns", po::value( &columns )->default_value( 16 ), "specifies the number of columns for matrix data" ) ( "function,f", po::value( &function )->default_value( "SpMdV" ), "Sparse functions to test. 
Options: " "SpMdV, SpMdM, CG, BiCGStab, Csr2Dense, Dense2Csr, Csr2Coo, Coo2Csr" ) ( "precision,r", po::value( &precision )->default_value( "s" ), "Options: s,d,c,z" ) - ( "profile,p", po::value( &profileCount )->default_value( 20 ), "Time and report the kernel speed (default: profiling off)" ) + ( "profile,p", po::value( &profileCount )->default_value( 20 ), "Number of times to run the desired test function" ) + ( "extended,e", po::bool_switch()->default_value(false), "Use compensated summation to improve accuracy by emulating extended precision" ) ; po::variables_map vm; @@ -160,6 +161,9 @@ int main( int argc, char *argv[ ] ) std::cerr << "Could not find the external timing library; timings disabled" << std::endl; } + cl_bool extended_precision = false; + if (vm["extended"].as()) + extended_precision = true; // Timer module discovered and loaded successfully // Initialize function pointers to call into the shared module @@ -170,9 +174,9 @@ int main( int argc, char *argv[ ] ) if( boost::iequals( function, "SpMdV" ) ) { if( precision == "s" ) - my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU ) ); + my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< float >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU ) ); else if( precision == "d" ) - my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU ) ); + my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< double >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU ) ); else { std::cerr << "Unknown spmdv precision" << std::endl; diff --git a/src/include/clSPARSE.h b/src/include/clSPARSE.h index 3d5c86a..38c2dd1 100644 --- a/src/include/clSPARSE.h +++ b/src/include/clSPARSE.h @@ -90,6 +90,9 @@ extern "C" { CLSPARSE_EXPORT clsparseStatus clsparseEnableAsync( clsparseControl control, cl_bool async ); + //enable/disable the use of compensated summation + CLSPARSE_EXPORT clsparseStatus + clsparseEnableExtendedPrecision( clsparseControl control, cl_bool async ); //setup events to sync //TODO:: NOT WORKING! NDRange throws Failure diff --git a/src/library/blas2/clsparse-csrmv.hpp b/src/library/blas2/clsparse-csrmv.hpp index 9a379cc..17dbd11 100644 --- a/src/library/blas2/clsparse-csrmv.hpp +++ b/src/library/blas2/clsparse-csrmv.hpp @@ -45,15 +45,11 @@ csrmv (const clsparseScalarPrivate *pAlpha, return clsparseStructInvalid; } - // We have problems with failing test cases with csrmv_adaptive on double precision - // fall back to csrmv_vector - if( typeid( T ) == typeid( cl_double ) ) - { - return csrmv_vector( pAlpha, pCsrMatx, pX, pBeta, pY, control ); - } + // Use this for csrmv_general instead of adaptive. + //return csrmv_vector( pAlpha, pCsrMatx, pX, pBeta, pY, control ); - // Call adaptive CSR kernels - return csrmv_adaptive( pAlpha, pCsrMatx, pX, pBeta, pY, control ); + // Call adaptive CSR kernels + return csrmv_adaptive( pAlpha, pCsrMatx, pX, pBeta, pY, control ); } } @@ -83,12 +79,8 @@ csrmv (const clsparse::array_base& pAlpha, return clsparseStructInvalid; } - // We have problems with failing test cases with csrmv_adaptive on double precision - // fall back to csrmv_vector - if( typeid( T ) == typeid( cl_double ) ) - { - return csrmv_vector( pAlpha, pCsrMatx, pX, pBeta, pY, control ); - } + // Use this for csrmv_general instead of adaptive. 
+ //return csrmv_vector( pAlpha, pCsrMatx, pX, pBeta, pY, control ); // Call adaptive CSR kernels return csrmv_adaptive( pAlpha, pCsrMatx, pX, pBeta, pY, control ); diff --git a/src/library/blas2/csrmv-adaptive.hpp b/src/library/blas2/csrmv-adaptive.hpp index d043b3c..95ff105 100644 --- a/src/library/blas2/csrmv-adaptive.hpp +++ b/src/library/blas2/csrmv-adaptive.hpp @@ -39,19 +39,33 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha, const cl_uint group_size = 256; std::string params = std::string( ) + + " -DINDEX_TYPE=uint" + " -DROWBITS=" + std::to_string( ROW_BITS ) + " -DWGBITS=" + std::to_string( WG_BITS ) - + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ); -#ifdef DOUBLE - buildFlags += " -DDOUBLE"; -#endif + + " -DWG_SIZE=" + std::to_string( group_size ) + + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ) + + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER ) + + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR ); + std::string options; if(typeid(T) == typeid(cl_double)) - { - std::string options = std::string() + " -DDOUBLE"; - params.append(options); - } - + options = std::string() + " -DVALUE_TYPE=double -DDOUBLE"; + else if(typeid(T) == typeid(cl_float)) + options = std::string() + " -DVALUE_TYPE=float"; + else if(typeid(T) == typeid(cl_uint)) + options = std::string() + " -DVALUE_TYPE=uint"; + else if(typeid(T) == typeid(cl_int)) + options = std::string() + " -DVALUE_TYPE=int"; + else if(typeid(T) == typeid(cl_ulong)) + options = std::string() + " -DVALUE_TYPE=ulong -DLONG"; + else if(typeid(T) == typeid(cl_long)) + options = std::string() + " -DVALUE_TYPE=long -DLONG"; + else + return clsparseInvalidKernelArgs; + + if(control->extended_precision) + options += " -DEXTENDED_PRECISION"; + params.append(options); cl::Kernel kernel = KernelCache::get( control->queue, "csrmv_adaptive", @@ -70,7 +84,10 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha, // if NVIDIA is used it does not allow to run the group size // which is not a multiplication of group_size. Don't know if that // have an impact on performance - cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size; + // Setting global work size to half the row block size because we are only + // using half the row blocks buffer for actual work. + // The other half is used for the extended precision reduction. + cl_uint global_work_size = ( (pCsrMatx->rowBlockSize/2) - 1 ) * group_size; cl::NDRange local( group_size ); cl::NDRange global( global_work_size > local[ 0 ] ? 
global_work_size : local[ 0 ] ); @@ -102,15 +119,33 @@ csrmv_adaptive( const clsparse::array_base& pAlpha, const cl_uint group_size = 256; std::string params = std::string( ) + + " -DINDEX_TYPE=uint" + " -DROWBITS=" + std::to_string( ROW_BITS ) + " -DWGBITS=" + std::to_string( WG_BITS ) - + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ); + + " -DWG_SIZE=" + std::to_string( group_size ) + + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ) + + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER ) + + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR ); + std::string options; if(typeid(T) == typeid(cl_double)) - { - std::string options = std::string() + " -DDOUBLE"; - params.append(options); - } + options = std::string() + " -DVALUE_TYPE=double -DDOUBLE"; + else if(typeid(T) == typeid(cl_float)) + options = std::string() + " -DVALUE_TYPE=float"; + else if(typeid(T) == typeid(cl_uint)) + options = std::string() + " -DVALUE_TYPE=uint"; + else if(typeid(T) == typeid(cl_int)) + options = std::string() + " -DVALUE_TYPE=int"; + else if(typeid(T) == typeid(cl_ulong)) + options = std::string() + " -DVALUE_TYPE=ulong -DLONG"; + else if(typeid(T) == typeid(cl_long)) + options = std::string() + " -DVALUE_TYPE=long -DLONG"; + else + return clsparseInvalidKernelArgs; + + if(control->extended_precision) + options += " -DEXTENDED_PRECISION"; + params.append(options); cl::Kernel kernel = KernelCache::get( control->queue, "csrmv_adaptive", @@ -129,7 +164,10 @@ csrmv_adaptive( const clsparse::array_base& pAlpha, // if NVIDIA is used it does not allow to run the group size // which is not a multiplication of group_size. Don't know if that // have an impact on performance - cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size; + // Setting global work size to half the row block size because we are only + // using half the row blocks buffer for actual work. + // The other half is used for the extended precision reduction. + cl_uint global_work_size = ( (pCsrMatx->rowBlockSize/2) - 1 ) * group_size; cl::NDRange local( group_size ); cl::NDRange global( global_work_size > local[ 0 ] ? 
global_work_size : local[ 0 ] ); diff --git a/src/library/blas2/csrmv-vector.hpp b/src/library/blas2/csrmv-vector.hpp index 2b8b637..406933f 100644 --- a/src/library/blas2/csrmv-vector.hpp +++ b/src/library/blas2/csrmv-vector.hpp @@ -48,7 +48,7 @@ csrmv_vector(const clsparseScalarPrivate* pAlpha, if (nnz_per_row < 8) { subwave_size = 4; } if (nnz_per_row < 4) { subwave_size = 2; } - const std::string params = std::string() + + std::string params = std::string() + "-DINDEX_TYPE=" + OclTypeTraits::type + " -DVALUE_TYPE=" + OclTypeTraits::type + " -DSIZE_TYPE=" + OclTypeTraits::type @@ -56,6 +56,8 @@ csrmv_vector(const clsparseScalarPrivate* pAlpha, + " -DWAVE_SIZE=" + std::to_string(wave_size) + " -DSUBWAVE_SIZE=" + std::to_string(subwave_size); + if(control->extended_precision) + params += " -DEXTENDED_PRECISION"; cl::Kernel kernel = KernelCache::get(control->queue, "csrmv_general", @@ -124,7 +126,7 @@ csrmv_vector(const clsparse::array_base& pAlpha, if (nnz_per_row < 8) { subwave_size = 4; } if (nnz_per_row < 4) { subwave_size = 2; } - const std::string params = std::string() + + std::string params = std::string() + "-DINDEX_TYPE=" + OclTypeTraits::type + " -DVALUE_TYPE=" + OclTypeTraits::type + " -DSIZE_TYPE=" + OclTypeTraits::type @@ -132,6 +134,8 @@ csrmv_vector(const clsparse::array_base& pAlpha, + " -DWAVE_SIZE=" + std::to_string(wave_size) + " -DSUBWAVE_SIZE=" + std::to_string(subwave_size); + if(control->extended_precision) + params += " -DEXTENDED_PRECISION"; cl::Kernel kernel = KernelCache::get(control->queue, "csrmv_general", diff --git a/src/library/include/clSPARSE-private.hpp b/src/library/include/clSPARSE-private.hpp index 73f71ef..89df7cc 100644 --- a/src/library/include/clSPARSE-private.hpp +++ b/src/library/include/clSPARSE-private.hpp @@ -42,5 +42,7 @@ const cl_uint WG_BITS = 24; const cl_uint ROW_BITS = 32; const cl_uint BLKSIZE = 1024; +const cl_uint BLOCK_MULTIPLIER = 3; +const cl_uint ROWS_FOR_VECTOR = 1; #endif diff --git a/src/library/internal/clsparse-control.cpp b/src/library/internal/clsparse-control.cpp index 36234b1..e3a6a25 100644 --- a/src/library/internal/clsparse-control.cpp +++ b/src/library/internal/clsparse-control.cpp @@ -104,6 +104,7 @@ clsparseCreateControl( cl_command_queue queue, clsparseStatus *status ) control->wavefront_size = 0; control->max_wg_size = 0; control->async = false; + control->extended_precision = false; collectEnvParams( control ); @@ -144,6 +145,18 @@ clsparseEnableAsync( clsparseControl control, cl_bool async ) return clsparseSuccess; } +clsparseStatus +clsparseEnableExtendedPrecision( clsparseControl control, cl_bool extended_precision ) +{ + if( control == NULL ) + { + return clsparseInvalidControlObject; + } + + control->extended_precision = extended_precision; + return clsparseSuccess; +} + clsparseStatus clsparseReleaseControl( clsparseControl control ) { diff --git a/src/library/internal/clsparse-control.hpp b/src/library/internal/clsparse-control.hpp index 4b44602..5a722c7 100644 --- a/src/library/internal/clsparse-control.hpp +++ b/src/library/internal/clsparse-control.hpp @@ -54,6 +54,9 @@ struct _clsparseControl size_t wavefront_size; size_t max_wg_size; + // Should we attempt to perform compensated summation? 
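Both csrmv paths now build their OpenCL programs from the same kind of -D option string, with -DEXTENDED_PRECISION appended when the control object requests it. A condensed, hypothetical helper showing the shape of that logic (the function name is invented for illustration; the integer types are elided):

    // Editorial sketch: how the -D compile options are assembled for the CSR kernels.
    // extendedPrecisionEnabled stands in for control->extended_precision.
    #include <string>
    #include <typeinfo>
    #include <CL/cl.h>

    template< typename T >
    std::string csrmvBuildOptions( bool extendedPrecisionEnabled )
    {
        std::string options;
        if( typeid( T ) == typeid( cl_double ) )
            options = " -DVALUE_TYPE=double -DDOUBLE";
        else if( typeid( T ) == typeid( cl_float ) )
            options = " -DVALUE_TYPE=float";
        else
            options = " -DVALUE_TYPE=uint";   // integer variants elided for brevity

        if( extendedPrecisionEnabled )
            options += " -DEXTENDED_PRECISION";
        return options;
    }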
+ cl_bool extended_precision; + // current device max compute units; cl_uint max_compute_units; diff --git a/src/library/internal/computeRowBlocks.hpp b/src/library/internal/computeRowBlocks.hpp index eeb8bf4..6271c1a 100644 --- a/src/library/internal/computeRowBlocks.hpp +++ b/src/library/internal/computeRowBlocks.hpp @@ -21,12 +21,56 @@ #include #include +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +static inline unsigned int flp2(unsigned int x) +{ + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); + return x - (x >> 1); +} + +// Short rows in CSR-Adaptive are batched together into a single row block. +// If there are a relatively small number of these, then we choose to do +// a horizontal reduction (groups of threads all reduce the same row). +// If there are many threads (e.g. more threads than the maximum size +// of our workgroup) then we choose to have each thread serially reduce +// the row. +// This function calculates the number of threads that could team up +// to reduce these groups of rows. For instance, if you have a +// workgroup size of 256 and 4 rows, you could have 64 threads +// working on each row. If you have 5 rows, only 32 threads could +// reliably work on each row because our reduction assumes power-of-2. +template< typename rowBlockType > +static inline rowBlockType numThreadsForReduction(rowBlockType num_rows) +{ +#if defined(__INTEL_COMPILER) + return 256 >> (_bit_scan_reverse(num_rows-1)+1); +#elif (defined(__clang__) && __has_builtin(__builtin_clz)) || \ + !defined(__clang) && \ + defined(__GNUG__) && ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 30202) + return (256 >> (8*sizeof(int)-__builtin_clz(num_rows-1))); +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) + int bit_returned; + _BitScanReverse(&bit_returned, (num_rows-1)); + return 256 >> (bit_returned+1); +#else + return flp2(256/num_rows); +#endif +} + // The row blocks buffer holds a packed set of information used to inform each // workgroup about how to do its work: // // |6666 5555 5555 5544 4444 4444 3333 3333|3322 2222|2222 1111 1111 1100 0000 0000| // |3210 9876 5432 1098 7654 3210 9876 5432|1098 7654|3210 9876 5432 1098 7654 3210| -// |------------Row Information------------|----flag^|---WG ID within a long row---| +// |------------Row Information------------|--------^|---WG ID within a long row---| +// | | flag/|or # reduce threads for short| // // The upper 32 bits of each rowBlock entry tell the workgroup the ID of the first // row it will be working on. When one workgroup calculates multiple rows, this @@ -34,6 +78,9 @@ // The lower 24 bits are used whenever multiple workgroups calculate a single long // row. This tells each workgroup its ID within that row, so it knows which // part of the row to operate on. +// Alternately, on "short" row blocks, the lower bits are used to communicate +// the number of threads that should be used for the reduction. Pre-calculating +// this on the CPU-side results in a noticable performance uplift on many matrices. // Bit 24 is a flag bit used so that the multiple WGs calculating a long row can // know when the first workgroup for that row has finished initializing the output // value. 
While this bit is the same as the first workgroup's flag bit, this @@ -41,12 +88,17 @@ // rowBlockType is currently instantiated as ulong template< typename rowBlockType > -void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* rowDelimiters, int nRows, int blkSize ) +void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* rowDelimiters, int nRows, int blkSize, int blkMultiplier, int rows_for_vector, bool allocate_row_blocks = true ) { - rowBlockType* rowBlocksBase = rowBlocks; + rowBlockType* rowBlocksBase; + int total_row_blocks = 1; // Start at one because of rowBlock[0] - *rowBlocks = 0; - rowBlocks++; + if (allocate_row_blocks) + { + rowBlocksBase = rowBlocks; + *rowBlocks = 0; + rowBlocks++; + } rowBlockType sum = 0; rowBlockType i, last_i = 0; @@ -57,64 +109,151 @@ void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* return; } + int consecutive_long_rows = 0; for( i = 1; i <= nRows; i++ ) { - sum += ( rowDelimiters[ i ] - rowDelimiters[ i - 1 ] ); + int row_length = ( rowDelimiters[ i ] - rowDelimiters[ i - 1 ] ); + sum += row_length; - // more than one row results in non-zero elements to be greater than blockSize - // This is csr-stream case; bottom WG_BITS == 0 - if( ( i - last_i > 1 ) && sum > blkSize ) + // The following section of code calculates whether you're moving between + // a series of "short" rows and a series of "long" rows. + // This is because the reduction in CSR-Adaptive likes things to be + // roughly the same length. Long rows can be reduced horizontally. + // Short rows can be reduced one-thread-per-row. Try not to mix them. + if ( row_length > 128 ) + consecutive_long_rows++; + else if ( consecutive_long_rows > 0 ) { - *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); - rowBlocks++; - i--; - last_i = i; - sum = 0; + // If it turns out we WERE in a long-row region, cut if off now. + if (row_length < 32) // Now we're in a short-row region + consecutive_long_rows = -1; + else + consecutive_long_rows++; } + // If you just entered into a "long" row from a series of short rows, + // then we need to make sure we cut off those short rows. Put them in + // their own workgroup. + if ( consecutive_long_rows == 1 ) + { + // Assuming there *was* a previous workgroup. If not, nothing to do here. + if( i - last_i > 1 ) + { + if (allocate_row_blocks) + { + *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); + // If this row fits into CSR-Stream, calculate how many rows + // can be used to do a parallel reduction. + // Fill in the low-order bits with the numThreadsForRed + if (((i-1) - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction((i - 1) - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i-1; + sum = row_length; + } + } + else if (consecutive_long_rows == -1) + { + // We see the first short row after some long ones that + // didn't previously fill up a row block. + if (allocate_row_blocks) + { + *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); + if (((i-1) - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction((i - 1) - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i-1; + sum = row_length; + consecutive_long_rows = 0; + } + + // Now, what's up with this row? What did it do? 
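To make the encoding described in the comment block above concrete, the sketch below packs and unpacks a rowBlocks entry using the documented layout (upper ROW_BITS = 32 bits for the first row, bit 24 as the flag, low WG_BITS = 24 bits for either the workgroup-within-row ID or the reduction-thread count) and includes a portable stand-in for numThreadsForReduction. Editorial illustration only; the helper names are not library code.

    // Editorial sketch: the rowBlocks entry layout and the reduction-thread count,
    // modelled on the constants in clSPARSE-private.hpp (ROW_BITS=32, WG_BITS=24).
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    static const unsigned ROW_BITS = 32;
    static const unsigned WG_BITS  = 24;

    // Portable equivalent of numThreadsForReduction(): largest power of two <= 256/num_rows.
    static unsigned numThreadsForReduction( unsigned num_rows )
    {
        unsigned x = 256 / num_rows;
        x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16;
        return x - (x >> 1);
    }

    static uint64_t packRowBlock( uint64_t first_row, uint64_t low_bits )
    {
        return ( first_row << (64 - ROW_BITS) ) | low_bits;
    }

    static uint64_t rowOf( uint64_t rb )     { return rb >> (64 - ROW_BITS); }
    static uint64_t lowBitsOf( uint64_t rb ) { return rb & ((1ULL << WG_BITS) - 1ULL); }
    static bool     flagOf( uint64_t rb )    { return (rb >> WG_BITS) & 1ULL; }

    int main( )
    {
        // 256 work items sharing 4 short rows -> 64 threads team up per row;
        // 5 rows -> 32 threads, because the reduction needs a power of two.
        assert( numThreadsForReduction( 4 ) == 64 );
        assert( numThreadsForReduction( 5 ) == 32 );

        // A CSR-Stream block starting at row 100 that batches 5 short rows:
        const uint64_t rb = packRowBlock( 100, numThreadsForReduction( 5 ) );
        std::printf( "row=%llu lowbits=%llu flag=%d\n",
                     (unsigned long long)rowOf( rb ),
                     (unsigned long long)lowBitsOf( rb ),
                     (int)flagOf( rb ) );
        return 0;
    }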
+ // exactly one row results in non-zero elements to be greater than blockSize // This is csr-vector case; bottom WG_BITS == workgroup ID - else if( ( i - last_i == 1 ) && sum > blkSize ) + if( ( i - last_i == 1 ) && sum > blkSize ) { - int numWGReq = static_cast< int >( ceil( (double)sum / blkSize ) ); + int numWGReq = static_cast< int >( ceil( (double)row_length / (blkMultiplier*blkSize) ) ); // Check to ensure #workgroups can fit in WG_BITS bits, if not // then the last workgroup will do all the remaining work numWGReq = ( numWGReq < (int)pow( 2, WG_BITS ) ) ? numWGReq : (int)pow( 2, WG_BITS ); - for( int w = 1; w < numWGReq; w++ ) + if (allocate_row_blocks) { - *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); - *rowBlocks |= static_cast< rowBlockType >( w ); + for( int w = 1; w < numWGReq; w++ ) + { + *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); + *rowBlocks |= static_cast< rowBlockType >( w ); + rowBlocks++; + } + *rowBlocks = ( i << (64 - ROW_BITS) ); rowBlocks++; } - - *rowBlocks = ( i << (64 -ROW_BITS) ); - rowBlocks++; - + total_row_blocks += numWGReq; last_i = i; sum = 0; + consecutive_long_rows = 0; } - // sum of non-zero elements is exactly equal to blockSize - // This is csr-stream case; bottom WG_BITS == 0 + // more than one row results in non-zero elements to be greater than blockSize + // This is csr-stream case; bottom WG_BITS = number of parallel reduction threads + else if( ( i - last_i > 1 ) && sum > blkSize ) + { + i--; // This row won't fit, so back off one. + if (allocate_row_blocks) + { + *rowBlocks = ( i << (64 - ROW_BITS) ); + if ((i - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i; + sum = 0; + consecutive_long_rows = 0; + } + // This is csr-stream case; bottom WG_BITS = number of parallel reduction threads else if( sum == blkSize ) { - *rowBlocks = ( i << (64 - ROW_BITS) ); - rowBlocks++; + if (allocate_row_blocks) + { + *rowBlocks = ( i << (64 - ROW_BITS) ); + if ((i - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; last_i = i; sum = 0; + consecutive_long_rows = 0; } - } - *rowBlocks = ( static_cast< rowBlockType >( nRows ) << (64 - ROW_BITS) ); - rowBlocks++; - - size_t dist = std::distance( rowBlocksBase, rowBlocks ); - assert( dist <= rowBlockSize ); + // If we didn't fill a row block with the last row, make sure we don't lose it. + if ( allocate_row_blocks && (*(rowBlocks-1) >> (64 - ROW_BITS)) != static_cast< rowBlockType>(nRows) ) + { + *rowBlocks = ( static_cast< rowBlockType >( nRows ) << (64 - ROW_BITS) ); + if ((nRows - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; - // Update the size of rowBlocks to reflect the actual amount of memory used - rowBlockSize = dist; + if (allocate_row_blocks) + { + size_t dist = std::distance( rowBlocksBase, rowBlocks ); + assert( (2 * dist) <= rowBlockSize ); + // Update the size of rowBlocks to reflect the actual amount of memory used + // We're multiplying the size by two because the extended precision form of + // CSR-Adaptive requires more space for the final global reduction. 
+ rowBlockSize = 2 * dist; + } + else + rowBlockSize = 2 * total_row_blocks; } #endif diff --git a/src/library/io/mm-reader.cpp b/src/library/io/mm-reader.cpp index ae78dae..ea4cbeb 100644 --- a/src/library/io/mm-reader.cpp +++ b/src/library/io/mm-reader.cpp @@ -596,9 +596,10 @@ clsparseSCsrMatrixfromFile(clsparseCsrMatrix* csrMatx, const char* filePath, cls if( pCsrMatx->rowBlockSize ) { clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); + ComputeRowBlocks( (cl_ulong*)NULL, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, false ); cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); - ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE ); + ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, true ); } return clsparseSuccess; @@ -666,9 +667,10 @@ clsparseDCsrMatrixfromFile(clsparseCsrMatrix* csrMatx, const char* filePath, cls if( pCsrMatx->rowBlockSize ) { clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); + ComputeRowBlocks( (cl_ulong*)NULL, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, false ); cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); - ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE ); + ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, true ); } return clsparseSuccess; @@ -734,9 +736,10 @@ clsparseDCsrMatrixfromFile(clsparseCsrMatrix* csrMatx, const char* filePath, cls // if( pCsrMatx->rowBlockSize ) // { // clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); +// ComputeRowBlocks( (cl_ulong*)NULL, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, false ); // cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); -// ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE ); +// ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, true ); // } // return clsparseSuccess; diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index b5c6935..2a1f04e 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -15,43 +15,117 @@ R"( * limitations under the License. * ************************************************************************ */ -#define WGSIZE 256 - -#ifdef DOUBLE - #define FPTYPE double - +// No reason to include these beyond version 1.2, where double is not an extension. +#if defined(DOUBLE) && __OPENCL_VERSION__ < CL_VERSION_1_2 #ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable + #pragma OPENCL EXTENSION cl_khr_fp64 : enable #elif defined(cl_amd_fp64) - #pragma OPENCL EXTENSION cl_amd_fp64 : enable + #pragma OPENCL EXTENSION cl_amd_fp64 : enable #else - #error "Double precision floating point not supported by OpenCL implementation." 
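The mm-reader changes above use ComputeRowBlocks in two passes: a sizing pass with allocate_row_blocks = false (and a NULL pointer), then a fill pass into the mapped buffer; the size that comes back is already doubled to leave room for the extended-precision error slots. A hedged host-side sketch of that protocol against a plain std::vector, assuming the library's internal headers (computeRowBlocks.hpp and the constants it depends on) are on the include path:

    // Editorial sketch: sizing pass followed by a fill pass, as mm-reader.cpp now does.
    // The numeric arguments mirror clSPARSE-private.hpp
    // (BLKSIZE=1024, BLOCK_MULTIPLIER=3, ROWS_FOR_VECTOR=1).
    #include <vector>
    #include <cstddef>
    #include "computeRowBlocks.hpp"

    void buildRowBlocks( const int* rowOffsets, int nRows,
                         std::vector<unsigned long long>& rowBlocks )
    {
        size_t rowBlockSize = 0;

        // Pass 1: only count. The returned size already includes the second half
        // of the buffer reserved for the extended-precision global reduction.
        ComputeRowBlocks( (unsigned long long*)nullptr, rowBlockSize, rowOffsets,
                          nRows, 1024, 3, 1, false );

        rowBlocks.assign( rowBlockSize, 0 );

        // Pass 2: actually write the packed row-block entries.
        ComputeRowBlocks( rowBlocks.data( ), rowBlockSize, rowOffsets,
                          nRows, 1024, 3, 1, true );
    }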
+ #error "Double precision floating point not supported by OpenCL implementation." #endif +#endif + +#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) + #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable + #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable + #define ATOM64 +#endif + +#if __OPENCL_VERSION__ > CL_VERSION_1_0 + #define ATOM32 +#elif defined(cl_khr_global_int32_base_atomics) && defined(cl_khr_global_int32_extended_atomics) + #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : require + #pragma OPENCL_EXTENSION cl_khr_global_int32_extended_atomics : require + #define ATOM32 #else - #define FPTYPE float + #error "Required integer atomics not supported by this OpenCL implemenation." #endif -#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable -#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable +#ifndef INDEX_TYPE +#error "INDEX_TYPE undefined!" +#endif -int lowerPowerOf2(int num) -{ - num --; +#ifndef VALUE_TYPE +#error "VALUE_TYPE undefined!" +#endif + +#ifndef ROWBITS +#error "ROWBITS undefined!" +#endif + +#ifndef WGBITS +#error "WGBITS undefined!" +#endif - num |= num >> 1; - num |= num >> 2; - num |= num >> 4; - num |= num >> 8; - num |= num >> 16; +#ifndef WG_SIZE +#error "WG_SIZE undefined!" +#endif + +#ifndef BLOCKSIZE +#error "BLOCKSIZE undefined!" +#endif - num ++; +#ifndef BLOCK_MULTIPLIER +#error "BLOCK_MULTIPLIER undefined!" +#endif - num >>= 1; +#ifndef ROWS_FOR_VECTOR +#error "ROWS_FOR_VECTOR undefined!" +#endif + +// Internal functions to wrap atomics, depending on if we support 64-bit +// atomics or not. Helps keep the code clean in the other parts of the code. +// All of the 32-bit atomics are built assuming we're on a little endian architecture. +inline unsigned long clsparse_atomic_xor(__global unsigned long * restrict const ptr, + const unsigned long xor_val) +{ +#ifdef ATOM64 + return atom_xor(ptr, xor_val); +#else + return atomic_max((__global unsigned int*)ptr, (unsigned int)xor_val); +#endif +} - return num; +inline unsigned long clsparse_atomic_max(__global unsigned long * restrict const ptr, + const unsigned long compare) +{ +#ifdef ATOM64 + return atom_max(ptr, compare); +#else + return atomic_max((__global unsigned int*)ptr, (unsigned int)compare); +#endif } -void atomic_add_float (global FPTYPE *ptr, FPTYPE temp) +inline unsigned long clsparse_atomic_inc(__global unsigned long * restrict const inc_this) +{ +#ifdef ATOM64 + return atom_inc(inc_this); +#else + return atomic_inc((__global unsigned int *)inc_this); +#endif +} + +inline unsigned long clsparse_atomic_cmpxchg(__global unsigned long * restrict const ptr, + const unsigned long compare, + const unsigned long val) +{ +#ifdef DOUBLE + #ifdef ATOM64 + return atom_cmpxchg(ptr, compare, val); + #else + // Should never run this. Don't use a path that requires cmpxchg for doubles + // if you don't support 64-bit atomics. 
+ return compare; + #endif +#else + return atomic_cmpxchg((__global unsigned int*)ptr, compare, val); +#endif +} + +VALUE_TYPE atomic_add_float_extended( global VALUE_TYPE * restrict const ptr, + const VALUE_TYPE temp, + VALUE_TYPE * restrict const old_sum ) { #ifdef DOUBLE unsigned long newVal; @@ -60,8 +134,10 @@ void atomic_add_float (global FPTYPE *ptr, FPTYPE temp) { prevVal = as_ulong(*ptr); newVal = as_ulong(temp + *ptr); - } while (atom_cmpxchg((global unsigned long *)ptr, prevVal, newVal) != prevVal); - + } while (clsparse_atomic_cmpxchg((__global unsigned long *)ptr, prevVal, newVal) != prevVal); + if (old_sum != 0) + *old_sum = as_double(prevVal); + return as_double(newVal); #else unsigned int newVal; unsigned int prevVal; @@ -69,32 +145,191 @@ void atomic_add_float (global FPTYPE *ptr, FPTYPE temp) { prevVal = as_uint(*ptr); newVal = as_uint(temp + *ptr); - } while (atomic_cmpxchg((global unsigned int *)ptr, prevVal, newVal) != prevVal); + } while (clsparse_atomic_cmpxchg((__global unsigned long *)ptr, prevVal, newVal) != prevVal); + if (old_sum != 0) + *old_sum = as_float(prevVal); + return as_float(newVal); #endif } +void atomic_add_float( global void * const ptr, const VALUE_TYPE temp ) +{ + atomic_add_float_extended(ptr, temp, 0); +} + +// Knuth's Two-Sum algorithm, which allows us to add together two floating +// point numbers and exactly tranform the answer into a sum and a +// rounding error. +// Inputs: x and y, the two inputs to be aded together. +// In/Out: *sumk_err, which is incremented (by reference) -- holds the +// error value as a result of the 2sum calculation. +// Returns: The non-corrected sum of inputs x and y. +VALUE_TYPE two_sum( VALUE_TYPE x, + VALUE_TYPE y, + VALUE_TYPE * restrict const sumk_err ) +{ + const VALUE_TYPE sumk_s = x + y; +#ifdef EXTENDED_PRECISION + // We use this 2Sum algorithm to perform a compensated summation, + // which can reduce the cummulative rounding errors in our SpMV summation. + // Our compensated sumation is based on the SumK algorithm (with K==2) from + // Ogita, Rump, and Oishi, "Accurate Sum and Dot Product" in + // SIAM J. on Scientific Computing 26(6) pp 1955-1988, Jun. 2005. + + // 2Sum can be done in 6 FLOPs without a branch. However, calculating + // double precision is slower than single precision on every existing GPU. + // As such, replacing 2Sum with Fast2Sum when using DPFP results in better + // performance (~5% higher total). This is true even though we must ensure + // that |a| > |b|. Branch divergence is better than the DPFP slowdown. + // Thus, for DPFP, our compensated summation algorithm is actually described + // by both Pichat and Neumaier in "Correction d'une somme en arithmetique + // a virgule flottante" (J. Numerische Mathematik 19(5) pp. 400-406, 1972) + // and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher + // Summen (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51, + // 1974), respectively. + if (fabs(x) < fabs(y)) + { + const VALUE_TYPE swap = x; + x = y; + y = swap; + } + (*sumk_err) += (y - (sumk_s - x)); + // Original 6 FLOP 2Sum algorithm. + //const VALUE_TYPE bp = sumk_s - x; + //(*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); +#endif + return sumk_s; +} + +// Performs (x_vals * x_vec) + y using an FMA. +// Ideally, we would perform an error-free transformation here and return the +// appropriate error. However, the EFT of an FMA is very expensive. 
As such, +// if we are in EXTENDED_PRECISION mode, this function devolves into two_sum +// with x_vals and x_vec inputs multiplied separately from the compensated add. +VALUE_TYPE two_fma( const VALUE_TYPE x_vals, + const VALUE_TYPE x_vec, + VALUE_TYPE y, + VALUE_TYPE * restrict const sumk_err ) +{ +#ifdef EXTENDED_PRECISION + VALUE_TYPE x = x_vals * x_vec; + const VALUE_TYPE sumk_s = x + y; + if (fabs(x) < fabs(y)) + { + const VALUE_TYPE swap = x; + x = y; + y = swap; + } + (*sumk_err) += (y - (sumk_s - x)); + // 2Sum in the FMA case. Poor performance on low-DPFP GPUs. + //const VALUE_TYPE bp = fma(-x_vals, x_vec, sumk_s); + //(*sumk_err) += (fma(x_vals, x_vec, -(sumk_s - bp)) + (y - bp)); + return sumk_s; +#else + return fma(x_vals, x_vec, y); +#endif +} + +VALUE_TYPE atomic_two_sum_float( global VALUE_TYPE * restrict const x_ptr, + VALUE_TYPE y, + VALUE_TYPE * restrict const sumk_err ) +{ + // Have to wait until the return from the atomic op to know what X was. + VALUE_TYPE sumk_s = 0.; +#ifdef EXTENDED_PRECISION + VALUE_TYPE x; + sumk_s = atomic_add_float_extended(x_ptr, y, &x); + if (fabs(x) < fabs(y)) + { + const VALUE_TYPE swap = x; + x = y; + y = swap; + } + (*sumk_err) += (y - (sumk_s - x)); +#else + atomic_add_float(x_ptr, y); +#endif + return sumk_s; +} + +// A method of doing the final reduction in CSR-Vector without having to copy +// and paste it a bunch of times. +// The EXTENDED_PRECISION section is done as part of a compensated summation +// meant to reduce cummulative rounding errors. This can become a problem on +// GPUs because the reduction order is different than what would be used on a +// CPU. It is based on the PSumK algorithm (with K==2) from Yamanaka, Ogita, +// Rump, and Oishi, "A Parallel Algorithm of Accurate Dot Product," in +// J. Parallel Computing, 34(6-8), pp. 392-410, Jul. 2008. +// A version of this method is also used in csrmv_general. +// Inputs: cur_sum: the input from which our sum starts +// err: the current running cascade error for this final summation +// partial: the local memory which holds the values to sum +// (we eventually use it to pass down temp. err vals as well) +// lid: local ID of the work item calling this function. +// thread_lane: The lane within this thread's row. +// max_size: This parallel summation method operates in multiple rounds +// to do a parallel reduction. This is the length of each row. +// reduc_size: As you reduce data down, this tells you how far away +// you will grab a value and add to your own local sum value. +VALUE_TYPE sum2_reduce( VALUE_TYPE cur_sum, + VALUE_TYPE * restrict const err, + __local VALUE_TYPE * restrict const partial, + const unsigned int lid, + const unsigned int thread_lane, + const unsigned int max_size, + const unsigned int reduc_size ) +{ + if ( max_size > reduc_size ) + { +#ifdef EXTENDED_PRECISION + const unsigned int partial_dest = lid + reduc_size; + if (thread_lane < reduc_size) + cur_sum = two_sum(cur_sum, partial[partial_dest], err); + // We reuse the LDS entries to move the error values down into lower + // threads. This saves LDS space, allowing higher occupancy, but requires + // more barriers, which can reduce performance. + barrier(CLK_LOCAL_MEM_FENCE); + // Have all of those upper threads pass their temporary errors + // into a location that the lower threads can read. + partial[lid] = (thread_lane >= reduc_size) ? *err : partial[lid]; + barrier(CLK_LOCAL_MEM_FENCE); + if (thread_lane < reduc_size) // Add those errors in. 
+ { + *err += partial[partial_dest]; + partial[lid] = cur_sum; + } +#else + cur_sum += partial[lid + reduc_size]; + barrier( CLK_LOCAL_MEM_FENCE ); + partial[lid] = cur_sum; +#endif + } + return cur_sum; +} + __kernel void -csrmv_adaptive(__global const FPTYPE * restrict vals, - __global const int * restrict cols, - __global const int * restrict rowPtrs, - __global const FPTYPE * restrict vec, - __global FPTYPE * restrict out, - __global unsigned long * restrict rowBlocks, - __global FPTYPE * pAlpha, - __global FPTYPE * pBeta) +csrmv_adaptive(__global const VALUE_TYPE * restrict const vals, + __global const unsigned int * restrict const cols, + __global const unsigned int * restrict const rowPtrs, + __global const VALUE_TYPE * restrict const vec, + __global VALUE_TYPE * restrict const out, + __global unsigned long * restrict const rowBlocks, + __global const VALUE_TYPE * restrict const pAlpha, + __global const VALUE_TYPE * restrict const pBeta) { - __local FPTYPE partialSums[BLOCKSIZE]; - size_t gid = get_group_id(0); - size_t lid = get_local_id(0); - const FPTYPE alpha = *pAlpha; - const FPTYPE beta = *pBeta; + __local VALUE_TYPE partialSums[BLOCKSIZE]; + const unsigned int gid = get_group_id(0); + const unsigned int lid = get_local_id(0); + const VALUE_TYPE alpha = *pAlpha; + const VALUE_TYPE beta = *pBeta; // The row blocks buffer holds a packed set of information used to inform each // workgroup about how to do its work: // // |6666 5555 5555 5544 4444 4444 3333 3333|3322 2222|2222 1111 1111 1100 0000 0000| // |3210 9876 5432 1098 7654 3210 9876 5432|1098 7654|3210 9876 5432 1098 7654 3210| - // |------------Row Information------------|----flag^|---WG ID within a long row---| + // |------------Row Information------------|--------^|---WG ID within a long row---| + // | | flag/|or # reduce threads for short| // // The upper 32 bits of each rowBlock entry tell the workgroup the ID of the first // row it will be working on. When one workgroup calculates multiple rows, this @@ -102,19 +337,47 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // The lower 24 bits are used whenever multiple workgroups calculate a single long // row. This tells each workgroup its ID within that row, so it knows which // part of the row to operate on. + // Alternately, on "short" row blocks, the lower bits are used to communicate + // the number of threads that should be used for the reduction. Pre-calculating + // this on the CPU-side results in a noticable performance uplift on many matrices. // Bit 24 is a flag bit used so that the multiple WGs calculating a long row can // know when the first workgroup for that row has finished initializing the output // value. While this bit is the same as the first workgroup's flag bit, this // workgroup will spin-loop. unsigned int row = ((rowBlocks[gid] >> (64-ROWBITS)) & ((1UL << ROWBITS) - 1UL)); unsigned int stop_row = ((rowBlocks[gid + 1] >> (64-ROWBITS)) & ((1UL << ROWBITS) - 1UL)); + unsigned int num_rows = stop_row - row; + + // Get the "workgroup within this long row" ID out of the bottom bits of the row block. + unsigned int wg = rowBlocks[gid] & ((1 << WGBITS) - 1); + + // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. + // If there are more items in this row, we assign more workgroups. + unsigned int vecStart = mad24(wg, (unsigned int)(BLOCK_MULTIPLIER*BLOCKSIZE), rowPtrs[row]); + unsigned int vecEnd = (rowPtrs[row + 1] > vecStart + BLOCK_MULTIPLIER*BLOCKSIZE) ? 
vecStart + BLOCK_MULTIPLIER*BLOCKSIZE : rowPtrs[row + 1]; + +#if (defined(DOUBLE) || defined(LONG)) && !defined(ATOM64) + // In here because we don't support 64-bit atomics while working on 64-bit data. + // As such, we can't use CSR-LongRows. Time to do a fixup -- first WG does the + // entire row with CSR-Vector. Other rows immediately exit. + if (num_rows == 0 || (num_rows == 1 && wg)) // CSR-LongRows case + { + num_rows = ROWS_FOR_VECTOR; + stop_row = wg ? row : (row + 1); + wg = 0; + } +#endif + + VALUE_TYPE temp_sum = 0.; + VALUE_TYPE sumk_e = 0.; + VALUE_TYPE new_error = 0.; // If the next row block starts more than 2 rows away, then we choose CSR-Stream. // If this is zero (long rows) or one (final workgroup in a long row, or a single - // row in a row block), we want to use the CSR-Vector algorithm. + // row in a row block), we want to use the CSR-Vector algorithm(s). // We have found, through experimentation, that CSR-Vector is generally faster // when working on 2 rows, due to its simplicity and better reduction method. - if (stop_row - row > 2) + if (num_rows > ROWS_FOR_VECTOR) { // CSR-Stream case. See Sections III.A and III.B in the SC'14 paper: // "Efficient Sparse Matrix-Vector Multiplication on GPUs using the CSR Storage Format" @@ -131,24 +394,22 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // long) rows, it's actually better to perform a tree-style reduction where // multiple threads team up to reduce the same row. - // The calculations below tell you how many threads this workgroup can allocate + // The calculation below tells you how many threads this workgroup can allocate // to each row, assuming that every row gets the same number of threads. - // We want the closest lower-power-of-2 to this number -- that is how many - // threads can work in each row's reduction using our algorithm. - int possibleThreadsRed = get_local_size(0)/(stop_row - row); - int numThreadsForRed = lowerPowerOf2(possibleThreadsRed); - - unsigned int local_row = row + lid; + // We want the closest lower (or equal) power-of-2 to this number -- + // that is how many threads can work in each row's reduction using our algorithm. + // For instance, with workgroup size 256, 2 rows = 128 threads, 3 rows = 64 + // threads, 4 rows = 64 threads, 5 rows = 32 threads, etc. + //int numThreadsForRed = get_local_size(0) >> ((CHAR_BIT*sizeof(unsigned int))-clz(num_rows-1)); + const unsigned int numThreadsForRed = wg; // Same calculation as above, done on host. // Stream all of this row block's matrix values into local memory. // Perform the matvec in parallel with this work. - int col = rowPtrs[row] + lid; + const unsigned int col = rowPtrs[row] + lid; if (gid != (get_num_groups(0) - 1)) { - for(int i = 0; i < BLOCKSIZE; i += 256) - { - partialSums[lid + i] = vals[col + i] * vec[cols[col + i]]; - } + for(int i = 0; i < BLOCKSIZE; i += WG_SIZE) + partialSums[lid + i] = alpha * vals[col + i] * vec[cols[col + i]]; } else { @@ -159,11 +420,9 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // However, this may change in the future (e.g. with shared virtual memory.) // This causes a minor performance loss because this is the last workgroup // to be launched, and this loop can't be unrolled. 
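The two_sum/two_fma helpers defined earlier are the whole of the compensated-summation machinery. As a plain-C++ illustration of what the carried error buys, the sketch below accumulates many values that are individually too small to register in a float sum: the naive total never moves, while the compensated total recovers them. It mirrors the branch-based 2Sum used under EXTENDED_PRECISION, but it is host code, not kernel code.

    // Editorial sketch: branch-based 2Sum (as used under EXTENDED_PRECISION) in plain C++.
    #include <cmath>
    #include <cstdio>

    // Returns the rounded sum of x and y and accumulates the rounding error in *err.
    static float two_sum( float x, float y, float* err )
    {
        const float s = x + y;
        if ( std::fabs( x ) < std::fabs( y ) ) { const float t = x; x = y; y = t; }
        *err += ( y - ( s - x ) );          // Fast2Sum error term, valid since |x| >= |y|
        return s;
    }

    int main( )
    {
        // 1.0f followed by many terms too small to register individually in float.
        float naive = 1.0f, comp = 1.0f, err = 0.0f;
        for ( int i = 0; i < 10000; ++i )
        {
            naive += 1e-8f;
            comp   = two_sum( comp, 1e-8f, &err );
        }
        std::printf( "naive       = %.7f\n", naive );        // stays at 1.0000000
        std::printf( "compensated = %.7f\n", comp + err );   // ~1.0001
        return 0;
    }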
- int max_to_load = rowPtrs[stop_row] - rowPtrs[row]; - for(int i = 0; i < max_to_load; i += 256) - { - partialSums[lid + i] = vals[col + i] * vec[cols[col + i]]; - } + const unsigned int max_to_load = rowPtrs[stop_row] - rowPtrs[row]; + for(int i = 0; i < max_to_load; i += WG_SIZE) + partialSums[lid + i] = alpha * vals[col + i] * vec[cols[col + i]]; } barrier(CLK_LOCAL_MEM_FENCE); @@ -172,58 +431,56 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // In this case, we want to have the workgroup perform a tree-style reduction // of each row. {numThreadsForRed} adjacent threads team up to linearly reduce // a row into {numThreadsForRed} locations in local memory. - // After that, a single thread from each of those teams linearly walks through - // the local memory values for that row and reduces to the final output value. - FPTYPE temp = 0.; - - // {numThreadsForRed} adjacent threads all work on the same row, so their - // start and end values are the same. - size_t st = lid/numThreadsForRed; - int local_first_val = (rowPtrs[row + st] - rowPtrs[row]); - int local_last_val = rowPtrs[row + st + 1] - rowPtrs[row]; - int workForEachThread = (local_last_val - local_first_val)/numThreadsForRed; - size_t threadInBlock = lid & (numThreadsForRed - 1); - - // Not all row blocks are full -- they may have an odd number of rows. As such, - // we need to ensure that adjacent-groups only work on real data for this rowBlock. - if(st < (stop_row - row)) - { - // only works when numThreadsForRed is a power of 2 - for(int i = 0; i < workForEachThread; i++) - { - temp += partialSums[local_first_val + i*numThreadsForRed + threadInBlock]; - } - - // The last few values (the remainder of this row) also need to be aded in. - int local_cur_val = local_first_val + numThreadsForRed*workForEachThread; - if(threadInBlock < local_last_val - local_cur_val) - { - temp += partialSums[local_cur_val + threadInBlock]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - partialSums[lid] = temp; - - // Step one of this two-stage reduction is done. Now each row has {numThreadsForRed} - // values sitting in the local memory. This next step takes the first thread from - // each of the adjacent-groups and uses it to walk through those values and reduce - // them into a final output value for the row. - temp = 0.; - if(lid < (stop_row - row)) - { -#pragma unroll 4 - for(int i = 0; i < numThreadsForRed; i++) - { - temp += partialSums[lid*numThreadsForRed + i]; - } - // All of our write-outs check to see if the output vector should first be zeroed. - // If so, just do a write rather than a read-write. Measured to be a slight (~5%) - // performance improvement. - if (beta != 0.) - out[local_row] = (beta * out[local_row]) + (alpha * temp); - else - out[local_row] = (alpha * temp); - } + // After that, the entire workgroup does a parallel reduction, and each + // row ends up with an individual answer. + + // {numThreadsForRed} adjacent threads all work on the same row, so their + // start and end values are the same. + // numThreadsForRed guaranteed to be a power of two, so the clz code below + // avoids an integer divide. ~2% perf gain in EXTRA_PRECISION. 
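The remark above about avoiding an integer divide rests on numThreadsForRed being a power of two: dividing by 2^k is a right shift by k, and 31 - clz(2^k) recovers k. A tiny host-side check of that identity, using GCC/Clang's __builtin_clz as a stand-in for OpenCL's clz (editorial only):

    // Editorial sketch: lid / n == lid >> (31 - clz(n)) whenever n is a power of two.
    #include <cassert>

    int main( )
    {
        for ( unsigned n = 1; n <= 256; n <<= 1 )            // power-of-two team sizes
            for ( unsigned lid = 0; lid < 256; ++lid )
                assert( lid / n == ( lid >> ( 31 - __builtin_clz( n ) ) ) );
        return 0;
    }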
+ //size_t st = lid/numThreadsForRed; + const unsigned int local_row = row + (lid >> (31 - clz(numThreadsForRed))); + const unsigned int local_first_val = rowPtrs[local_row] - rowPtrs[row]; + const unsigned int local_last_val = rowPtrs[local_row + 1] - rowPtrs[row]; + const unsigned int threadInBlock = lid & (numThreadsForRed - 1); + + // Not all row blocks are full -- they may have an odd number of rows. As such, + // we need to ensure that adjacent-groups only work on real data for this rowBlock. + if(local_row < stop_row) + { + // This is dangerous -- will infinite loop if your last value is within + // numThreadsForRed of MAX_UINT. Noticable performance gain to avoid a + // long induction variable here, though. + for(unsigned int local_cur_val = local_first_val + threadInBlock; + local_cur_val < local_last_val; + local_cur_val += numThreadsForRed) + temp_sum = two_sum(partialSums[local_cur_val], temp_sum, &sumk_e); + } + barrier(CLK_LOCAL_MEM_FENCE); + + temp_sum = two_sum(temp_sum, sumk_e, &new_error); + partialSums[lid] = temp_sum; + + // Step one of this two-stage reduction is done. Now each row has {numThreadsForRed} + // values sitting in the local memory. This means that, roughly, the beginning of + // LDS is full up to {workgroup size} entries. + // Now we perform a parallel reduction that sums together the answers for each + // row in parallel, leaving us an answer in 'temp_sum' for each row. + for (int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + barrier( CLK_LOCAL_MEM_FENCE ); + temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, i); + } + + if (threadInBlock == 0 && local_row < stop_row) + { + // All of our write-outs check to see if the output vector should first be zeroed. + // If so, just do a write rather than a read-write. Measured to be a slight (~5%) + // performance improvement. + if (beta != 0.) + temp_sum = two_fma(beta, out[local_row], temp_sum, &new_error); + out[local_row] = temp_sum + new_error; + } } else { @@ -232,30 +489,77 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // However, this reduction is also much faster than CSR-Scalar, because local memory // is designed for scatter-gather operations. // We need a while loop because there may be more rows than threads in the WG. - while(local_row < stop_row) - { - int local_first_val = (rowPtrs[local_row] - rowPtrs[row]); - int local_last_val = rowPtrs[local_row + 1] - rowPtrs[row]; - FPTYPE temp = 0; - for (int local_cur_val = local_first_val; local_cur_val < local_last_val; local_cur_val++) - { - temp += partialSums[local_cur_val]; - } - - // After you've done the reduction into the temp register, - // put that into the output for each row. - if (beta != 0.) - out[local_row] = (beta * out[local_row]) + (alpha * temp); - else - out[local_row] = (alpha * temp); - local_row += get_local_size(0); - } + unsigned int local_row = row + lid; + while(local_row < stop_row) + { + int local_first_val = (rowPtrs[local_row] - rowPtrs[row]); + int local_last_val = rowPtrs[local_row + 1] - rowPtrs[row]; + temp_sum = 0.; + sumk_e = 0.; + for (int local_cur_val = local_first_val; local_cur_val < local_last_val; local_cur_val++) + temp_sum = two_sum(partialSums[local_cur_val], temp_sum, &sumk_e); + + // After you've done the reduction into the temp_sum register, + // put that into the output for each row. + if (beta != 0.) 
+ temp_sum = two_fma(beta, out[local_row], temp_sum, &sumk_e); + out[local_row] = temp_sum + sumk_e; + local_row += WG_SIZE; + } } } + else if (num_rows >= 1 && !wg) // CSR-Vector case. + { + // ^^ The above check says that if this workgroup is supposed to work on <= ROWS_VECTOR + // number of rows then we should do the CSR-Vector algorithm. If we want this row to be + // done with CSR-LongRows, then all of its workgroups (except the last one) will have the + // same stop_row and row. The final workgroup in a LongRow will have stop_row and row + // different, but the internal "wg" number will be non-zero. + + // If this workgroup is operating on multiple rows (because CSR-Stream is poor for small + // numbers of rows), then it needs to iterate until it reaches the stop_row. + // We don't check <= stop_row because of the potential for unsigned overflow. + while (row < stop_row) + { + // Any workgroup only calculates, at most, BLOCKSIZE items in this row. + // If there are more items in this row, we use CSR-LongRows. + temp_sum = 0.; + sumk_e = 0.; + new_error = 0.; + vecStart = rowPtrs[row]; + vecEnd = rowPtrs[row+1]; + + // Load in a bunch of partial results into your register space, rather than LDS (no contention) + // Then dump the partially reduced answers into the LDS for inter-work-item reduction. + // Using a long induction variable to make sure unsigned int overflow doesn't break things. + for (long j = vecStart + lid; j < vecEnd; j+=WG_SIZE) + { + const unsigned int col = cols[(unsigned int)j]; + temp_sum = two_fma(alpha*vals[(unsigned int)j], vec[col], temp_sum, &sumk_e); + } + + temp_sum = two_sum(temp_sum, sumk_e, &new_error); + partialSums[lid] = temp_sum; + + // Reduce partial sums + for (int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + barrier( CLK_LOCAL_MEM_FENCE); + temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, lid, WG_SIZE, i); + } + + if (lid == 0UL) + { + if (beta != 0.) + temp_sum = two_fma(beta, out[row], temp_sum, &new_error); + out[row] = temp_sum + new_error; + } + row++; + } + } else { - // In CSR-Vector, we may have more than one workgroup calculating this row - // or one workgroup calculating two rows. + // In CSR-LongRows, we have more than one workgroup calculating this row. // The output values for those types of rows are stored using atomic_add, because // more than one parallel workgroup's value makes up the final answer. // Unfortunately, this makes it difficult to do y=Ax, rather than y=Ax+y, because @@ -268,93 +572,113 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // First, figure out which workgroup you are in the row. Bottom 24 bits. // You can use that to find the global ID for the first workgroup calculating // this long row. - size_t first_wg_in_row = gid - (rowBlocks[gid] & ((1UL << WGBITS) - 1UL)); - unsigned int compare_value = rowBlocks[gid] & (1UL << WGBITS); + const unsigned int first_wg_in_row = gid - (rowBlocks[gid] & ((1UL << WGBITS) - 1UL)); + const unsigned int compare_value = rowBlocks[gid] & (1UL << WGBITS); // Bit 24 in the first workgroup is the flag that everyone waits on. - if(gid == first_wg_in_row && lid == 0) - { - // The first workgroup handles the output initialization. - if (beta != 0.) - out[row] *= beta; - else - out[row] = 0.; - // We currently have, at most, two rows in a CSR-Vector calculation. - // If we have two, we need to initialize the second output as well. - if(stop_row - row == 2) - { - if (beta != 0.) 
- out[row+1] *= beta; - else - out[row+1] = 0.; - } - atom_xor(&rowBlocks[first_wg_in_row], (1UL << WGBITS)); // Release other workgroups. + if(gid == first_wg_in_row && lid == 0UL) + { + // The first workgroup handles the output initialization. + volatile VALUE_TYPE out_val = out[row]; + temp_sum = (beta - 1.) * out_val; +#ifdef EXTENDED_PRECISION + rowBlocks[get_num_groups(0) + gid + 1] = 0UL; +#endif + clsparse_atomic_xor(&rowBlocks[first_wg_in_row], (1UL << WGBITS)); // Release other workgroups. } // For every other workgroup, bit 24 holds the value they wait on. // If your bit 24 == first_wg's bit 24, you spin loop. // The first workgroup will eventually flip this bit, and you can move forward. barrier(CLK_GLOBAL_MEM_FENCE); - while(gid != first_wg_in_row && - lid==0 && - ((atom_max(&rowBlocks[first_wg_in_row], 0UL) & (1UL << WGBITS)) == compare_value)); + while(gid != first_wg_in_row && + lid == 0U && + ((clsparse_atomic_max(&rowBlocks[first_wg_in_row], 0UL) & (1UL << WGBITS)) == compare_value)); barrier(CLK_GLOBAL_MEM_FENCE); // After you've passed the barrier, update your local flag to make sure that // the next time through, you know what to wait on. - if (gid != first_wg_in_row && lid == 0) + if (gid != first_wg_in_row && lid == 0UL) rowBlocks[gid] ^= (1UL << WGBITS); - unsigned int myRow = row; // Adding another variable for row as it is getting updated below - char iteration = 0; - - // CSR-Vector will always do at least one iteration. // All but the final workgroup in a long-row collaboration have the same start_row // and stop_row. They only run for one iteration. - // If this workgroup is operating on multiple rows (because CSR-Stream is poor for small - // #s of rows), then it is (currently) not part of a collaboration on a long - // row. As such, it needs to iterate until it reaches the stop_row. - // We don't check <= stop_row because of the potential for unsigned overflow. - while (iteration == 0 || myRow < stop_row) - { - // Get the "workgroup within this long row" ID out of the bottom bits of the row block. - // If this is the only workgroup working on this row, this will be zero, so still correct. - unsigned int wg = rowBlocks[gid] & ((1 << WGBITS) - 1); - - // Any workgroup only calculates, at most, BLOCKSIZE items in this row. - // If there are more items in this row, we assign more workgroups. - unsigned int vecStart = rowPtrs[myRow] + (wg * BLOCKSIZE); - unsigned int vecEnd = (rowPtrs[myRow + 1] > vecStart + BLOCKSIZE) ? vecStart + BLOCKSIZE : rowPtrs[myRow+1]; - - // Load in a bunch of partial results into your register space, rather than LDS (no contention) - // Then dump the partially reduced answers into the LDS for inter-work-item reduction. - // Using a long induction variable to make sure unsigned int overflow doesn't break things. - FPTYPE mySum = 0.; - for (long j = vecStart + lid; j < vecEnd; j+=WGSIZE) - { - unsigned int col = cols[(unsigned int)j]; - mySum += vals[(unsigned int)j] * vec[col]; - } - partialSums[lid] = mySum; - barrier(CLK_LOCAL_MEM_FENCE); - - // Reduce partial sums - // Needs to be modified if there is a change in the # of work-items per workgroup. 
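The (beta - 1.) * out_val seeding above is worth spelling out: out[row] still holds its old value y0 when the atomic adds arrive (the flag handshake ensures the first workgroup has captured y0, and under EXTENDED_PRECISION has zeroed its error slot, before any partial sums are added), so the final value is y0 + (beta - 1)*y0 + alpha*(Ax)_row = beta*y0 + alpha*(Ax)_row, exactly the y = alpha*A*x + beta*y the kernel owes. A small arithmetic check of that identity, editorial only:

    // Editorial sketch: why seeding the first workgroup with (beta - 1) * y0 gives
    // the same result as scaling the output by beta up front.
    #include <cassert>
    #include <cmath>

    int main( )
    {
        const double y0 = 2.5, beta = 0.75;
        const double partials[ 3 ] = { 0.5, 1.25, -0.25 };   // alpha*A*x pieces from 3 workgroups

        // Reference: y = beta*y0 + alpha*(Ax)_row, alpha already folded into the pieces.
        double reference = beta * y0;
        for ( double p : partials ) reference += p;

        // Kernel scheme: out[row] keeps y0; WG 0 adds (beta-1)*y0 plus its piece,
        // the other workgroups just add their pieces (atomically, on the device).
        double out = y0;
        out += ( beta - 1.0 ) * y0 + partials[ 0 ];
        out += partials[ 1 ];
        out += partials[ 2 ];

        assert( std::fabs( out - reference ) < 1e-12 );
        return 0;
    }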
- if(lid < 64) partialSums[lid] += partialSums[lid + 64] + partialSums[lid + 128] + partialSums[lid + 192]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 16) partialSums[lid] += partialSums[lid + 16] + partialSums[lid + 32] + partialSums[lid + 48]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 4) partialSums[lid] += partialSums[lid + 4] + partialSums[lid + 8] + partialSums[lid + 12]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 1) partialSums[lid] += partialSums[lid + 1] + partialSums[lid + 2] + partialSums[lid + 3]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid == 0) - atomic_add_float(&out[myRow], (alpha * partialSums[0])); - - // CSR-VECTOR on workgroups for two rows which are inefficient for CSR-Stream - myRow++; - iteration++; - } + // Load in a bunch of partial results into your register space, rather than LDS (no contention) + // Then dump the partially reduced answers into the LDS for inter-work-item reduction. + const unsigned int col = vecStart + lid; + if (row == stop_row) // inner thread, we can hardcode/unroll this loop + { + // Don't put BLOCK_MULTIPLIER*BLOCKSIZE as the stop point, because + // some GPU compilers will *aggressively* unroll this loop. + // That increases register pressure and reduces occupancy. + for (int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) + { + temp_sum = two_fma(alpha*vals[col + j], vec[cols[col + j]], temp_sum, &sumk_e); +#if 2*WG_SIZE <= BLOCK_MULTIPLIER*BLOCKSIZE + // If you can, unroll this loop once. It somewhat helps performance. + j += WG_SIZE; + temp_sum = two_fma(alpha*vals[col + j], vec[cols[col + j]], temp_sum, &sumk_e); +#endif + } + } + else + { + for(int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) + temp_sum = two_fma(alpha*vals[col + j], vec[cols[col + j]], temp_sum, &sumk_e); + } + + temp_sum = two_sum(temp_sum, sumk_e, &new_error); + partialSums[lid] = temp_sum; + + // Reduce partial sums + for (int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + barrier( CLK_LOCAL_MEM_FENCE); + temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, lid, WG_SIZE, i); + } + + if (lid == 0UL) + { + atomic_two_sum_float(&out[row], temp_sum, &new_error); + +#ifdef EXTENDED_PRECISION + unsigned int error_loc = get_num_groups(0) + first_wg_in_row + 1; + // The last half of the rowBlocks buffer is used to hold errors. + atomic_add_float(&(rowBlocks[error_loc]), new_error); + // Coordinate across all of the workgroups in this coop in order to have + // the last workgroup fix up the error values. + // If this workgroup's row is different than the next workgroup's row + // then this is the last workgroup -- it's this workgroup's job to add + // the error values into the final sum. + if (row != stop_row) + { + // Go forward once your ID is the same as the low order bits of the + // coop's first workgroup. That value will be used to store the number + // of threads that have completed so far. Once all the previous threads + // are done, it's time to send out the errors! + while((clsparse_atomic_max(&rowBlocks[first_wg_in_row], 0UL) & ((1UL << WGBITS) - 1)) != wg); + +#ifdef DOUBLE + new_error = as_double(rowBlocks[error_loc]); +#else + new_error = as_float((int)rowBlocks[error_loc]); +#endif + // Don't need to work atomically here, because this is the only workgroup + // left working on this row. + out[row] += new_error; + rowBlocks[error_loc] = 0UL; + + // Reset the rowBlocks low order bits for next time. + rowBlocks[first_wg_in_row] = rowBlocks[gid] - wg; + } + else + { + // Otherwise, increment the low order bits of the first thread in this + // coop. 
We're using this to tell how many workgroups in a coop are done.
+ // Do this with an atomic, since other threads may be doing this too.
+ const unsigned long no_warn = clsparse_atomic_inc(&rowBlocks[first_wg_in_row]);
+ }
+#endif
+ }
}
}
)"
diff --git a/src/library/kernels/csrmv_general.cl b/src/library/kernels/csrmv_general.cl
index 7d0c68f..59d36c9 100644
--- a/src/library/kernels/csrmv_general.cl
+++ b/src/library/kernels/csrmv_general.cl
@@ -51,6 +51,132 @@ R"(
# error SUBWAVE_SIZE is not a power of two!
#endif
+// Knuth's Two-Sum algorithm, which allows us to add together two floating
+// point numbers and exactly transform the answer into a sum and a
+// rounding error.
+// Inputs: x and y, the two inputs to be added together.
+// In/Out: *sumk_err, which is incremented (by reference) -- holds the
+// error value as a result of the 2sum calculation.
+// Returns: The non-corrected sum of inputs x and y.
+VALUE_TYPE two_sum( VALUE_TYPE x,
+ VALUE_TYPE y,
+ VALUE_TYPE * restrict const sumk_err)
+{
+ const VALUE_TYPE sumk_s = x + y;
+#ifdef EXTENDED_PRECISION
+ // We use this 2Sum algorithm to perform a compensated summation,
+ // which can reduce the cumulative rounding errors in our SpMV summation.
+ // Our compensated summation is based on the SumK algorithm (with K==2) from
+ // Ogita, Rump, and Oishi, "Accurate Sum and Dot Product" in
+ // SIAM J. on Scientific Computing 26(6) pp 1955-1988, Jun. 2005.
+
+ // 2Sum can be done in 6 FLOPs without a branch. However, calculating in
+ // double precision is slower than single precision on every existing GPU.
+ // As such, replacing 2Sum with Fast2Sum when using DPFP results in slightly
+ // better performance. This is especially true on non-workstation GPUs with
+ // low DPFP rates. Fast2Sum is faster even though we must ensure that
+ // |a| > |b|. Branch divergence is better than the DPFP slowdown.
+ // Thus, for DPFP, our compensated summation algorithm is actually described
+ // by both Pichat and Neumaier in "Correction d'une somme en arithmetique
+ // a virgule flottante" (J. Numerische Mathematik 19(5) pp. 400-406, 1972)
+ // and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher
+ // Summen" (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51,
+ // 1974), respectively.
+ if (fabs(x) < fabs(y))
+ {
+ const VALUE_TYPE swap = x;
+ x = y;
+ y = swap;
+ }
+ (*sumk_err) += (y - (sumk_s - x));
+ // Original 6 FLOP 2Sum algorithm.
+ //VALUE_TYPE bp = sumk_s - x;
+ //(*sumk_err) += ((x - (sumk_s - bp)) + (y - bp));
+#endif
+ return sumk_s;
+}
+
+// Performs (x_vals * x_vec) + y using an FMA.
+// Ideally, we would perform an error-free transformation here and return the
+// appropriate error. However, the EFT of an FMA is very expensive. As such,
+// if we are in EXTENDED_PRECISION mode, this function devolves into two_sum
+// with x_vals and x_vec inputs multiplied separately from the compensated add.
+VALUE_TYPE two_fma( const VALUE_TYPE x_vals,
+ const VALUE_TYPE x_vec,
+ VALUE_TYPE y,
+ VALUE_TYPE * restrict const sumk_err )
+{
+#ifdef EXTENDED_PRECISION
+ VALUE_TYPE x = x_vals * x_vec;
+ const VALUE_TYPE sumk_s = x + y;
+ if (fabs(x) < fabs(y))
+ {
+ const VALUE_TYPE swap = x;
+ x = y;
+ y = swap;
+ }
+ (*sumk_err) += (y - (sumk_s - x));
+ // 2Sum in the FMA case. Poor performance on low-DPFP GPUs.
+ //const VALUE_TYPE bp = fma(-x_vals, x_vec, sumk_s);
+ //(*sumk_err) += (fma(x_vals, x_vec, -(sumk_s - bp)) + (y - bp));
+ return sumk_s;
+#else
+ return fma(x_vals, x_vec, y);
+#endif
+}
+
+// A method of doing the final reduction without having to copy and paste
+// it a bunch of times.
+// The EXTENDED_PRECISION section is done as part of the PSum2 addition,
+// where we take temporary sums and errors for multiple threads and combine
+// them together using the same 2Sum method.
+// Inputs: cur_sum: the input from which our sum starts
+// err: the current running cascade error for this final summation
+// partial: the local memory which holds the values to sum
+// (we eventually use it to pass down temp. err vals as well)
+// lid: local ID of the work item calling this function.
+// thread_lane: The lane within this SUBWAVE for reduction.
+// round: This parallel summation method operates in multiple rounds
+// to do a parallel reduction. See the below comment for usage.
+VALUE_TYPE sum2_reduce( VALUE_TYPE cur_sum,
+ VALUE_TYPE * restrict const err,
+ volatile __local VALUE_TYPE * restrict const partial,
+ const INDEX_TYPE lid,
+ const INDEX_TYPE thread_lane,
+ const INDEX_TYPE round)
+{
+ if (SUBWAVE_SIZE > round)
+ {
+#ifdef EXTENDED_PRECISION
+ const unsigned int partial_dest = lid + round;
+ if (thread_lane < round)
+ cur_sum = two_sum(cur_sum, partial[partial_dest], err);
+ // We reuse the LDS entries to move the error values down into lower
+ // threads. This saves LDS space, allowing higher occupancy, but requires
+ // more barriers, which can reduce performance.
+ barrier(CLK_LOCAL_MEM_FENCE);
+ // Have all of those upper threads pass their temporary errors
+ // into a location that the lower threads can read.
+ if (thread_lane >= round)
+ partial[lid] = *err;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (thread_lane < round) { // Add those errors in.
+ *err += partial[partial_dest];
+ partial[lid] = cur_sum;
+ }
+#else
+ // This is the more traditional reduction algorithm. It is up to
+ // 25% faster (about 10% on average -- potentially worse on devices
+ // with low double-precision calculation rates), but can result in
+ // numerical inaccuracies, especially in single precision.
+ cur_sum += partial[lid + round];
+ barrier( CLK_LOCAL_MEM_FENCE );
+ partial[lid] = cur_sum;
+#endif
+ }
+ return cur_sum;
+}
+
// Uses macro constants:
// WAVE_SIZE - "warp size", typically 64 (AMD) or 32 (NV)
// WG_SIZE - workgroup ("block") size, 1D representation assumed
@@ -75,55 +201,56 @@ void csrmv_general ( const INDEX_TYPE num_rows,
local volatile VALUE_TYPE sdata [WG_SIZE + SUBWAVE_SIZE / 2];
//const int vectors_per_block = WG_SIZE/SUBWAVE_SIZE;
- const int global_id = get_global_id(0); // global workitem id
- const int local_id = get_local_id(0); // local workitem id
- const int thread_lane = local_id & (SUBWAVE_SIZE - 1);
- const int vector_id = global_id / SUBWAVE_SIZE; // global vector id
+ const INDEX_TYPE global_id = get_global_id(0); // global workitem id
+ const INDEX_TYPE local_id = get_local_id(0); // local workitem id
+ const INDEX_TYPE thread_lane = local_id & (SUBWAVE_SIZE - 1);
+ const INDEX_TYPE vector_id = global_id / SUBWAVE_SIZE; // global vector id
//const int vector_lane = local_id / SUBWAVE_SIZE; // vector id within the workgroup
- const int num_vectors = get_global_size(0) / SUBWAVE_SIZE;
+ const INDEX_TYPE num_vectors = get_global_size(0) / SUBWAVE_SIZE;
const VALUE_TYPE _alpha = alpha[off_alpha];
const VALUE_TYPE _beta = beta[off_beta];
for(INDEX_TYPE row = vector_id; row < num_rows; row += num_vectors)
{
- const int row_start = row_offset[row];
- const int row_end = row_offset[row+1];
- VALUE_TYPE sum = (VALUE_TYPE) 0;
+ const INDEX_TYPE row_start = row_offset[row];
+ const INDEX_TYPE row_end = row_offset[row+1];
+ VALUE_TYPE sum = 0.;
- for(int j = row_start + thread_lane; j < row_end; j += SUBWAVE_SIZE)
- {
- if (_alpha == 1)
- sum = fma(val[j], x[off_x + col[j]], sum);
- else if (_alpha == 0)
- sum = 0;
- else
- sum = fma(_alpha * val[j], x[off_x + col[j]], sum);//sum += val[j] * x[col[j]];
- }
+ VALUE_TYPE sumk_e = 0.;
+ // It is about 5% faster to always multiply by alpha, rather than to
+ // check whether alpha is 0, 1, or other and do different code paths.
+ for(INDEX_TYPE j = row_start + thread_lane; j < row_end; j += SUBWAVE_SIZE)
+ sum = two_fma(_alpha * val[j], x[off_x + col[j]], sum, &sumk_e);
+ VALUE_TYPE new_error = 0.;
+ sum = two_sum(sum, sumk_e, &new_error);
- //parllel reduction in shared memory
+ // Parallel reduction in shared memory.
sdata[local_id] = sum;
- barrier( CLK_LOCAL_MEM_FENCE );
- if (SUBWAVE_SIZE > 32) sdata[local_id] = sum += sdata[local_id + 32];
- barrier( CLK_LOCAL_MEM_FENCE );
- if (SUBWAVE_SIZE > 16) sdata[local_id] = sum += sdata[local_id + 16];
- barrier( CLK_LOCAL_MEM_FENCE );
- if (SUBWAVE_SIZE > 8) sdata[local_id] = sum += sdata[local_id + 8];
- barrier( CLK_LOCAL_MEM_FENCE );
- if (SUBWAVE_SIZE > 4) sdata[local_id] = sum += sdata[local_id + 4];
- barrier( CLK_LOCAL_MEM_FENCE );
- if (SUBWAVE_SIZE > 2) sdata[local_id] = sum += sdata[local_id + 2];
- barrier( CLK_LOCAL_MEM_FENCE );
- if (SUBWAVE_SIZE > 1) sum += sdata[local_id + 1];
+
+ // This compensated summation reduces cumulative rounding errors,
+ // which can become a problem on GPUs because our reduction order is
+ // different than what would be used on a CPU.
+ // It is based on the PSumK algorithm (with K==2) from
+ // Yamanaka, Ogita, Rump, and Oishi, "A Parallel Algorithm for
+ // Accurate Dot Product," in Parallel Computing,
+ // 34(6-8), pp. 392-410, Jul. 2008.
+ #pragma unroll
+ for (int i = (WG_SIZE >> 1); i > 0; i >>= 1)
+ {
+ barrier( CLK_LOCAL_MEM_FENCE );
+ sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, i);
+ }
if (thread_lane == 0)
{
- if (_beta == 1)
- y[off_y + row] = sum + y[off_y + row];
- else if (_beta == 0)
- y[off_y + row] = sum;
+ if (_beta == 0)
+ y[off_y + row] = sum + new_error;
else
- y[off_y + row] = sum + _beta * y[off_y + row];
+ {
+ sum = two_fma(_beta, y[off_y + row], sum, &new_error);
+ y[off_y + row] = sum + new_error;
+ }
}
}
}
diff --git a/src/library/transform/clsparse-coo2csr.cpp b/src/library/transform/clsparse-coo2csr.cpp
index ed6c84c..6a36d8f 100644
--- a/src/library/transform/clsparse-coo2csr.cpp
+++ b/src/library/transform/clsparse-coo2csr.cpp
@@ -77,7 +77,9 @@ clsparseCsrMetaSize( clsparseCsrMatrix* csrMatx, clsparseControl control )
// This allocates up front the maximum size of rowBlocks at start; likely not all the memory is used but
// this is the fastest
- pCsrMatx->rowBlockSize = 3 * ( pCsrMatx->num_nonzeros / BLKSIZE ) + 2;
+ // The formula is 3 * (NNZ / block size) + 2, but we double this because CSR-Adaptive uses the
+ // second half of the rowBlocks buffer for global reductions.
+ pCsrMatx->rowBlockSize = 6 * ( pCsrMatx->num_nonzeros / BLKSIZE ) + 4;
return clsparseSuccess;
}
@@ -100,7 +102,7 @@ clsparseCsrMetaCompute( clsparseCsrMatrix* csrMatx, clsparseControl control )
clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks );
cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize );
- ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, rowDelimiters, pCsrMatx->num_rows, BLKSIZE );
+ ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, rowDelimiters, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, true );
return clsparseSuccess;
}
diff --git a/src/tests/resources/csr_matrix_environment.h b/src/tests/resources/csr_matrix_environment.h
index f5df9fa..74fc5d5 100644
--- a/src/tests/resources/csr_matrix_environment.h
+++ b/src/tests/resources/csr_matrix_environment.h
@@ -73,7 +73,7 @@ class CSREnvironment: public ::testing::Environment
csrSMatrix.rowOffsets = ::clCreateBuffer( context, CL_MEM_READ_ONLY,
( csrSMatrix.num_rows + 1 ) * sizeof( cl_int ), NULL, &status );
- csrSMatrix.rowBlocks = ::clCreateBuffer( context, CL_MEM_READ_ONLY,
+ csrSMatrix.rowBlocks = ::clCreateBuffer( context, CL_MEM_READ_WRITE,
csrSMatrix.rowBlockSize * sizeof( cl_ulong ), NULL, &status );
clsparseStatus fileError = clsparseSCsrMatrixfromFile( &csrSMatrix, file_name.c_str( ), CLSE::control );
diff --git a/src/tests/test-blas2.cpp b/src/tests/test-blas2.cpp
index d72b5c0..25c12c3 100644
--- a/src/tests/test-blas2.cpp
+++ b/src/tests/test-blas2.cpp
@@ -26,12 +26,14 @@
#include
#include
+// ULP calculation
+#include
clsparseControl ClSparseEnvironment::control = NULL;
cl_command_queue ClSparseEnvironment::queue = NULL;
cl_context ClSparseEnvironment::context = NULL;
//cl_uint ClSparseEnvironment::N = 1024;
-
+static cl_bool extended_precision = false;
namespace po = boost::program_options;
namespace uBLAS = boost::numeric::ublas;
@@ -104,11 +106,35 @@ class Blas2 : public ::testing::Test
}
+ // Knuth's Two-Sum algorithm, which allows us to add together two floating
+ // point numbers and exactly transform the answer into a sum and a
+ // rounding error.
+ // Inputs: x and y, the two inputs to be added together.
+ // In/Out: *sumk_err, which is incremented (by reference) -- holds the
+ // error value as a result of the 2sum calculation.
+ // Returns: The non-corrected sum of inputs x and y.
+ T two_sum(T x, T y, T *sumk_err)
+ {
+ // We use this 2Sum algorithm to perform a compensated summation,
+ // which can reduce the cumulative rounding errors in our SpMV
+ // summation. Our compensated summation is based on the SumK algorithm
+ // (with K==2) from Ogita, Rump, and Oishi, "Accurate Sum and Dot
+ // Product" in SIAM J. on Scientific Computing 26(6) pp 1955-1988,
+ // Jun. 2005.
+ T sumk_s = x + y;
+ T bp = sumk_s - x;
+ (*sumk_err) += ((x - (sumk_s - bp)) + (y - bp));
+ return sumk_s;
+ }
+
void test_csrmv()
{
clsparseStatus status;
cl_int cl_status;
+ if (extended_precision)
+ clsparseEnableExtendedPrecision(CLSE::control, true);
+
if (typeid(T) == typeid(cl_float) )
{
status = clsparseScsrmv(&gAlpha, &CSRE::csrSMatrix, &gX,
@@ -121,21 +147,70 @@ class Blas2 : public ::testing::Test
int* cols = &CSRE::ublasSCsr.index2_data()[0];
for (int row = 0; row < CSRE::n_rows; row++)
{
+ // Summation done at a higher precision to decrease
+ // summation errors from rounding.
hY[row] *= hBeta;
int row_end = rows[row+1];
+ double temp_sum;
+ temp_sum = hY[row];
for (int i = rows[row]; i < rows[row+1]; i++)
- hY[row] += hAlpha * vals[i] * hX[cols[i]];
+ {
+ // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]];
+ temp_sum += hAlpha * vals[i] * hX[cols[i]];
+ }
+ hY[row] = temp_sum;
}
-
T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values,
CL_TRUE, CL_MAP_READ,
0, gY.num_values * sizeof(T),
0, nullptr, nullptr, &cl_status);
ASSERT_EQ(CL_SUCCESS, cl_status);
+ uint64_t max_ulps = 0;
+ uint64_t min_ulps = UINT64_MAX;
+ uint64_t total_ulps = 0;
for (int i = 0; i < hY.size(); i++)
- ASSERT_NEAR(hY[i], host_result[i], fabs(hY[i]*1e-3));
+ {
+ long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]);
+ intDiff = llabs(intDiff);
+ total_ulps += intDiff;
+ if (max_ulps < intDiff)
+ max_ulps = intDiff;
+ if (min_ulps > intDiff)
+ min_ulps = intDiff;
+ // Debug printouts.
+ //printf("Row %d Float Ulps: %lld\n", i, intDiff);
+ //printf("\tFloat hY[%d] = %.*e (0x%08" PRIx32 "), ", i, 9, hY[i], *(uint32_t *)&hY[i]);
+ //printf("host_result[%d] = %.*e (0x%08" PRIx32 ")\n", i, 9, host_result[i], *(uint32_t *)&host_result[i]);
+ }
+ if (extended_precision)
+ {
+ printf("Float Min ulps: %" PRIu64 "\n", min_ulps);
+ printf("Float Max ulps: %" PRIu64 "\n", max_ulps);
+ printf("Float Total ulps: %" PRIu64 "\n", total_ulps);
+ printf("Float Average ulps: %f (Size: %lu)\n", (double)total_ulps/(double)hY.size(), hY.size());
+ }
+
+ for (int i = 0; i < hY.size(); i++)
+ {
+ double compare_val = 0.;
+ if (extended_precision)
+ {
+ // The limit here is somewhat weak because some GPUs don't
+ // support correctly rounded denorms in SPFP mode.
+ if (boost::math::isnormal(hY[i]))
+ compare_val = fabs(hY[i]*1e-3);
+ }
+ else
+ {
+ if (boost::math::isnormal(hY[i]))
+ compare_val = fabs(hY[i]*0.1);
+ }
+ if (compare_val < 10*FLT_EPSILON)
+ compare_val = 10*FLT_EPSILON;
+ ASSERT_NEAR(hY[i], host_result[i], compare_val);
+ }
cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values,
host_result,
0, nullptr, nullptr);
@@ -154,10 +229,22 @@ class Blas2 : public ::testing::Test
int* cols = &CSRE::ublasDCsr.index2_data()[0];
for (int row = 0; row < CSRE::n_rows; row++)
{
+ // Summation done using a compensated summation to decrease
+ // summation errors from rounding.
This allows us to get + // smaller errors without requiring quad precision support. + // This method is like performing summation at quad precision and + // casting down to double in the end. hY[row] *= hBeta; int row_end = rows[row+1]; + double temp_sum; + temp_sum = hY[row]; + T sumk_err = 0.; for (int i = rows[row]; i < rows[row+1]; i++) - hY[row] += hAlpha * vals[i] * hX[cols[i]]; + { + // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]]; + temp_sum = two_sum(temp_sum, hAlpha*vals[i]*hX[cols[i]], &sumk_err); + } + hY[row] = temp_sum + sumk_err; } T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values, @@ -166,14 +253,64 @@ class Blas2 : public ::testing::Test 0, nullptr, nullptr, &cl_status); ASSERT_EQ(CL_SUCCESS, cl_status); + uint64_t max_ulps = 0; + uint64_t min_ulps = ULLONG_MAX; + uint64_t total_ulps = 0; for (int i = 0; i < hY.size(); i++) - ASSERT_NEAR(hY[i], host_result[i], fabs(hY[i]*1e-10)); + { + long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]); + intDiff = llabs(intDiff); + total_ulps += intDiff; + if (max_ulps < intDiff) + max_ulps = intDiff; + if (min_ulps > intDiff) + min_ulps = intDiff; + // Debug printouts. + //printf("Row %d Double Ulps: %lld\n", i, intDiff); + //printf("\tDouble hY[%d] = %.*e (0x%016" PRIx64 "), ", i, 17, hY[i], *(uint64_t *)&hY[i]); + //printf("host_result[%d] = %.*e (0x%016" PRIx64 ")\n", i, 17, host_result[i], *(uint64_t *)&host_result[i]); + } + if (extended_precision) + { + printf("Double Min ulps: %" PRIu64 "\n", min_ulps); + printf("Double Max ulps: %" PRIu64 "\n", max_ulps); + printf("Double Total ulps: %" PRIu64 "\n", total_ulps); + printf("Double Average ulps: %f (Size: %lu)\n", (double)total_ulps/(double)hY.size(), hY.size()); + + for (int i = 0; i < hY.size(); i++) + { + double compare_val = fabs(hY[i]*1e-14); + if (compare_val < 10*DBL_EPSILON) + compare_val = 10*DBL_EPSILON; + ASSERT_NEAR(hY[i], host_result[i], compare_val); + } + } + else + { + for (int i = 0; i < hY.size(); i++) + { + double compare_val = 0.; + if (boost::math::isnormal(hY[i])) + compare_val = fabs(hY[i]*0.1); + if (compare_val < 10*DBL_EPSILON) + compare_val = 10*DBL_EPSILON; + ASSERT_NEAR(hY[i], host_result[i], compare_val); + } + } cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values, host_result, 0, nullptr, nullptr); ASSERT_EQ(CL_SUCCESS, cl_status); } - + // Reset output buffer for next test. + ::clReleaseMemObject(gY.values); + clsparseInitVector(&gY); + gY.values = clCreateBuffer(CLSE::context, + CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, + hY.size() * sizeof(T), hY.data().begin(), + &cl_status); + gY.num_values = hY.size(); + ASSERT_EQ(CL_SUCCESS, cl_status); } uBLAS::vector hX; @@ -198,13 +335,7 @@ TYPED_TEST_CASE(Blas2, TYPES); TYPED_TEST(Blas2, csrmv_adaptive) { - if ( typeid(TypeParam) == typeid(cl_float) ) - this->test_csrmv(); - else - { -// std::cerr << "Adaptive version of csrmv might crash!" 
<< std::endl; - this->test_csrmv(); - } + this->test_csrmv(); } TYPED_TEST(Blas2, csrmv_vector) @@ -237,7 +368,7 @@ TYPED_TEST(Blas2, csrmv_vector) ASSERT_EQ(clsparseSuccess, status); CSRE::csrSMatrix.rowBlocks = - ::clCreateBuffer( CLSE::context, CL_MEM_READ_ONLY, + ::clCreateBuffer( CLSE::context, CL_MEM_READ_WRITE, CSRE::csrSMatrix.rowBlockSize * sizeof( cl_ulong ), NULL, &cl_status ); @@ -278,8 +409,8 @@ int main (int argc, char* argv[]) ("alpha,a", po::value(&alpha)->default_value(1.0), "Alpha parameter for eq: \n\ty = alpha * M * x + beta * y") ("beta,b", po::value(&beta)->default_value(1.0), - "Beta parameter for eq: \n\ty = alpha * M * x + beta * y"); - + "Beta parameter for eq: \n\ty = alpha * M * x + beta * y") + ("extended,e", po::bool_switch()->default_value(false), "Use compensated summation to improve accuracy by emulating extended precision."); po::variables_map vm; po::parsed_options parsed = @@ -323,6 +454,9 @@ int main (int argc, char* argv[]) } + if (vm["extended"].as()) + extended_precision = true; + ::testing::InitGoogleTest(&argc, argv); //order does matter! ::testing::AddGlobalTestEnvironment( new CLSE(pID, dID));
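
Editor's note (illustration only, not part of the patch): the EXTENDED_PRECISION path above is built on Knuth's 2Sum, i.e. a compensated summation (SumK with K==2) in which every addition also captures its own rounding error and the accumulated error is folded back into the result at the end. The short host-side C++ sketch below demonstrates the technique in isolation; the function and variable names here are illustrative and are not taken from clSPARSE.

#include <cstdio>
#include <vector>

// Knuth's 2Sum: returns round(x + y) and accumulates the exact rounding
// error of that addition into *err (6 FLOPs, no branch).
static double two_sum(double x, double y, double *err)
{
    const double s = x + y;
    const double bp = s - x;
    *err += (x - (s - bp)) + (y - bp);
    return s;
}

int main()
{
    // One large term followed by many terms that are individually too small
    // to change the running sum in plain double arithmetic.
    std::vector<double> v(1000, 1e-16);
    v.insert(v.begin(), 1.0);

    double plain = 0.0, comp = 0.0, err = 0.0;
    for (double x : v)
    {
        plain += x;                    // small terms are rounded away
        comp = two_sum(comp, x, &err); // their contribution is kept in err
    }
    comp += err; // fold the accumulated error back in, as the kernels do

    std::printf("plain       = %.17g\n", plain); // prints 1
    std::printf("compensated = %.17g\n", comp);  // prints ~1.0000000000001
    return 0;
}

At runtime the kernels' compensation is toggled through the clsparseEnableExtendedPrecision() control call, which the new -e/--extended command-line switch added in this patch turns on.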