From ad7144c0a020bb1aaff52a930ef783fac3b63bec Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Mon, 20 Jul 2015 12:20:41 -0500 Subject: [PATCH 01/21] Minor bug fixes. In particular, the change to computeRowBlocks prevents us from adding an extra copy of the final workgroup if it turns out that our total number of rows exactly fits within the final rowBlock. This caused problems with global asynchronous reduction. --- src/library/internal/computeRowBlocks.hpp | 7 +++++-- src/tests/resources/csr_matrix_environment.h | 2 +- src/tests/test-blas2.cpp | 10 ++-------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/library/internal/computeRowBlocks.hpp b/src/library/internal/computeRowBlocks.hpp index 1145aa4..387ca7f 100644 --- a/src/library/internal/computeRowBlocks.hpp +++ b/src/library/internal/computeRowBlocks.hpp @@ -91,8 +91,11 @@ void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* } - *rowBlocks = ( static_cast< rowBlockType >( nRows ) << (64 - ROW_BITS) ); - rowBlocks++; + if ((*(rowBlocks-1) >> (64 - ROW_BITS)) != static_cast< rowBlockType>( nRows)) + { + *rowBlocks = ( static_cast< rowBlockType >( nRows ) << (64 - ROW_BITS) ); + rowBlocks++; + } size_t dist = std::distance( rowBlocksBase, rowBlocks ); assert( dist <= rowBlockSize ); diff --git a/src/tests/resources/csr_matrix_environment.h b/src/tests/resources/csr_matrix_environment.h index bdc2d53..4a7cf95 100644 --- a/src/tests/resources/csr_matrix_environment.h +++ b/src/tests/resources/csr_matrix_environment.h @@ -57,7 +57,7 @@ class CSREnvironment: public ::testing::Environment csrSMatrix.rowOffsets = ::clCreateBuffer( context, CL_MEM_READ_ONLY, ( csrSMatrix.num_rows + 1 ) * sizeof( cl_int ), NULL, &status ); - csrSMatrix.rowBlocks = ::clCreateBuffer( context, CL_MEM_READ_ONLY, + csrSMatrix.rowBlocks = ::clCreateBuffer( context, CL_MEM_READ_WRITE, csrSMatrix.rowBlockSize * sizeof( cl_ulong ), NULL, &status ); clsparseStatus fileError = clsparseSCsrMatrixfromFile( &csrSMatrix, file_name.c_str( ), CLSE::control ); diff --git a/src/tests/test-blas2.cpp b/src/tests/test-blas2.cpp index abb895b..7ebc6b8 100644 --- a/src/tests/test-blas2.cpp +++ b/src/tests/test-blas2.cpp @@ -182,13 +182,7 @@ TYPED_TEST_CASE(Blas2, TYPES); TYPED_TEST(Blas2, csrmv_adaptive) { - if ( typeid(TypeParam) == typeid(cl_float) ) - this->test_csrmv(); - else - { -// std::cerr << "Adaptive version of csrmv might crash!" << std::endl; - this->test_csrmv(); - } + this->test_csrmv(); } TYPED_TEST(Blas2, csrmv_vector) @@ -221,7 +215,7 @@ TYPED_TEST(Blas2, csrmv_vector) ASSERT_EQ(clsparseSuccess, status); CSRE::csrSMatrix.rowBlocks = - ::clCreateBuffer( CLSE::context, CL_MEM_READ_ONLY, + ::clCreateBuffer( CLSE::context, CL_MEM_READ_WRITE, CSRE::csrSMatrix.rowBlockSize * sizeof( cl_ulong ), NULL, &cl_status ); From 774d357ea1d2385a3db0918c2c5c29c38d565242 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Mon, 20 Jul 2015 13:59:24 -0500 Subject: [PATCH 02/21] Reworked how CPU-side SpMV is performed in order to increase its fidelity. Using double precision intermediate calculations for the float answer. Using compensated summation to (in effect) perform quad precision intermediate calculations for the double answer. Calculating ULPs difference between CPU and GPU results. 
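For reference, the reference-summation scheme this patch introduces can be sketched on the host as below. This is only an illustrative, self-contained example (the tiny matrix, array names, and helper functions are invented for the sketch; the real tests operate on the uBLAS CSR fixtures and mapped OpenCL buffers shown in the diff): double-precision intermediates for the single-precision reference, Knuth's Two-Sum compensated accumulation for the double-precision reference, and boost::math::float_distance for the ULP comparison.

    #include <cstdio>
    #include <cstdint>
    #include <cmath>
    #include <vector>
    #include <boost/math/special_functions/next.hpp> // boost::math::float_distance

    // Knuth's Two-Sum: returns x + y and accumulates the rounding error into *err.
    template <typename T>
    T two_sum(T x, T y, T* err)
    {
        T s  = x + y;
        T bp = s - x;
        *err += (x - (s - bp)) + (y - bp);
        return s;
    }

    // Double path: compensated (SumK with K == 2) accumulation of one CSR row.
    double ref_row_double(const double* vals, const int* cols, int begin, int end,
                          const std::vector<double>& x, double alpha, double y0)
    {
        double sum = y0, err = 0.0;
        for (int i = begin; i < end; ++i)
            sum = two_sum(sum, alpha * vals[i] * x[cols[i]], &err);
        return sum + err;
    }

    // Float path: accumulate in double, round to float once at the end.
    float ref_row_float(const float* vals, const int* cols, int begin, int end,
                        const std::vector<float>& x, float alpha, float y0)
    {
        double sum = y0;
        for (int i = begin; i < end; ++i)
            sum += static_cast<double>(alpha) * vals[i] * x[cols[i]];
        return static_cast<float>(sum);
    }

    // ULP distance between a reference value and a device result.
    template <typename T>
    uint64_t ulp_diff(T ref, T result)
    {
        return static_cast<uint64_t>(std::fabs(boost::math::float_distance(ref, result)));
    }

    int main()
    {
        // 2x2 CSR matrix [[1 2], [0 3]] multiplied by x = [1, 1].
        const int    rowPtr[] = { 0, 2, 3 };
        const int    colIdx[] = { 0, 1, 1 };
        const double dvals[]  = { 1.0, 2.0, 3.0 };
        const float  fvals[]  = { 1.0f, 2.0f, 3.0f };
        std::vector<double> dx = { 1.0, 1.0 };
        std::vector<float>  fx = { 1.0f, 1.0f };

        double yd = ref_row_double(dvals, colIdx, rowPtr[0], rowPtr[1], dx, 1.0, 0.0);
        float  yf = ref_row_float (fvals, colIdx, rowPtr[0], rowPtr[1], fx, 1.0f, 0.0f);
        std::printf("row 0: double ref = %g, float ref = %g, ulps vs 3.0 = %llu\n",
                    yd, yf, static_cast<unsigned long long>(ulp_diff(yd, 3.0)));
        return 0;
    }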
--- src/tests/test-blas2.cpp | 116 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 6 deletions(-) diff --git a/src/tests/test-blas2.cpp b/src/tests/test-blas2.cpp index 7ebc6b8..3a9a4d2 100644 --- a/src/tests/test-blas2.cpp +++ b/src/tests/test-blas2.cpp @@ -10,6 +10,8 @@ #include #include +// ULP calculation +#include clsparseControl ClSparseEnvironment::control = NULL; cl_command_queue ClSparseEnvironment::queue = NULL; @@ -88,6 +90,27 @@ class Blas2 : public ::testing::Test } + // Knuth's Two-Sum algorithm, which allows us to add together two floating + // point numbers and exactly tranform the answer into a sum and a + // rounding error. + // Inputs: x and y, the two inputs to be aded together. + // In/Out: *sumk_err, which is incremented (by reference) -- holds the + // error value as a result of the 2sum calculation. + // Returns: The non-corrected sum of inputs x and y. + T two_sum(T x, T y, T *sumk_err) + { + // We use this 2Sum algorithm to perform a compensated summation, + // which can reduce the cummulative rounding errors in our SpMV + // summation. Our compensated sumation is based on the SumK algorithm + // (with K==2) from Ogita, Rump, and Oishi, "Accurate Sum and Dot + // Product" in SIAM J. on Scientific Computing 26(6) pp 1955-1988, + // Jun. 2005. + T sumk_s = x + y; + T bp = sumk_s - x; + (*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); + return sumk_s; + } + void test_csrmv() { clsparseStatus status; @@ -105,21 +128,55 @@ class Blas2 : public ::testing::Test int* cols = &CSRE::ublasSCsr.index2_data()[0]; for (int row = 0; row < CSRE::n_rows; row++) { + // Summation done at a higher precision to decrease + // summation errors from rounding. hY[row] *= hBeta; int row_end = rows[row+1]; + double temp_sum; + temp_sum = hY[row]; for (int i = rows[row]; i < rows[row+1]; i++) - hY[row] += hAlpha * vals[i] * hX[cols[i]]; + { + // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]]; + temp_sum += hAlpha * vals[i] * hX[cols[i]]; + } + hY[row] = temp_sum; } - T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values, CL_TRUE, CL_MAP_READ, 0, gY.num_values * sizeof(T), 0, nullptr, nullptr, &cl_status); ASSERT_EQ(CL_SUCCESS, cl_status); + uint64_t max_ulps = 0; + uint64_t min_ulps = UINT64_MAX; + uint64_t total_ulps = 0; for (int i = 0; i < hY.size(); i++) - ASSERT_NEAR(hY[i], host_result[i], fabs(hY[i]*1e-3)); + { + int intDiff = (int)boost::math::float_distance(hY[i], host_result[i]); + intDiff = abs(intDiff); + total_ulps += intDiff; + if (max_ulps < intDiff) + max_ulps = intDiff; + if (min_ulps > intDiff) + min_ulps = intDiff; + // Debug printouts. 
+ //printf("Row %d Float Ulps: %d\n", i, intDiff); + //printf("\tFloat hY[%d] = %.*e (0x%08" PRIx32 "), ", i, 9, hY[i], *(uint32_t *)&hY[i]); + //printf("host_result[%d] = %.*e (0x%08" PRIx32 ")\n", i, 9, host_result[i], *(uint32_t *)&host_result[i]); + } + printf("Float Min ulps: %" PRIu64 "\n", min_ulps); + printf("Float Max ulps: %" PRIu64 "\n", max_ulps); + printf("Float Total ulps: %" PRIu64 "\n", total_ulps); + printf("Float Average ulps: %f (Size: %lu)\n", (double)total_ulps/(double)hY.size(), hY.size()); + + for (int i = 0; i < hY.size(); i++) + { + double compare_val = fabs(hY[i]*1e-5); + if (compare_val < 10*FLT_EPSILON) + compare_val = 10*FLT_EPSILON; + ASSERT_NEAR(hY[i], host_result[i], compare_val); + } cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values, host_result, 0, nullptr, nullptr); @@ -138,10 +195,22 @@ class Blas2 : public ::testing::Test int* cols = &CSRE::ublasDCsr.index2_data()[0]; for (int row = 0; row < CSRE::n_rows; row++) { + // Summation done using a compensated summation to decrease + // summation errors from rounding. This allows us to get + // smaller errors without requiring quad precision support. + // This method is like performing summation at quad precision and + // casting down to double in the end. hY[row] *= hBeta; int row_end = rows[row+1]; + double temp_sum; + temp_sum = hY[row]; + T sumk_err = 0.; for (int i = rows[row]; i < rows[row+1]; i++) - hY[row] += hAlpha * vals[i] * hX[cols[i]]; + { + // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]]; + temp_sum = two_sum(temp_sum, hAlpha*vals[i]*hX[cols[i]], &sumk_err); + } + hY[row] = temp_sum + sumk_err; } T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values, @@ -150,14 +219,49 @@ class Blas2 : public ::testing::Test 0, nullptr, nullptr, &cl_status); ASSERT_EQ(CL_SUCCESS, cl_status); + uint64_t max_ulps = 0; + uint64_t min_ulps = ULLONG_MAX; + uint64_t total_ulps = 0; for (int i = 0; i < hY.size(); i++) - ASSERT_NEAR(hY[i], host_result[i], fabs(hY[i]*1e-10)); + { + long long int intDiff = (int)boost::math::float_distance(hY[i], host_result[i]); + intDiff = abs(intDiff); + total_ulps += intDiff; + if (max_ulps < intDiff) + max_ulps = intDiff; + if (min_ulps > intDiff) + min_ulps = intDiff; + // Debug printouts. + //printf("Row %d Double Ulps: %lld\n", i, intDiff); + //printf("\tDouble hY[%d] = %.*e (0x%016" PRIx64 "), ", i, 17, hY[i], *(uint64_t *)&hY[i]); + //printf("host_result[%d] = %.*e (0x%016" PRIx64 ")\n", i, 17, host_result[i], *(uint64_t *)&host_result[i]); + } + printf("Double Min ulps: %" PRIu64 "\n", min_ulps); + printf("Double Max ulps: %" PRIu64 "\n", max_ulps); + printf("Double Total ulps: %" PRIu64 "\n", total_ulps); + printf("Double Average ulps: %f (Size: %lu)\n", (double)total_ulps/(double)hY.size(), hY.size()); + + for (int i = 0; i < hY.size(); i++) + { + double compare_val = fabs(hY[i]*1e-14); + if (compare_val < 10*DBL_EPSILON) + compare_val = 10*DBL_EPSILON; + ASSERT_NEAR(hY[i], host_result[i], compare_val); + } cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values, host_result, 0, nullptr, nullptr); ASSERT_EQ(CL_SUCCESS, cl_status); } - + // Reset output buffer for next test. 
+ ::clReleaseMemObject(gY.values); + clsparseInitVector(&gY); + gY.values = clCreateBuffer(CLSE::context, + CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, + hY.size() * sizeof(T), hY.data().begin(), + &cl_status); + gY.num_values = hY.size(); + ASSERT_EQ(CL_SUCCESS, cl_status); } uBLAS::vector hX; From 3ad55273e9c762a669f0198a12dbf49690b223c6 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Mon, 20 Jul 2015 14:08:26 -0500 Subject: [PATCH 03/21] Added compensated summation to csrmv_general. Assuming there is no floating-point underflow, this calculates answers in twice the precision before rounding to the native precision. In other words, like calculating all of the intermediate results in double before finally rounding to float. This results in many matrices being bitwise identical to what is calculated on the CPU. --- src/library/kernels/csrmv_general.cl | 149 +++++++++++++++++++++++---- 1 file changed, 129 insertions(+), 20 deletions(-) diff --git a/src/library/kernels/csrmv_general.cl b/src/library/kernels/csrmv_general.cl index 3bde939..88944f9 100644 --- a/src/library/kernels/csrmv_general.cl +++ b/src/library/kernels/csrmv_general.cl @@ -35,6 +35,107 @@ R"( # error SUBWAVE_SIZE is not a power of two! #endif +#define EXTENDED_PRECISION + +// Knuth's Two-Sum algorithm, which allows us to add together two floating +// point numbers and exactly tranform the answer into a sum and a +// rounding error. +// Inputs: x and y, the two inputs to be aded together. +// In/Out: *sumk_err, which is incremented (by reference) -- holds the +// error value as a result of the 2sum calculation. +// Returns: The non-corrected sum of inputs x and y. +VALUE_TYPE two_sum( VALUE_TYPE x, + VALUE_TYPE y, + VALUE_TYPE *sumk_err) +{ + VALUE_TYPE sumk_s = x + y; +#ifdef EXTENDED_PRECISION + /* We use this 2Sum algorithm to perform a compensated summation, + which can reduce the cummulative rounding errors in our SpMV summation. + Our compensated sumation is based on the SumK algorithm (with K==2) from + Ogita, Rump, and Oishi, "Accurate Sum and Dot Product" in + SIAM J. on Scientific Computing 26(6) pp 1955-1988, Jun. 2005. + + 2Sum can be done in 6 FLOPs without a branch. However, calculating + double precision is slower than single precision on every existing GPU. + As such, replacing 2Sum with Fast2Sum when using DPFP results in slightly + better performance. This is especially true on non-workstation GPUs with + low DPFP rates. Fast2Sum is faster even though we must ensure that + |a| > |b|. Branch divergence is better than the DPFP slowdown. + Thus, for DPFP, our compensated summation algorithm is actually described + by both Pichat and Neumaier in "Correction d'une somme en arithmetique + a virgule flottante" (J. Numerische Mathematik 19(5) pp. 400-406, 1972) + and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher + Summen (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51, + 1974), respectively. */ + VALUE_TYPE ap, bp; + if (fabs(x) > fabs(y)) { + ap = x; + bp = y; + } + else { + ap = y; + bp = x; + } + (*sumk_err) += (bp - (sumk_s - ap)); + // Original 6 FLOP 2Sum algorithm. + //VALUE_TYPE bp = sumk_s - x; + //(*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); +#endif + return sumk_s; +} + +// A method of doing the final reduction without having to copy and paste +// it a bunch of times. +// The EXTENDED_PRECISION section is done as part of the PSum2 addition, +// where we take temporary sums and errors for multiple threads and combine +// them together using the same 2Sum method. 
+// Inputs: cur_sum: the input from which our sum starts +// err: the current running cascade error for this final summation +// partial: the local memory which holds the values to sum +// (we eventually use it to pass down temp. err vals as well) +// lid: local ID of the work item calling this function. +// thread_lane: The lane within this SUBWAVE for reduction. +// round: This parallel summation method operates in multiple rounds +// to do a parallel reduction. See the blow comment for usage. +VALUE_TYPE sum2_reduce( VALUE_TYPE cur_sum, + VALUE_TYPE *err, + volatile __local VALUE_TYPE *partial, + size_t lid, + int thread_lane, + int round) +{ + if (SUBWAVE_SIZE > round) + { +#ifdef EXTENDED_PRECISION + if (thread_lane < round) + cur_sum = two_sum(cur_sum, partial[lid + round], err); + + // We reuse the LDS entries to move the error values down into lower + // threads. This saves LDS space, allowing higher occupancy, but requires + // more barriers, which can reduce performance. + barrier(CLK_LOCAL_MEM_FENCE); + // Have all of those upper threads pass their temporary errors + // into a location that the lower threads can read. + if (thread_lane >= round) + partial[lid] = *err; + barrier(CLK_LOCAL_MEM_FENCE); + if (thread_lane < round) { // Add those errors in. + *err += partial[lid + round]; + partial[lid] = cur_sum; + } +#else + // This is the more traditional reduction algorithm. It is up to + // 25% faster (about 10% on average -- potentially worse on devices + // with low double-precision calculation rates), but can result in + // numerical inaccuracies, especially in single precision. + cur_sum += partial[lid + round]; + partial[lid] = cur_sum; +#endif + } + return cur_sum; +} + // Uses macro constants: // WAVE_SIZE - "warp size", typically 64 (AMD) or 32 (NV) // WG_SIZE - workgroup ("block") size, 1D representation assumed @@ -75,39 +176,47 @@ void csrmv_general ( const INDEX_TYPE num_rows, const int row_end = row_offset[row+1]; VALUE_TYPE sum = (VALUE_TYPE) 0; + VALUE_TYPE sumk_e = 0.; + // It is about 5% faster to always multiply by alpha, rather than to + // check whether alpha is 0, 1, or other and do different code paths. for(int j = row_start + thread_lane; j < row_end; j += SUBWAVE_SIZE) - { - if (_alpha == 1) - sum = fma(val[j], x[off_x + col[j]], sum); - else if (_alpha == 0) - sum = 0; - else - sum = fma(_alpha * val[j], x[off_x + col[j]], sum);//sum += val[j] * x[col[j]]; - } + sum = two_sum(sum, (_alpha * val[j] * x[off_x + col[j]]), &sumk_e); + VALUE_TYPE new_error = 0.; + sum = two_sum(sum, sumk_e, &new_error); - //parllel reduction in shared memory + // Parallel reduction in shared memory. sdata[local_id] = sum; barrier( CLK_LOCAL_MEM_FENCE ); - if (SUBWAVE_SIZE > 32) sdata[local_id] = sum += sdata[local_id + 32]; + + // This compensated summation reduces cummulative rounding errors, + // which can become a problem on GPUs because our reduction order is + // different than what would be used on a CPU. + // It is based on the PSumK algorithm (with K==2) from + // Yamanaka, Ogita, Rump, and Oishi, "A Parallel Algorithm of + // Accurate Dot Product," in the Journal of Parallel Computing, + // 34(6-8), pp. 392-410, Jul. 2008. 
+ sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, 32); + barrier( CLK_LOCAL_MEM_FENCE ); + sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, 16); barrier( CLK_LOCAL_MEM_FENCE ); - if (SUBWAVE_SIZE > 16) sdata[local_id] = sum += sdata[local_id + 16]; + sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, 8); barrier( CLK_LOCAL_MEM_FENCE ); - if (SUBWAVE_SIZE > 8) sdata[local_id] = sum += sdata[local_id + 8]; + sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, 4); barrier( CLK_LOCAL_MEM_FENCE ); - if (SUBWAVE_SIZE > 4) sdata[local_id] = sum += sdata[local_id + 4]; + sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, 2); barrier( CLK_LOCAL_MEM_FENCE ); - if (SUBWAVE_SIZE > 2) sdata[local_id] = sum += sdata[local_id + 2]; + sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, 1); barrier( CLK_LOCAL_MEM_FENCE ); - if (SUBWAVE_SIZE > 1) sum += sdata[local_id + 1]; if (thread_lane == 0) { - if (_beta == 1) - y[off_y + row] = sum + y[off_y + row]; - else if (_beta == 0) - y[off_y + row] = sum; + if (_beta == 0) + y[off_y + row] = sum + new_error; else - y[off_y + row] = sum + _beta * y[off_y + row]; + { + sum = two_sum(sum, _beta * y[off_y + row], &new_error); + y[off_y + row] = sum + new_error; + } } } } From 7670486224b99f2f8dbb8d37edb56bdd46c5d4f0 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Mon, 20 Jul 2015 16:10:56 -0500 Subject: [PATCH 04/21] Adding compensated summation support to CSR-Adaptive. This requires expanding the rowBlocks buffer size by 2x in order to have a place to reduce error values between workgroups in the CSR-LongRows case. --- src/library/blas2/clsparse-csrmv.hpp | 20 +- src/library/blas2/csrmv-adaptive.hpp | 10 +- src/library/internal/computeRowBlocks.hpp | 6 +- src/library/kernels/csrmv_adaptive.cl | 306 +++++++++++++++++---- src/library/transform/clsparse-coo2csr.cpp | 4 +- 5 files changed, 269 insertions(+), 77 deletions(-) diff --git a/src/library/blas2/clsparse-csrmv.hpp b/src/library/blas2/clsparse-csrmv.hpp index ac1bbed..dbf8d93 100644 --- a/src/library/blas2/clsparse-csrmv.hpp +++ b/src/library/blas2/clsparse-csrmv.hpp @@ -29,15 +29,11 @@ csrmv (const clsparseScalarPrivate *pAlpha, return clsparseStructInvalid; } - // We have problems with failing test cases with csrmv_adaptive on double precision - // fall back to csrmv_vector - if( typeid( T ) == typeid( cl_double ) ) - { - return csrmv_vector( pAlpha, pCsrMatx, pX, pBeta, pY, control ); - } + // Use this for csrmv_general instead of adaptive. + //return csrmv_vector( pAlpha, pCsrMatx, pX, pBeta, pY, control ); - // Call adaptive CSR kernels - return csrmv_adaptive( pAlpha, pCsrMatx, pX, pBeta, pY, control ); + // Call adaptive CSR kernels + return csrmv_adaptive( pAlpha, pCsrMatx, pX, pBeta, pY, control ); } } @@ -67,12 +63,8 @@ csrmv (const clsparse::array_base& pAlpha, return clsparseStructInvalid; } - // We have problems with failing test cases with csrmv_adaptive on double precision - // fall back to csrmv_vector - if( typeid( T ) == typeid( cl_double ) ) - { - return csrmv_vector( pAlpha, pCsrMatx, pX, pBeta, pY, control ); - } + // Use this for csrmv_general instead of adaptive. 
+ //return csrmv_vector( pAlpha, pCsrMatx, pX, pBeta, pY, control ); // Call adaptive CSR kernels return csrmv_adaptive( pAlpha, pCsrMatx, pX, pBeta, pY, control ); diff --git a/src/library/blas2/csrmv-adaptive.hpp b/src/library/blas2/csrmv-adaptive.hpp index 10a00a6..a2b9a3d 100644 --- a/src/library/blas2/csrmv-adaptive.hpp +++ b/src/library/blas2/csrmv-adaptive.hpp @@ -53,7 +53,10 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha, // if NVIDIA is used it does not allow to run the group size // which is not a multiplication of group_size. Don't know if that // have an impact on performance - cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size; + // Setting global work size to half the row block size because we are only + // using half the row blocks buffer for actual work. + // The other half is used for the extended precision reduction. + cl_uint global_work_size = ( (pCsrMatx->rowBlockSize/2) - 1 ) * group_size; cl::NDRange local( group_size ); cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] ); @@ -112,7 +115,10 @@ csrmv_adaptive( const clsparse::array_base& pAlpha, // if NVIDIA is used it does not allow to run the group size // which is not a multiplication of group_size. Don't know if that // have an impact on performance - cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size; + // Setting global work size to half the row block size because we are only + // using half the row blocks buffer for actual work. + // The other half is used for the extended precision reduction. + cl_uint global_work_size = ( (pCsrMatx->rowBlockSize/2) - 1 ) * group_size; cl::NDRange local( group_size ); cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] ); diff --git a/src/library/internal/computeRowBlocks.hpp b/src/library/internal/computeRowBlocks.hpp index 387ca7f..e14aed2 100644 --- a/src/library/internal/computeRowBlocks.hpp +++ b/src/library/internal/computeRowBlocks.hpp @@ -98,10 +98,12 @@ void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* } size_t dist = std::distance( rowBlocksBase, rowBlocks ); - assert( dist <= rowBlockSize ); + assert( (2 * dist) <= rowBlockSize ); // Update the size of rowBlocks to reflect the actual amount of memory used - rowBlockSize = dist; + // We're multiplying the size by two because the extended precision form of + // CSR-Adaptive requires more space for the final global reduction. 
+ rowBlockSize = 2 * dist; } #endif diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index 350ba32..26fb197 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -50,6 +50,8 @@ R"( #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable +#define EXTENDED_PRECISION + int lowerPowerOf2(int num) { num --; @@ -67,7 +69,7 @@ int lowerPowerOf2(int num) return num; } -void atomic_add_float (global FPTYPE *ptr, FPTYPE temp) +FPTYPE atomic_add_float_extended (global FPTYPE *ptr, FPTYPE temp, FPTYPE *old_sum) { #ifdef DOUBLE unsigned long newVal; @@ -77,7 +79,9 @@ void atomic_add_float (global FPTYPE *ptr, FPTYPE temp) prevVal = as_ulong(*ptr); newVal = as_ulong(temp + *ptr); } while (atom_cmpxchg((global unsigned long *)ptr, prevVal, newVal) != prevVal); - + if (old_sum != 0) + *old_sum = as_double(prevVal); + return as_double(newVal); #else unsigned int newVal; unsigned int prevVal; @@ -86,7 +90,140 @@ void atomic_add_float (global FPTYPE *ptr, FPTYPE temp) prevVal = as_uint(*ptr); newVal = as_uint(temp + *ptr); } while (atomic_cmpxchg((global unsigned int *)ptr, prevVal, newVal) != prevVal); + if (old_sum != 0) + *old_sum = as_float(prevVal); + return as_float(newVal); +#endif +} + +void atomic_add_float (global void *ptr, FPTYPE temp) +{ + atomic_add_float_extended(ptr, temp, 0); +} + + +// Knuth's Two-Sum algorithm, which allows us to add together two floating +// point numbers and exactly tranform the answer into a sum and a +// rounding error. +// Inputs: x and y, the two inputs to be aded together. +// In/Out: *sumk_err, which is incremented (by reference) -- holds the +// error value as a result of the 2sum calculation. +// Returns: The non-corrected sum of inputs x and y. +FPTYPE two_sum( FPTYPE x, + FPTYPE y, + FPTYPE *sumk_err) +{ + FPTYPE sumk_s = x + y; +#ifdef EXTENDED_PRECISION + /* We use this 2Sum algorithm to perform a compensated summation, + which can reduce the cummulative rounding errors in our SpMV summation. + Our compensated sumation is based on the SumK algorithm (with K==2) from + Ogita, Rump, and Oishi, "Accurate Sum and Dot Product" in + SIAM J. on Scientific Computing 26(6) pp 1955-1988, Jun. 2005. + + 2Sum can be done in 6 FLOPs without a branch. However, calculating + double precision is slower than single precision on every existing GPU. + As such, replacing 2Sum with Fast2Sum when using DPFP results in better + performance (~5% higher total). This is true even though we must ensure + that |a| > |b|. Branch divergence is better than the DPFP slowdown. + Thus, for DPFP, our compensated summation algorithm is actually described + by both Pichat and Neumaier in "Correction d'une somme en arithmetique + a virgule flottante" (J. Numerische Mathematik 19(5) pp. 400-406, 1972) + and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher + Summen (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51, + 1974), respectively. + + Single precision is the same performance using either algorithm, so we + use only Fast2Sum */ + FPTYPE swap; + if (fabs(x) < fabs(y)) { + swap = x; + x = y; + y = swap; + } + (*sumk_err) += (y - (sumk_s - x)); + // Original 6 FLOP 2Sum algorithm. 
+ //FPTYPE bp = sumk_s - x; + //(*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); #endif + return sumk_s; +} + +FPTYPE atomic_two_sum_float(global FPTYPE *x_ptr, + FPTYPE y, + FPTYPE *sumk_err) +{ + // Have to wait until the return from the atomic op to know what X was. + FPTYPE sumk_s; +#ifdef EXTENDED_PRECISION + FPTYPE x, swap; + sumk_s = atomic_add_float_extended(x_ptr, y, &x); + if (fabs(x) < fabs(y)) { + swap = x; + x = y; + y = swap; + } + (*sumk_err) += (y - (sumk_s - x)); +#else + atomic_add_float(x_ptr, y); +#endif + return sumk_s; +} + +// A method of doing the final reduction in CSR-Vector without having to copy +// and paste it a bunch of times. +// The EXTENDED_PRECISION section is done as part of a compensated summation +// meant to reduce cummulative rounding errors. This can become a problem on +// GPUs because the reduction order is different than what would be used on a +// CPU. It is based on the PSumK algorithm (with K==2) from Yamanaka, Ogita, +// Rump, and Oishi, "A Parallel Algorithm of Accurate Dot Product," in +// J. Parallel Computing, 34(6-8), pp. 392-410, Jul. 2008. */ +// Inputs: cur_sum: the input from which our sum starts +// err: the current running cascade error for this final summation +// partial: the local memory which holds the values to sum +// (we eventually use it to pass down temp. err vals as well) +// lid: local ID of the work item calling this function. +// integers: This parallel summation method is meant to take values +// from four different points. See the blow comment for usage. +FPTYPE sum2_reduce( FPTYPE cur_sum, + FPTYPE *err, + __local FPTYPE *partial, + size_t lid, + int first, + int second, + int third, + int last) +{ + // A subset of the threads in this workgroup add three sets + // of values together using the 2Sum method. + // For instance, threads 0-63 would each load four values + // from the upper 192 locations. + // Example: Thread 0 loads 0, 64, 128, and 192. + if (lid < first) + { + cur_sum = two_sum(cur_sum, partial[lid + first], err); + cur_sum = two_sum(cur_sum, partial[lid + second], err); + cur_sum = two_sum(cur_sum, partial[lid + third], err); + } +#ifdef EXTENDED_PRECISION + // We reuse the LDS entries to move the error values down into lower + // threads. This saves LDS space, allowing higher occupancy, but requires + // more barriers, which can reduce performance. + barrier(CLK_LOCAL_MEM_FENCE); + // Have all of those upper threads pass their temporary errors + // into a location that the lower threads can read. + if (lid >= first && lid < last) + partial[lid] = *err; + barrier(CLK_LOCAL_MEM_FENCE); + if (lid < first) // Add those errors in. + { + cur_sum = two_sum(cur_sum, partial[lid + first], err); + cur_sum = two_sum(cur_sum, partial[lid + second], err); + cur_sum = two_sum(cur_sum, partial[lid + third], err); + partial[lid] = cur_sum; + } +#endif + return cur_sum; } __kernel void @@ -162,9 +299,7 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, if (gid != (get_num_groups(0) - 1)) { for(int i = 0; i < BLOCKSIZE; i += 256) - { - partialSums[lid + i] = vals[col + i] * vec[cols[col + i]]; - } + partialSums[lid + i] = alpha * vals[col + i] * vec[cols[col + i]]; } else { @@ -177,9 +312,7 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // to be launched, and this loop can't be unrolled. 
int max_to_load = rowPtrs[stop_row] - rowPtrs[row]; for(int i = 0; i < max_to_load; i += 256) - { - partialSums[lid + i] = vals[col + i] * vec[cols[col + i]]; - } + partialSums[lid + i] = alpha * vals[col + i] * vec[cols[col + i]]; } barrier(CLK_LOCAL_MEM_FENCE); @@ -191,6 +324,7 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // After that, a single thread from each of those teams linearly walks through // the local memory values for that row and reduces to the final output value. FPTYPE temp = 0.; + FPTYPE sumk_e = 0.; // {numThreadsForRed} adjacent threads all work on the same row, so their // start and end values are the same. @@ -206,18 +340,17 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, { // only works when numThreadsForRed is a power of 2 for(int i = 0; i < workForEachThread; i++) - { - temp += partialSums[local_first_val + i*numThreadsForRed + threadInBlock]; - } + temp = two_sum(temp, partialSums[local_first_val + i*numThreadsForRed + threadInBlock], &sumk_e); // The last few values (the remainder of this row) also need to be aded in. int local_cur_val = local_first_val + numThreadsForRed*workForEachThread; if(threadInBlock < local_last_val - local_cur_val) - { - temp += partialSums[local_cur_val + threadInBlock]; - } + temp = two_sum(temp, partialSums[local_cur_val + threadInBlock], &sumk_e); } barrier(CLK_LOCAL_MEM_FENCE); + + FPTYPE new_error = 0.; + temp = two_sum(temp, sumk_e, &new_error); partialSums[lid] = temp; // Step one of this two-stage reduction is done. Now each row has {numThreadsForRed} @@ -225,20 +358,34 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // each of the adjacent-groups and uses it to walk through those values and reduce // them into a final output value for the row. temp = 0.; + sumk_e = 0.; + barrier(CLK_LOCAL_MEM_FENCE); if(lid < (stop_row - row)) { #pragma unroll 4 - for(int i = 0; i < numThreadsForRed; i++) - { - temp += partialSums[lid*numThreadsForRed + i]; - } - // All of our write-outs check to see if the output vector should first be zeroed. - // If so, just do a write rather than a read-write. Measured to be a slight (~5%) - // performance improvement. - if (beta != 0.) - out[local_row] = (beta * out[local_row]) + (alpha * temp); - else - out[local_row] = (alpha * temp); + for(int i = 0; i < numThreadsForRed; i++) + temp = two_sum(temp, partialSums[lid*numThreadsForRed + i], &sumk_e); + } +#ifdef EXTENDED_PRECISION + // Values in partialSums[] are finished, now let's add in all of the local error values. + barrier(CLK_LOCAL_MEM_FENCE); + partialSums[lid] = new_error; + barrier(CLK_LOCAL_MEM_FENCE); +#endif + if(lid < (stop_row - row)) + { +#ifdef EXTENDED_PRECISION + // Sum up the local error values which are now in partialSums[] + for(int i = 0; i < numThreadsForRed; i++) + temp = two_sum(temp, partialSums[lid*numThreadsForRed + i], &sumk_e); +#endif + + // All of our write-outs check to see if the output vector should first be zeroed. + // If so, just do a write rather than a read-write. Measured to be a slight (~5%) + // performance improvement. + if (beta != 0.) 
+ temp = two_sum(beta * out[local_row], temp, &sumk_e); + out[local_row] = temp + sumk_e; } } else @@ -252,18 +399,16 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, { int local_first_val = (rowPtrs[local_row] - rowPtrs[row]); int local_last_val = rowPtrs[local_row + 1] - rowPtrs[row]; - FPTYPE temp = 0; + FPTYPE temp = 0.; + FPTYPE sumk_e = 0.; for (int local_cur_val = local_first_val; local_cur_val < local_last_val; local_cur_val++) - { - temp += partialSums[local_cur_val]; - } + temp = two_sum(temp, partialSums[local_cur_val], &sumk_e); // After you've done the reduction into the temp register, // put that into the output for each row. if (beta != 0.) - out[local_row] = (beta * out[local_row]) + (alpha * temp); - else - out[local_row] = (alpha * temp); + temp = two_sum(beta * out[local_row], temp, &sumk_e); + out[local_row] = temp + sumk_e; local_row += get_local_size(0); } } @@ -304,6 +449,9 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, else out[row+1] = 0.; } +#ifdef EXTENDED_PRECISION + rowBlocks[get_num_groups(0) + gid + 1] = 0UL; +#endif atom_xor(&rowBlocks[first_wg_in_row], (1UL << WGBITS)); // Release other workgroups. } // For every other workgroup, bit 24 holds the value they wait on. @@ -345,32 +493,74 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // Then dump the partially reduced answers into the LDS for inter-work-item reduction. // Using a long induction variable to make sure unsigned int overflow doesn't break things. FPTYPE mySum = 0.; - for (long j = vecStart + lid; j < vecEnd; j+=WGSIZE) - { - unsigned int col = cols[(unsigned int)j]; - mySum += vals[(unsigned int)j] * vec[col]; - } - partialSums[lid] = mySum; - barrier(CLK_LOCAL_MEM_FENCE); - - // Reduce partial sums - // Needs to be modified if there is a change in the # of work-items per workgroup. - if(lid < 64) partialSums[lid] += partialSums[lid + 64] + partialSums[lid + 128] + partialSums[lid + 192]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 16) partialSums[lid] += partialSums[lid + 16] + partialSums[lid + 32] + partialSums[lid + 48]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 4) partialSums[lid] += partialSums[lid + 4] + partialSums[lid + 8] + partialSums[lid + 12]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 1) partialSums[lid] += partialSums[lid + 1] + partialSums[lid + 2] + partialSums[lid + 3]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid == 0) - atomic_add_float(&out[myRow], (alpha * partialSums[0])); - - // CSR-VECTOR on workgroups for two rows which are inefficient for CSR-Stream - myRow++; - iteration++; - } + FPTYPE sumk_e = 0.; + for (long j = vecStart + lid; j < vecEnd; j+=WGSIZE) + { + unsigned int col = cols[(unsigned int)j]; + mySum = two_sum(mySum, alpha * vals[(unsigned int)j] * vec[col], &sumk_e); + } + + FPTYPE new_error = 0.; + mySum = two_sum(mySum, sumk_e, &new_error); + partialSums[lid] = mySum; + barrier(CLK_LOCAL_MEM_FENCE); + + // Reduce partial sums + // These numbers need to change if the # of work-items/WG changes. 
+ mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 64, 128, 192, 256); + barrier(CLK_LOCAL_MEM_FENCE); + mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 16, 32, 48, 64); + barrier(CLK_LOCAL_MEM_FENCE); + mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 4, 8, 12, 16); + barrier(CLK_LOCAL_MEM_FENCE); + mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 1, 2, 3, 4); + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid == 0) + { + atomic_two_sum_float(&out[myRow], mySum, &new_error); + +#ifdef EXTENDED_PRECISION + // The end of the rowBlocks buffer is used to hold errors. + atomic_add_float(&(rowBlocks[get_num_groups(0)+first_wg_in_row+1]), new_error); + // Coordinate across all of the workgroups in this coop in order to have + // the last workgroup fix up the error values. + // If this workgroup's row is different than the next workgroup's row + // then this is the last workgroup -- it's this workgroup's job to add + // the error values into the final sum. + if (row != stop_row) + { + // Go forward once your ID is the same as the low order bits of the + // coop's first workgroup. That value will be used to store the number + // of threads that have completed so far. Once all the previous threads + // are done, it's time to send out the errors! + while((atom_max(&rowBlocks[first_wg_in_row], 0UL) & ((1 << WGBITS) - 1)) != wg); + +#ifdef DOUBLE + new_error = as_double(rowBlocks[get_num_groups(0)+first_wg_in_row+1]); +#else + new_error = as_float((int)rowBlocks[get_num_groups(0)+first_wg_in_row+1]); +#endif + atomic_add_float(&out[myRow], new_error); + rowBlocks[get_num_groups(0)+first_wg_in_row+1] = 0UL; + + // Reset the rowBlocks low order bits for next time. + rowBlocks[first_wg_in_row] = rowBlocks[gid] - wg; // No need for atomics here + } + else + { + // Otherwise, increment the low order bits of the first thread in this + // coop. We're using this to tell how many workgroups in a coop are done. + // Do this with an atomic, since other threads may be doing this too. + unsigned long blah = atom_inc(&rowBlocks[first_wg_in_row]); + } +#endif + } + + // CSR-VECTOR on workgroups for two rows which are inefficient for CSR-Stream + myRow++; + iteration++; + } } } )" diff --git a/src/library/transform/clsparse-coo2csr.cpp b/src/library/transform/clsparse-coo2csr.cpp index 48f2d5b..dccef0c 100644 --- a/src/library/transform/clsparse-coo2csr.cpp +++ b/src/library/transform/clsparse-coo2csr.cpp @@ -61,7 +61,9 @@ clsparseCsrMetaSize( clsparseCsrMatrix* csrMatx, clsparseControl control ) // This allocates up front the maximum size of rowBlocks at start; likely not all the memory is used but // this is the fastest - pCsrMatx->rowBlockSize = 3 * ( pCsrMatx->num_nonzeros / BLKSIZE ) + 2; + // The formula is 3 * (NNZ / block size) + 2, but we double this because CSR-Adaptive uses the + // second half of the rowBlocks buffer for global reductions. + pCsrMatx->rowBlockSize = 6 * ( pCsrMatx->num_nonzeros / BLKSIZE ) + 4; return clsparseSuccess; } From 01a68bc3141242aaf606e75ce27b660fa9e292e7 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Fri, 31 Jul 2015 15:16:39 -0500 Subject: [PATCH 05/21] Minor code cleanup modifications. Moved CSR-Adaptive configuration parameters out of the kernel. Made the 2sum algorithm in csrmv_general slightly faster. 
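For context on the "2sum" change: the kernels replace the branch-free 2Sum with a Fast2Sum that orders the operands by magnitude first, trading a compare/swap for fewer double-precision FLOPs. A small stand-alone sketch of the two variants follows (plain C++, names chosen for the example only; the kernel works on FPTYPE rather than double):

    #include <cmath>
    #include <cstdio>

    // Branch-free 2Sum (6 FLOPs): exact error term for any x, y.
    double two_sum(double x, double y, double* err)
    {
        double s  = x + y;
        double bp = s - x;
        *err += (x - (s - bp)) + (y - bp);
        return s;
    }

    // Fast2Sum (3 FLOPs plus a compare/swap): exact only when |x| >= |y|,
    // so the magnitudes are ordered first. The kernels prefer this form for
    // double precision because the branch is cheaper than the extra DPFP FLOPs.
    double fast_two_sum(double x, double y, double* err)
    {
        if (std::fabs(x) < std::fabs(y)) { double t = x; x = y; y = t; }
        double s = x + y;
        *err += y - (s - x);
        return s;
    }

    int main()
    {
        double e1 = 0.0, e2 = 0.0;
        double s1 = two_sum(1.0, 1e-20, &e1);
        double s2 = fast_two_sum(1.0, 1e-20, &e2);
        // Both recover the rounding error exactly: sum + err == x + y.
        std::printf("2Sum:     sum=%.17g err=%.3g\n", s1, e1);
        std::printf("Fast2Sum: sum=%.17g err=%.3g\n", s2, e2);
        return 0;
    }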
--- src/library/blas2/csrmv-adaptive.hpp | 11 ++--- src/library/kernels/csrmv_adaptive.cl | 62 ++++++++++++--------------- src/library/kernels/csrmv_general.cl | 52 +++++++++++----------- 3 files changed, 59 insertions(+), 66 deletions(-) diff --git a/src/library/blas2/csrmv-adaptive.hpp b/src/library/blas2/csrmv-adaptive.hpp index a2b9a3d..4d2d4f9 100644 --- a/src/library/blas2/csrmv-adaptive.hpp +++ b/src/library/blas2/csrmv-adaptive.hpp @@ -24,10 +24,9 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha, std::string params = std::string( ) + " -DROWBITS=" + std::to_string( ROW_BITS ) + " -DWGBITS=" + std::to_string( WG_BITS ) - + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ); -#ifdef DOUBLE - buildFlags += " -DDOUBLE"; -#endif + + " -DWGSIZE=" + std::to_string( group_size ) + + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ) + + " -DEXTENDED_PRECISION"; if(typeid(T) == typeid(cl_double)) { @@ -90,7 +89,9 @@ csrmv_adaptive( const clsparse::array_base& pAlpha, std::string params = std::string( ) + " -DROWBITS=" + std::to_string( ROW_BITS ) + " -DWGBITS=" + std::to_string( WG_BITS ) - + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ); + + " -DWGSIZE=" + std::to_string( group_size ) + + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ) + + " -DEXTENDED_PRECISION"; if(typeid(T) == typeid(cl_double)) { diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index 609f24f..fae11bd 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -1,6 +1,4 @@ R"( -#define WGSIZE 256 - #ifdef DOUBLE #define FPTYPE double @@ -18,8 +16,6 @@ R"( #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable -#define EXTENDED_PRECISION - int lowerPowerOf2(int num) { num --; @@ -37,7 +33,7 @@ int lowerPowerOf2(int num) return num; } -FPTYPE atomic_add_float_extended (global FPTYPE *ptr, FPTYPE temp, FPTYPE *old_sum) +FPTYPE atomic_add_float_extended( global FPTYPE *ptr, FPTYPE temp, FPTYPE *old_sum ) { #ifdef DOUBLE unsigned long newVal; @@ -64,12 +60,11 @@ FPTYPE atomic_add_float_extended (global FPTYPE *ptr, FPTYPE temp, FPTYPE *old_s #endif } -void atomic_add_float (global void *ptr, FPTYPE temp) +void atomic_add_float( global void *ptr, FPTYPE temp ) { atomic_add_float_extended(ptr, temp, 0); } - // Knuth's Two-Sum algorithm, which allows us to add together two floating // point numbers and exactly tranform the answer into a sum and a // rounding error. @@ -79,32 +74,30 @@ void atomic_add_float (global void *ptr, FPTYPE temp) // Returns: The non-corrected sum of inputs x and y. FPTYPE two_sum( FPTYPE x, FPTYPE y, - FPTYPE *sumk_err) + FPTYPE *sumk_err ) { FPTYPE sumk_s = x + y; #ifdef EXTENDED_PRECISION - /* We use this 2Sum algorithm to perform a compensated summation, - which can reduce the cummulative rounding errors in our SpMV summation. - Our compensated sumation is based on the SumK algorithm (with K==2) from - Ogita, Rump, and Oishi, "Accurate Sum and Dot Product" in - SIAM J. on Scientific Computing 26(6) pp 1955-1988, Jun. 2005. - - 2Sum can be done in 6 FLOPs without a branch. However, calculating - double precision is slower than single precision on every existing GPU. - As such, replacing 2Sum with Fast2Sum when using DPFP results in better - performance (~5% higher total). This is true even though we must ensure - that |a| > |b|. Branch divergence is better than the DPFP slowdown. 
- Thus, for DPFP, our compensated summation algorithm is actually described - by both Pichat and Neumaier in "Correction d'une somme en arithmetique - a virgule flottante" (J. Numerische Mathematik 19(5) pp. 400-406, 1972) - and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher - Summen (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51, - 1974), respectively. - - Single precision is the same performance using either algorithm, so we - use only Fast2Sum */ + // We use this 2Sum algorithm to perform a compensated summation, + // which can reduce the cummulative rounding errors in our SpMV summation. + // Our compensated sumation is based on the SumK algorithm (with K==2) from + // Ogita, Rump, and Oishi, "Accurate Sum and Dot Product" in + // SIAM J. on Scientific Computing 26(6) pp 1955-1988, Jun. 2005. + + // 2Sum can be done in 6 FLOPs without a branch. However, calculating + // double precision is slower than single precision on every existing GPU. + // As such, replacing 2Sum with Fast2Sum when using DPFP results in better + // performance (~5% higher total). This is true even though we must ensure + // that |a| > |b|. Branch divergence is better than the DPFP slowdown. + // Thus, for DPFP, our compensated summation algorithm is actually described + // by both Pichat and Neumaier in "Correction d'une somme en arithmetique + // a virgule flottante" (J. Numerische Mathematik 19(5) pp. 400-406, 1972) + // and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher + // Summen (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51, + // 1974), respectively. FPTYPE swap; - if (fabs(x) < fabs(y)) { + if (fabs(x) < fabs(y)) + { swap = x; x = y; y = swap; @@ -117,9 +110,9 @@ FPTYPE two_sum( FPTYPE x, return sumk_s; } -FPTYPE atomic_two_sum_float(global FPTYPE *x_ptr, +FPTYPE atomic_two_sum_float( global FPTYPE *x_ptr, FPTYPE y, - FPTYPE *sumk_err) + FPTYPE *sumk_err ) { // Have to wait until the return from the atomic op to know what X was. FPTYPE sumk_s; @@ -145,7 +138,7 @@ FPTYPE atomic_two_sum_float(global FPTYPE *x_ptr, // GPUs because the reduction order is different than what would be used on a // CPU. It is based on the PSumK algorithm (with K==2) from Yamanaka, Ogita, // Rump, and Oishi, "A Parallel Algorithm of Accurate Dot Product," in -// J. Parallel Computing, 34(6-8), pp. 392-410, Jul. 2008. */ +// J. Parallel Computing, 34(6-8), pp. 392-410, Jul. 2008. // Inputs: cur_sum: the input from which our sum starts // err: the current running cascade error for this final summation // partial: the local memory which holds the values to sum @@ -153,6 +146,7 @@ FPTYPE atomic_two_sum_float(global FPTYPE *x_ptr, // lid: local ID of the work item calling this function. // integers: This parallel summation method is meant to take values // from four different points. See the blow comment for usage. +// Don't inline this function. It reduces performance. FPTYPE sum2_reduce( FPTYPE cur_sum, FPTYPE *err, __local FPTYPE *partial, @@ -160,7 +154,7 @@ FPTYPE sum2_reduce( FPTYPE cur_sum, int first, int second, int third, - int last) + int last ) { // A subset of the threads in this workgroup add three sets // of values together using the 2Sum method. @@ -401,7 +395,7 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, unsigned int compare_value = rowBlocks[gid] & (1UL << WGBITS); // Bit 24 in the first workgroup is the flag that everyone waits on. 
- if(gid == first_wg_in_row && lid == 0) + if(gid == first_wg_in_row && lid == 0UL) { // The first workgroup handles the output initialization. if (beta != 0.) diff --git a/src/library/kernels/csrmv_general.cl b/src/library/kernels/csrmv_general.cl index 88944f9..0ce6208 100644 --- a/src/library/kernels/csrmv_general.cl +++ b/src/library/kernels/csrmv_general.cl @@ -50,34 +50,32 @@ VALUE_TYPE two_sum( VALUE_TYPE x, { VALUE_TYPE sumk_s = x + y; #ifdef EXTENDED_PRECISION - /* We use this 2Sum algorithm to perform a compensated summation, - which can reduce the cummulative rounding errors in our SpMV summation. - Our compensated sumation is based on the SumK algorithm (with K==2) from - Ogita, Rump, and Oishi, "Accurate Sum and Dot Product" in - SIAM J. on Scientific Computing 26(6) pp 1955-1988, Jun. 2005. - - 2Sum can be done in 6 FLOPs without a branch. However, calculating - double precision is slower than single precision on every existing GPU. - As such, replacing 2Sum with Fast2Sum when using DPFP results in slightly - better performance. This is especially true on non-workstation GPUs with - low DPFP rates. Fast2Sum is faster even though we must ensure that - |a| > |b|. Branch divergence is better than the DPFP slowdown. - Thus, for DPFP, our compensated summation algorithm is actually described - by both Pichat and Neumaier in "Correction d'une somme en arithmetique - a virgule flottante" (J. Numerische Mathematik 19(5) pp. 400-406, 1972) - and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher - Summen (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51, - 1974), respectively. */ - VALUE_TYPE ap, bp; - if (fabs(x) > fabs(y)) { - ap = x; - bp = y; - } - else { - ap = y; - bp = x; + // We use this 2Sum algorithm to perform a compensated summation, + // which can reduce the cummulative rounding errors in our SpMV summation. + // Our compensated sumation is based on the SumK algorithm (with K==2) from + // Ogita, Rump, and Oishi, "Accurate Sum and Dot Product" in + // SIAM J. on Scientific Computing 26(6) pp 1955-1988, Jun. 2005. + + // 2Sum can be done in 6 FLOPs without a branch. However, calculating + // double precision is slower than single precision on every existing GPU. + // As such, replacing 2Sum with Fast2Sum when using DPFP results in slightly + // better performance. This is especially true on non-workstation GPUs with + // low DPFP rates. Fast2Sum is faster even though we must ensure that + // |a| > |b|. Branch divergence is better than the DPFP slowdown. + // Thus, for DPFP, our compensated summation algorithm is actually described + // by both Pichat and Neumaier in "Correction d'une somme en arithmetique + // a virgule flottante" (J. Numerische Mathematik 19(5) pp. 400-406, 1972) + // and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher + // Summen (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51, + // 1974), respectively. + VALUE_TYPE swap; + if (fabs(x) < fabs(y)) + { + swap = x; + x = y; + y = swap; } - (*sumk_err) += (bp - (sumk_s - ap)); + (*sumk_err) += (y - (sumk_s - x)); // Original 6 FLOP 2Sum algorithm. //VALUE_TYPE bp = sumk_s - x; //(*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); From 2475bc3536c6b8b79e2b6e7dfee4b340507c63e3 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Fri, 31 Jul 2015 15:56:05 -0500 Subject: [PATCH 06/21] New modifications to CSR-Adaptive in order to allow a single workgroup in CSR-LongRows to work on more than a single block of NNZs. 
This is more efficient and results in higher performance. Also split up CSR-Vector from the LongRows algorithm. Added some more tuning knobs to CSR-Adaptive with respect to these changes. --- src/library/blas2/csrmv-adaptive.hpp | 4 + src/library/include/clSPARSE-private.hpp | 4 +- src/library/internal/computeRowBlocks.hpp | 4 +- src/library/io/mm-reader.cpp | 6 +- src/library/kernels/csrmv_adaptive.cl | 296 ++++++++++++--------- src/library/transform/clsparse-coo2csr.cpp | 2 +- 6 files changed, 178 insertions(+), 138 deletions(-) diff --git a/src/library/blas2/csrmv-adaptive.hpp b/src/library/blas2/csrmv-adaptive.hpp index 4d2d4f9..77ff9d0 100644 --- a/src/library/blas2/csrmv-adaptive.hpp +++ b/src/library/blas2/csrmv-adaptive.hpp @@ -26,6 +26,8 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha, + " -DWGBITS=" + std::to_string( WG_BITS ) + " -DWGSIZE=" + std::to_string( group_size ) + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ) + + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER ) + + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR ) + " -DEXTENDED_PRECISION"; if(typeid(T) == typeid(cl_double)) @@ -91,6 +93,8 @@ csrmv_adaptive( const clsparse::array_base& pAlpha, + " -DWGBITS=" + std::to_string( WG_BITS ) + " -DWGSIZE=" + std::to_string( group_size ) + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ) + + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER ) + + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR ) + " -DEXTENDED_PRECISION"; if(typeid(T) == typeid(cl_double)) diff --git a/src/library/include/clSPARSE-private.hpp b/src/library/include/clSPARSE-private.hpp index ec3b2a3..9db9930 100644 --- a/src/library/include/clSPARSE-private.hpp +++ b/src/library/include/clSPARSE-private.hpp @@ -26,5 +26,7 @@ const cl_uint WG_BITS = 24; const cl_uint ROW_BITS = 32; const cl_uint BLKSIZE = 1024; +const cl_uint BLOCK_MULTIPLIER = 3; +const cl_uint ROWS_FOR_VECTOR = 2; -#endif \ No newline at end of file +#endif diff --git a/src/library/internal/computeRowBlocks.hpp b/src/library/internal/computeRowBlocks.hpp index e14aed2..4b3ab48 100644 --- a/src/library/internal/computeRowBlocks.hpp +++ b/src/library/internal/computeRowBlocks.hpp @@ -25,7 +25,7 @@ // rowBlockType is currently instantiated as ulong template< typename rowBlockType > -void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* rowDelimiters, int nRows, int blkSize ) +void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* rowDelimiters, int nRows, int blkSize, int blkMultiplier ) { rowBlockType* rowBlocksBase = rowBlocks; @@ -60,7 +60,7 @@ void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* // This is csr-vector case; bottom WG_BITS == workgroup ID else if( ( i - last_i == 1 ) && sum > blkSize ) { - int numWGReq = static_cast< int >( ceil( (double)sum / blkSize ) ); + int numWGReq = static_cast< int >( ceil( (double)sum / (blkMultiplier*blkSize) ) ); // Check to ensure #workgroups can fit in WG_BITS bits, if not // then the last workgroup will do all the remaining work diff --git a/src/library/io/mm-reader.cpp b/src/library/io/mm-reader.cpp index ae78dae..eb56f7f 100644 --- a/src/library/io/mm-reader.cpp +++ b/src/library/io/mm-reader.cpp @@ -598,7 +598,7 @@ clsparseSCsrMatrixfromFile(clsparseCsrMatrix* csrMatx, const char* filePath, cls clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 
pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); - ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE ); + ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER ); } return clsparseSuccess; @@ -668,7 +668,7 @@ clsparseDCsrMatrixfromFile(clsparseCsrMatrix* csrMatx, const char* filePath, cls clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); - ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE ); + ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER ); } return clsparseSuccess; @@ -736,7 +736,7 @@ clsparseDCsrMatrixfromFile(clsparseCsrMatrix* csrMatx, const char* filePath, cls // clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); // cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); -// ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE ); +// ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER ); // } // return clsparseSuccess; diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index fae11bd..63f928e 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -115,11 +115,12 @@ FPTYPE atomic_two_sum_float( global FPTYPE *x_ptr, FPTYPE *sumk_err ) { // Have to wait until the return from the atomic op to know what X was. - FPTYPE sumk_s; + FPTYPE sumk_s = 0.; #ifdef EXTENDED_PRECISION FPTYPE x, swap; sumk_s = atomic_add_float_extended(x_ptr, y, &x); - if (fabs(x) < fabs(y)) { + if (fabs(x) < fabs(y)) + { swap = x; x = y; y = swap; @@ -223,13 +224,22 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // workgroup will spin-loop. unsigned int row = ((rowBlocks[gid] >> (64-ROWBITS)) & ((1UL << ROWBITS) - 1UL)); unsigned int stop_row = ((rowBlocks[gid + 1] >> (64-ROWBITS)) & ((1UL << ROWBITS) - 1UL)); + unsigned int num_rows = stop_row - row; + + // Get the "workgroup within this long row" ID out of the bottom bits of the row block. + unsigned int wg = rowBlocks[gid] & ((1 << WGBITS) - 1); + + // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. + // If there are more items in this row, we assign more workgroups. + unsigned int vecStart = rowPtrs[row] + (wg * BLOCK_MULTIPLIER*BLOCKSIZE); + unsigned int vecEnd = (rowPtrs[row + 1] > vecStart + BLOCK_MULTIPLIER*BLOCKSIZE) ? vecStart + BLOCK_MULTIPLIER*BLOCKSIZE : rowPtrs[row + 1]; // If the next row block starts more than 2 rows away, then we choose CSR-Stream. // If this is zero (long rows) or one (final workgroup in a long row, or a single - // row in a row block), we want to use the CSR-Vector algorithm. + // row in a row block), we want to use the CSR-Vector algorithm(s). // We have found, through experimentation, that CSR-Vector is generally faster // when working on 2 rows, due to its simplicity and better reduction method. - if (stop_row - row > 2) + if (stop_row - row > ROWS_FOR_VECTOR) { // CSR-Stream case. 
See Sections III.A and III.B in the SC'14 paper: // "Efficient Sparse Matrix-Vector Multiplication on GPUs using the CSR Storage Format" @@ -260,7 +270,7 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, int col = rowPtrs[row] + lid; if (gid != (get_num_groups(0) - 1)) { - for(int i = 0; i < BLOCKSIZE; i += 256) + for(int i = 0; i < BLOCKSIZE; i += WGSIZE) partialSums[lid + i] = alpha * vals[col + i] * vec[cols[col + i]]; } else @@ -273,7 +283,7 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // This causes a minor performance loss because this is the last workgroup // to be launched, and this loop can't be unrolled. int max_to_load = rowPtrs[stop_row] - rowPtrs[row]; - for(int i = 0; i < max_to_load; i += 256) + for(int i = 0; i < max_to_load; i += WGSIZE) partialSums[lid + i] = alpha * vals[col + i] * vec[cols[col + i]]; } barrier(CLK_LOCAL_MEM_FENCE); @@ -357,28 +367,84 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // However, this reduction is also much faster than CSR-Scalar, because local memory // is designed for scatter-gather operations. // We need a while loop because there may be more rows than threads in the WG. - while(local_row < stop_row) - { - int local_first_val = (rowPtrs[local_row] - rowPtrs[row]); - int local_last_val = rowPtrs[local_row + 1] - rowPtrs[row]; - FPTYPE temp = 0.; - FPTYPE sumk_e = 0.; - for (int local_cur_val = local_first_val; local_cur_val < local_last_val; local_cur_val++) - temp = two_sum(temp, partialSums[local_cur_val], &sumk_e); - - // After you've done the reduction into the temp register, - // put that into the output for each row. - if (beta != 0.) - temp = two_sum(beta * out[local_row], temp, &sumk_e); - out[local_row] = temp + sumk_e; - local_row += get_local_size(0); - } + while(local_row < stop_row) + { + int local_first_val = (rowPtrs[local_row] - rowPtrs[row]); + int local_last_val = rowPtrs[local_row + 1] - rowPtrs[row]; + FPTYPE temp = 0.; + FPTYPE sumk_e = 0.; + for (int local_cur_val = local_first_val; local_cur_val < local_last_val; local_cur_val++) + temp = two_sum(temp, partialSums[local_cur_val], &sumk_e); + + // After you've done the reduction into the temp register, + // put that into the output for each row. + if (beta != 0.) + temp = two_sum(beta * out[local_row], temp, &sumk_e); + out[local_row] = temp + sumk_e; + local_row += WGSIZE; + } } } + else if (num_rows >= 1 && !wg) // CSR-Vector case. + { + // ^^ The above check says that if this workgroup is supposed to work on <= ROWS_VECTOR + // number of rows then we should do the CSR-Vector algorithm. If we want this row to be + // done with CSR-LongRows, then all of its workgroups (except the last one) will have the + // same stop_row and row. The final workgroup in a LongRow will have stop_row and row + // different, but the internal "wg" number will be non-zero. + unsigned int myRow = row; // Adding another variable for row as it is getting updated below + + // CSR-Vector will always do at least one iteration. + // If this workgroup is operating on multiple rows (because CSR-Stream is poor for small + // numberss of rows), then it needs to iterate until it reaches the stop_row. + // We don't check <= stop_row because of the potential for unsigned overflow. + while (myRow < stop_row) + { + // Any workgroup only calculates, at most, BLOCKSIZE items in this row. + // If there are more items in this row, we use CSR-LongRows. 
+ vecStart = rowPtrs[myRow]; + vecEnd = rowPtrs[myRow+1]; + + // Load in a bunch of partial results into your register space, rather than LDS (no contention) + // Then dump the partially reduced answers into the LDS for inter-work-item reduction. + // Using a long induction variable to make sure unsigned int overflow doesn't break things. + FPTYPE mySum = 0.; + FPTYPE sumk_e = 0.; + for (long j = vecStart + lid; j < vecEnd; j+=WGSIZE) + { + unsigned int col = cols[(unsigned int)j]; + mySum = two_sum(mySum, alpha * vals[(unsigned int)j] * vec[col], &sumk_e); + } + + FPTYPE new_error = 0.; + mySum = two_sum(mySum, sumk_e, &new_error); + partialSums[lid] = mySum; + barrier(CLK_LOCAL_MEM_FENCE); + + // Reduce partial sums + // These numbers need to change if the # of work-items/WG changes. + mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 64, 128, 192, 256); + barrier(CLK_LOCAL_MEM_FENCE); + mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 16, 32, 48, 64); + barrier(CLK_LOCAL_MEM_FENCE); + mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 4, 8, 12, 16); + barrier(CLK_LOCAL_MEM_FENCE); + mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 1, 2, 3, 4); + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid == 0UL) + { + if (beta != 0.) + mySum = two_sum(beta * out[myRow], mySum, &new_error); + out[myRow] = mySum + new_error; + } + // CSR-VECTOR on workgroups for two rows which are inefficient for CSR-Stream + myRow++; + } + } else { - // In CSR-Vector, we may have more than one workgroup calculating this row - // or one workgroup calculating two rows. + // In CSR-LongRows, we have more than one workgroup calculating this row. // The output values for those types of rows are stored using atomic_add, because // more than one parallel workgroup's value makes up the final answer. // Unfortunately, this makes it difficult to do y=Ax, rather than y=Ax+y, because @@ -391,138 +457,106 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // First, figure out which workgroup you are in the row. Bottom 24 bits. // You can use that to find the global ID for the first workgroup calculating // this long row. - size_t first_wg_in_row = gid - (rowBlocks[gid] & ((1UL << WGBITS) - 1UL)); + size_t first_wg_in_row = gid - (rowBlocks[gid] & ((1UL << WGBITS) - 1UL)); unsigned int compare_value = rowBlocks[gid] & (1UL << WGBITS); // Bit 24 in the first workgroup is the flag that everyone waits on. - if(gid == first_wg_in_row && lid == 0UL) - { - // The first workgroup handles the output initialization. - if (beta != 0.) - out[row] *= beta; - else - out[row] = 0.; - // We currently have, at most, two rows in a CSR-Vector calculation. - // If we have two, we need to initialize the second output as well. - if(stop_row - row == 2) - { - if (beta != 0.) - out[row+1] *= beta; - else - out[row+1] = 0.; - } + if(gid == first_wg_in_row && lid == 0UL) + { + // The first workgroup handles the output initialization. + if (beta != 0.) + out[row] *= beta; + else + out[row] = 0.; #ifdef EXTENDED_PRECISION - rowBlocks[get_num_groups(0) + gid + 1] = 0UL; + rowBlocks[get_num_groups(0) + gid + 1] = 0UL; #endif - atom_xor(&rowBlocks[first_wg_in_row], (1UL << WGBITS)); // Release other workgroups. + atom_xor(&rowBlocks[first_wg_in_row], (1UL << WGBITS)); // Release other workgroups. } // For every other workgroup, bit 24 holds the value they wait on. // If your bit 24 == first_wg's bit 24, you spin loop. // The first workgroup will eventually flip this bit, and you can move forward. 
barrier(CLK_GLOBAL_MEM_FENCE); - while(gid != first_wg_in_row && - lid==0 && - ((atom_max(&rowBlocks[first_wg_in_row], 0UL) & (1UL << WGBITS)) == compare_value)); + while(gid != first_wg_in_row && + lid == 0 && + ((atom_max(&rowBlocks[first_wg_in_row], 0UL) & (1UL << WGBITS)) == compare_value)); barrier(CLK_GLOBAL_MEM_FENCE); // After you've passed the barrier, update your local flag to make sure that // the next time through, you know what to wait on. - if (gid != first_wg_in_row && lid == 0) + if (gid != first_wg_in_row && lid == 0UL) rowBlocks[gid] ^= (1UL << WGBITS); - unsigned int myRow = row; // Adding another variable for row as it is getting updated below - char iteration = 0; - - // CSR-Vector will always do at least one iteration. // All but the final workgroup in a long-row collaboration have the same start_row // and stop_row. They only run for one iteration. - // If this workgroup is operating on multiple rows (because CSR-Stream is poor for small - // #s of rows), then it is (currently) not part of a collaboration on a long - // row. As such, it needs to iterate until it reaches the stop_row. - // We don't check <= stop_row because of the potential for unsigned overflow. - while (iteration == 0 || myRow < stop_row) - { - // Get the "workgroup within this long row" ID out of the bottom bits of the row block. - // If this is the only workgroup working on this row, this will be zero, so still correct. - unsigned int wg = rowBlocks[gid] & ((1 << WGBITS) - 1); - - // Any workgroup only calculates, at most, BLOCKSIZE items in this row. - // If there are more items in this row, we assign more workgroups. - unsigned int vecStart = rowPtrs[myRow] + (wg * BLOCKSIZE); - unsigned int vecEnd = (rowPtrs[myRow + 1] > vecStart + BLOCKSIZE) ? vecStart + BLOCKSIZE : rowPtrs[myRow+1]; - - // Load in a bunch of partial results into your register space, rather than LDS (no contention) - // Then dump the partially reduced answers into the LDS for inter-work-item reduction. - // Using a long induction variable to make sure unsigned int overflow doesn't break things. - FPTYPE mySum = 0.; - FPTYPE sumk_e = 0.; - for (long j = vecStart + lid; j < vecEnd; j+=WGSIZE) - { - unsigned int col = cols[(unsigned int)j]; - mySum = two_sum(mySum, alpha * vals[(unsigned int)j] * vec[col], &sumk_e); - } - - FPTYPE new_error = 0.; - mySum = two_sum(mySum, sumk_e, &new_error); - partialSums[lid] = mySum; - barrier(CLK_LOCAL_MEM_FENCE); - - // Reduce partial sums - // These numbers need to change if the # of work-items/WG changes. - mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 64, 128, 192, 256); - barrier(CLK_LOCAL_MEM_FENCE); - mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 16, 32, 48, 64); - barrier(CLK_LOCAL_MEM_FENCE); - mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 4, 8, 12, 16); - barrier(CLK_LOCAL_MEM_FENCE); - mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 1, 2, 3, 4); - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid == 0) - { - atomic_two_sum_float(&out[myRow], mySum, &new_error); + // Load in a bunch of partial results into your register space, rather than LDS (no contention) + // Then dump the partially reduced answers into the LDS for inter-work-item reduction. + // Using a long induction variable to make sure unsigned int overflow doesn't break things. 
+ FPTYPE mySum = 0.; + FPTYPE sumk_e = 0.; + + int col = vecStart + lid; + for(int j = 0; j < (int)(vecEnd - col); j += WGSIZE) + mySum = two_sum(mySum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); + + FPTYPE new_error = 0.; + mySum = two_sum(mySum, sumk_e, &new_error); + partialSums[lid] = mySum; + barrier(CLK_LOCAL_MEM_FENCE); + + // Reduce partial sums + // Needs to be modified if there is a change in the # of work-items per workgroup. + mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 64, 128, 192, 256); + barrier(CLK_LOCAL_MEM_FENCE); + mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 16, 32, 48, 64); + barrier(CLK_LOCAL_MEM_FENCE); + mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 4, 8, 12, 16); + barrier(CLK_LOCAL_MEM_FENCE); + mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 1, 2, 3, 4); + barrier(CLK_LOCAL_MEM_FENCE); + + if (lid == 0) + { + atomic_two_sum_float(&out[row], mySum, &new_error); #ifdef EXTENDED_PRECISION - // The end of the rowBlocks buffer is used to hold errors. - atomic_add_float(&(rowBlocks[get_num_groups(0)+first_wg_in_row+1]), new_error); - // Coordinate across all of the workgroups in this coop in order to have - // the last workgroup fix up the error values. - // If this workgroup's row is different than the next workgroup's row - // then this is the last workgroup -- it's this workgroup's job to add - // the error values into the final sum. - if (row != stop_row) - { - // Go forward once your ID is the same as the low order bits of the - // coop's first workgroup. That value will be used to store the number - // of threads that have completed so far. Once all the previous threads - // are done, it's time to send out the errors! - while((atom_max(&rowBlocks[first_wg_in_row], 0UL) & ((1 << WGBITS) - 1)) != wg); + // The last half of the rowBlocks buffer is used to hold errors. + atomic_add_float(&(rowBlocks[get_num_groups(0)+first_wg_in_row+1]), new_error); + // Coordinate across all of the workgroups in this coop in order to have + // the last workgroup fix up the error values. + // If this workgroup's row is different than the next workgroup's row + // then this is the last workgroup -- it's this workgroup's job to add + // the error values into the final sum. + if (row != stop_row) + { + // Go forward once your ID is the same as the low order bits of the + // coop's first workgroup. That value will be used to store the number + // of threads that have completed so far. Once all the previous threads + // are done, it's time to send out the errors! + while((atom_max(&rowBlocks[first_wg_in_row], 0UL) & ((1UL << WGBITS) - 1)) != wg); #ifdef DOUBLE - new_error = as_double(rowBlocks[get_num_groups(0)+first_wg_in_row+1]); + new_error = as_double(rowBlocks[get_num_groups(0)+first_wg_in_row+1]); #else - new_error = as_float((int)rowBlocks[get_num_groups(0)+first_wg_in_row+1]); + new_error = as_float((int)rowBlocks[get_num_groups(0)+first_wg_in_row+1]); #endif - atomic_add_float(&out[myRow], new_error); - rowBlocks[get_num_groups(0)+first_wg_in_row+1] = 0UL; - - // Reset the rowBlocks low order bits for next time. - rowBlocks[first_wg_in_row] = rowBlocks[gid] - wg; // No need for atomics here - } - else - { - // Otherwise, increment the low order bits of the first thread in this - // coop. We're using this to tell how many workgroups in a coop are done. - // Do this with an atomic, since other threads may be doing this too. 
- unsigned long blah = atom_inc(&rowBlocks[first_wg_in_row]); - } + // Don't need to work atomically here, because this is the only workgroup + // left working on this row. + out[row] += new_error; + rowBlocks[get_num_groups(0)+first_wg_in_row+1] = 0UL; + + // Reset the rowBlocks low order bits for next time. + rowBlocks[first_wg_in_row] = rowBlocks[gid] - wg; + } + else + { + // Otherwise, increment the low order bits of the first thread in this + // coop. We're using this to tell how many workgroups in a coop are done. + // Do this with an atomic, since other threads may be doing this too. + unsigned long no_warn = atom_inc(&rowBlocks[first_wg_in_row]); + } #endif - } - - // CSR-VECTOR on workgroups for two rows which are inefficient for CSR-Stream - myRow++; - iteration++; - } + } } } )" diff --git a/src/library/transform/clsparse-coo2csr.cpp b/src/library/transform/clsparse-coo2csr.cpp index dccef0c..a463e14 100644 --- a/src/library/transform/clsparse-coo2csr.cpp +++ b/src/library/transform/clsparse-coo2csr.cpp @@ -86,7 +86,7 @@ clsparseCsrMetaCompute( clsparseCsrMatrix* csrMatx, clsparseControl control ) clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); - ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, rowDelimiters, pCsrMatx->num_rows, BLKSIZE ); + ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, rowDelimiters, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER ); return clsparseSuccess; } From 430a7aa9c8e00a0063e0cec6471380bbeb758991 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Fri, 31 Jul 2015 16:09:10 -0500 Subject: [PATCH 07/21] Modifications to the CSR-Stream case. In particular, a faster parallel reduction mechanism when there are relatively few rows within the row block. --- src/library/kernels/csrmv_adaptive.cl | 190 ++++++++++++++++---------- 1 file changed, 121 insertions(+), 69 deletions(-) diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index 63f928e..3297457 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -189,6 +189,58 @@ FPTYPE sum2_reduce( FPTYPE cur_sum, return cur_sum; } +// A simpler reduction than the one above, also built to use EXTENDED_PRECISION +// Rather than working on large widths, this simply loads a single value from +// an upper entry of the LDS and drops it into the local region. +// It is meant to be called in parallel on an LDS buffer, over enough iterations, +// reduce multiple rows into a small series of output rows. +// A version of this method is also used in csrmv_general. +// Inputs: cur_sum: the input from which our sum starts +// err: the current running cascade error for this final summation +// partial: the local memory which holds the values to sum +// (we eventually use it to pass down temp. err vals as well) +// lid: local ID of the work item calling this function. +// thread_lane: The lane within this thread's row. +// max_size: This parallel summation method operates in multiple rounds +// to do a parallel reduction. This is the length of each row. +// round: As you reduce data down, this tells you how many output values +// you won't during each round. 
+FPTYPE simple_sum2_reduce( FPTYPE cur_sum, + FPTYPE *err, + volatile __local FPTYPE *partial, + size_t lid, + int thread_lane, + int max_size, + int reduc_size ) +{ + if ( max_size > reduc_size ) + { +#ifdef EXTENDED_PRECISION + if (thread_lane < reduc_size) + cur_sum = two_sum(cur_sum, partial[lid + reduc_size], err); + + // We reuse the LDS entries to move the error values down into lower + // threads. This saves LDS space, allowing higher occupancy, but requires + // more barriers, which can reduce performance. + barrier(CLK_LOCAL_MEM_FENCE); + // Have all of those upper threads pass their temporary errors + // into a location that the lower threads can read. + if (thread_lane >= reduc_size) + partial[lid] = *err; + barrier(CLK_LOCAL_MEM_FENCE); + if (thread_lane < reduc_size) // Add those errors in. + { + *err += partial[lid + reduc_size]; + partial[lid] = cur_sum; + } +#else + cur_sum += partial[lid + reduc_size]; + partial[lid] = cur_sum; +#endif + } + return cur_sum; +} + __kernel void csrmv_adaptive(__global const FPTYPE * restrict vals, __global const int * restrict cols, @@ -196,8 +248,8 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, __global const FPTYPE * restrict vec, __global FPTYPE * restrict out, __global unsigned long * restrict rowBlocks, - __global FPTYPE * pAlpha, - __global FPTYPE * pBeta) + __global FPTYPE * restrict pAlpha, + __global FPTYPE * restrict pBeta) { __local FPTYPE partialSums[BLOCKSIZE]; size_t gid = get_group_id(0); @@ -239,7 +291,7 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // row in a row block), we want to use the CSR-Vector algorithm(s). // We have found, through experimentation, that CSR-Vector is generally faster // when working on 2 rows, due to its simplicity and better reduction method. - if (stop_row - row > ROWS_FOR_VECTOR) + if (num_rows > ROWS_FOR_VECTOR) { // CSR-Stream case. See Sections III.A and III.B in the SC'14 paper: // "Efficient Sparse Matrix-Vector Multiplication on GPUs using the CSR Storage Format" @@ -293,72 +345,72 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // In this case, we want to have the workgroup perform a tree-style reduction // of each row. {numThreadsForRed} adjacent threads team up to linearly reduce // a row into {numThreadsForRed} locations in local memory. - // After that, a single thread from each of those teams linearly walks through - // the local memory values for that row and reduces to the final output value. - FPTYPE temp = 0.; - FPTYPE sumk_e = 0.; - - // {numThreadsForRed} adjacent threads all work on the same row, so their - // start and end values are the same. - size_t st = lid/numThreadsForRed; - int local_first_val = (rowPtrs[row + st] - rowPtrs[row]); - int local_last_val = rowPtrs[row + st + 1] - rowPtrs[row]; - int workForEachThread = (local_last_val - local_first_val)/numThreadsForRed; - size_t threadInBlock = lid & (numThreadsForRed - 1); - - // Not all row blocks are full -- they may have an odd number of rows. As such, - // we need to ensure that adjacent-groups only work on real data for this rowBlock. - if(st < (stop_row - row)) - { - // only works when numThreadsForRed is a power of 2 - for(int i = 0; i < workForEachThread; i++) - temp = two_sum(temp, partialSums[local_first_val + i*numThreadsForRed + threadInBlock], &sumk_e); - - // The last few values (the remainder of this row) also need to be aded in. 
- int local_cur_val = local_first_val + numThreadsForRed*workForEachThread; - if(threadInBlock < local_last_val - local_cur_val) - temp = two_sum(temp, partialSums[local_cur_val + threadInBlock], &sumk_e); - } - barrier(CLK_LOCAL_MEM_FENCE); - - FPTYPE new_error = 0.; - temp = two_sum(temp, sumk_e, &new_error); - partialSums[lid] = temp; - - // Step one of this two-stage reduction is done. Now each row has {numThreadsForRed} - // values sitting in the local memory. This next step takes the first thread from - // each of the adjacent-groups and uses it to walk through those values and reduce - // them into a final output value for the row. - temp = 0.; - sumk_e = 0.; - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < (stop_row - row)) - { -#pragma unroll 4 - for(int i = 0; i < numThreadsForRed; i++) - temp = two_sum(temp, partialSums[lid*numThreadsForRed + i], &sumk_e); - } -#ifdef EXTENDED_PRECISION - // Values in partialSums[] are finished, now let's add in all of the local error values. - barrier(CLK_LOCAL_MEM_FENCE); - partialSums[lid] = new_error; - barrier(CLK_LOCAL_MEM_FENCE); -#endif - if(lid < (stop_row - row)) - { -#ifdef EXTENDED_PRECISION - // Sum up the local error values which are now in partialSums[] - for(int i = 0; i < numThreadsForRed; i++) - temp = two_sum(temp, partialSums[lid*numThreadsForRed + i], &sumk_e); -#endif - - // All of our write-outs check to see if the output vector should first be zeroed. - // If so, just do a write rather than a read-write. Measured to be a slight (~5%) - // performance improvement. - if (beta != 0.) - temp = two_sum(beta * out[local_row], temp, &sumk_e); - out[local_row] = temp + sumk_e; - } + // After that, the entire workgroup does a parallel reduction, and each + // row ends up with an individual answer. + FPTYPE temp = 0.; + FPTYPE sumk_e = 0.; + + // {numThreadsForRed} adjacent threads all work on the same row, so their + // start and end values are the same. + // numThreadsForRed guaranteed to be a power of two, so the clz code below + // avoids an integer divide. ~2% perf gain in EXTRA_PRECISION. + //size_t st = lid/numThreadsForRed; + size_t st = lid >> (31 - clz(numThreadsForRed)); + int local_first_val = (rowPtrs[row + st] - rowPtrs[row]); + int local_last_val = rowPtrs[row + st + 1] - rowPtrs[row]; + int workForEachThread = (local_last_val - local_first_val) >> (31 - clz(numThreadsForRed)); + size_t threadInBlock = lid & (numThreadsForRed - 1); + + // Not all row blocks are full -- they may have an odd number of rows. As such, + // we need to ensure that adjacent-groups only work on real data for this rowBlock. + if(st < num_rows) + { + // only works when numThreadsForRed is a power of 2 + for(int i = 0; i < workForEachThread; i++) + temp = two_sum(temp, partialSums[local_first_val + i*numThreadsForRed + threadInBlock], &sumk_e); + + // The last few values (the remainder of this row) also need to be aded in. + int local_cur_val = local_first_val + numThreadsForRed*workForEachThread; + if(threadInBlock < local_last_val - local_cur_val) + temp = two_sum(temp, partialSums[local_cur_val + threadInBlock], &sumk_e); + } + barrier(CLK_LOCAL_MEM_FENCE); + + FPTYPE new_error = 0.; + temp = two_sum(temp, sumk_e, &new_error); + partialSums[lid] = temp; + + // Step one of this two-stage reduction is done. Now each row has {numThreadsForRed} + // values sitting in the local memory. This means that, roughly, the beginning of + // LDS is full up to {workgroup size} entries. 
+ // Now we perform a parallel reduction that sums together the answers for each + // row in parallel, leaving us an answer in 'temp' for each row. + barrier(CLK_LOCAL_MEM_FENCE); + temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 128); + barrier( CLK_LOCAL_MEM_FENCE ); + temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 64); + barrier( CLK_LOCAL_MEM_FENCE ); + temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 32); + barrier( CLK_LOCAL_MEM_FENCE ); + temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 16); + barrier( CLK_LOCAL_MEM_FENCE ); + temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 8); + barrier( CLK_LOCAL_MEM_FENCE ); + temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 4); + barrier( CLK_LOCAL_MEM_FENCE ); + temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 2); + barrier( CLK_LOCAL_MEM_FENCE ); + temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 1); + + if (threadInBlock == 0 && st < num_rows) + { + // All of our write-outs check to see if the output vector should first be zeroed. + // If so, just do a write rather than a read-write. Measured to be a slight (~5%) + // performance improvement. + if (beta != 0.) + temp = two_sum(beta * out[row+st], temp, &new_error); + out[row+st] = temp + new_error; + } } else { From 54108c764a39f57fde6f599533bd1895122d038c Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Fri, 31 Jul 2015 17:46:22 -0500 Subject: [PATCH 08/21] Changed CSR-Adaptive to improve the CSR-Stream performance. Calculate the number of threads assigned to the parallel CSR-Stream reduction on the CPU instead of making each GPU workgroup do it. Change the action away from being a division and replace with some faster bit math. --- src/library/internal/computeRowBlocks.hpp | 96 ++++++++++++++++++----- src/library/kernels/csrmv_adaptive.cl | 19 +++-- 2 files changed, 89 insertions(+), 26 deletions(-) diff --git a/src/library/internal/computeRowBlocks.hpp b/src/library/internal/computeRowBlocks.hpp index 4b3ab48..6a01f4c 100644 --- a/src/library/internal/computeRowBlocks.hpp +++ b/src/library/internal/computeRowBlocks.hpp @@ -5,12 +5,56 @@ #include #include +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +static inline unsigned int flp2(unsigned int x) +{ + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); + return x - (x >> 1); +} + +// Short rows in CSR-Adaptive are batched together into a single row block. +// If there are a relatively small number of these, then we choose to do +// a horizontal reduction (groups of threads all reduce the same row). +// If there are many threads (e.g. more threads than the maximum size +// of our workgroup) then we choose to have each thread serially reduce +// the row. +// This function calculates the number of threads that could team up +// to reduce these groups of rows. For instance, if you have a +// workgroup size of 256 and 4 rows, you could have 64 threads +// working on each row. If you have 5 rows, only 32 threads could +// reliably work on each row because our reduction assumes power-of-2. 
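As a quick sanity check on the reduction-width rule that the helper defined just below implements, here is a standalone host-side model. The names flp2_model and threads_per_row are made up for this sketch; flp2_model reproduces the flp2 fallback from this header, and the expected values restate the examples in the comment above, assuming the workgroup size of 256 used throughout these kernels:

#include <cassert>

// Round x down to the nearest power of two (same bit trick as flp2 above).
static unsigned int flp2_model(unsigned int x)
{
    x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16;
    return x - (x >> 1);
}

// Model of the rule: the largest power of two no bigger than 256 / num_rows.
static unsigned int threads_per_row(unsigned int num_rows)
{
    return flp2_model(256 / num_rows);
}

int main()
{
    assert(threads_per_row(2) == 128);  // 2 rows in the block: 128 threads each
    assert(threads_per_row(3) == 64);
    assert(threads_per_row(4) == 64);   // 4 rows: 64 threads each
    assert(threads_per_row(5) == 32);   // 5 rows: 32, since the reduction needs a power of two
    return 0;
}

The compiler-specific branches in the helper compute the same value with a bit-scan instead of a divide; the flp2 path is only the portable fallback.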
+template< typename rowBlockType > +static inline rowBlockType numThreadsForReduction(rowBlockType num_rows) +{ +#if defined(__INTEL_COMPILER) + return 256 >> (_bit_scan_reverse(num_rows-1)+1); +#elif (defined(__clang__) && __has_builtin(__builtin_clz)) || \ + !defined(__clang) && \ + defined(__GNUG__) && ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 30202) + return (256 >> (8*sizeof(int)-__builtin_clz(num_rows-1))); +#elif defined(_MSC_VER) && (_MSC_VER >= 1400) + int bit_returned; + _BitScanReverse(&bit_returned, (num_rows-1)); + return 256 >> (bit_returned+1); +#else + return flp2(256/num_rows); +#endif +} + // The row blocks buffer holds a packed set of information used to inform each // workgroup about how to do its work: // // |6666 5555 5555 5544 4444 4444 3333 3333|3322 2222|2222 1111 1111 1100 0000 0000| // |3210 9876 5432 1098 7654 3210 9876 5432|1098 7654|3210 9876 5432 1098 7654 3210| -// |------------Row Information------------|----flag^|---WG ID within a long row---| +// |------------Row Information------------|--------^|---WG ID within a long row---| +// | | flag/|or # reduce threads for short| // // The upper 32 bits of each rowBlock entry tell the workgroup the ID of the first // row it will be working on. When one workgroup calculates multiple rows, this @@ -18,6 +62,9 @@ // The lower 24 bits are used whenever multiple workgroups calculate a single long // row. This tells each workgroup its ID within that row, so it knows which // part of the row to operate on. +// Alternately, on "short" row blocks, the lower bits are used to communicate +// the number of threads that should be used for the reduction. Pre-calculating +// this on the CPU-side results in a noticable performance uplift on many matrices. // Bit 24 is a flag bit used so that the multiple WGs calculating a long row can // know when the first workgroup for that row has finished initializing the output // value. 
While this bit is the same as the first workgroup's flag bit, this @@ -27,8 +74,9 @@ template< typename rowBlockType > void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* rowDelimiters, int nRows, int blkSize, int blkMultiplier ) { - rowBlockType* rowBlocksBase = rowBlocks; + rowBlockType* rowBlocksBase; + rowBlocksBase = rowBlocks; *rowBlocks = 0; rowBlocks++; rowBlockType sum = 0; @@ -43,22 +91,12 @@ void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* for( i = 1; i <= nRows; i++ ) { - sum += ( rowDelimiters[ i ] - rowDelimiters[ i - 1 ] ); - - // more than one row results in non-zero elements to be greater than blockSize - // This is csr-stream case; bottom WG_BITS == 0 - if( ( i - last_i > 1 ) && sum > blkSize ) - { - *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); - rowBlocks++; - i--; - last_i = i; - sum = 0; - } + int row_length = ( rowDelimiters[ i ] - rowDelimiters[ i - 1 ] ); + sum += row_length; // exactly one row results in non-zero elements to be greater than blockSize // This is csr-vector case; bottom WG_BITS == workgroup ID - else if( ( i - last_i == 1 ) && sum > blkSize ) + if( ( i - last_i == 1 ) && sum > blkSize ) { int numWGReq = static_cast< int >( ceil( (double)sum / (blkMultiplier*blkSize) ) ); @@ -79,11 +117,26 @@ void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* last_i = i; sum = 0; } - // sum of non-zero elements is exactly equal to blockSize - // This is csr-stream case; bottom WG_BITS == 0 + // more than one row results in non-zero elements to be greater than blockSize + // This is csr-stream case; bottom WG_BITS = number of parallel reduction threads + else if( ( i - last_i > 1 ) && sum > blkSize ) + { + i--; // This row won't fit, so back off one. + *rowBlocks = ( i << (64 - ROW_BITS) ); + // Fill in the low-order bits with the numThreadsForRed + if ((i - last_i) > 2) + *(rowBlocks-1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + last_i = i; + sum = 0; + } + // This is csr-stream case; bottom WG_BITS = number of parallel reduction threads else if( sum == blkSize ) { *rowBlocks = ( i << (64 - ROW_BITS) ); + // Fill in the low-order bits with the numThreadsForRed + if ((i - last_i) > 2) + *(rowBlocks-1) |= numThreadsForReduction(i - last_i); rowBlocks++; last_i = i; sum = 0; @@ -91,16 +144,19 @@ void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* } - if ((*(rowBlocks-1) >> (64 - ROW_BITS)) != static_cast< rowBlockType>( nRows)) + // If we didn't fill a row block with the last row, make sure we don't lose it. + if ( (*(rowBlocks-1) >> (64 - ROW_BITS)) != static_cast< rowBlockType>(nRows) ) { *rowBlocks = ( static_cast< rowBlockType >( nRows ) << (64 - ROW_BITS) ); + // Fill in the low-order bits with the numThreadsForRed + if ((nRows - last_i) > 2) + *(rowBlocks-1) |= numThreadsForReduction(i - last_i); rowBlocks++; } size_t dist = std::distance( rowBlocksBase, rowBlocks ); assert( (2 * dist) <= rowBlockSize ); - - // Update the size of rowBlocks to reflect the actual amount of memory used + // Update the size of rowBlocks to reflect the actual amount of memory used // We're multiplying the size by two because the extended precision form of // CSR-Adaptive requires more space for the final global reduction. 
rowBlockSize = 2 * dist; diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index 3297457..2b411ec 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -262,7 +262,8 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // // |6666 5555 5555 5544 4444 4444 3333 3333|3322 2222|2222 1111 1111 1100 0000 0000| // |3210 9876 5432 1098 7654 3210 9876 5432|1098 7654|3210 9876 5432 1098 7654 3210| - // |------------Row Information------------|----flag^|---WG ID within a long row---| + // |------------Row Information------------|--------^|---WG ID within a long row---| + // | | flag/|or # reduce threads for short| // // The upper 32 bits of each rowBlock entry tell the workgroup the ID of the first // row it will be working on. When one workgroup calculates multiple rows, this @@ -270,6 +271,9 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // The lower 24 bits are used whenever multiple workgroups calculate a single long // row. This tells each workgroup its ID within that row, so it knows which // part of the row to operate on. + // Alternately, on "short" row blocks, the lower bits are used to communicate + // the number of threads that should be used for the reduction. Pre-calculating + // this on the CPU-side results in a noticable performance uplift on many matrices. // Bit 24 is a flag bit used so that the multiple WGs calculating a long row can // know when the first workgroup for that row has finished initializing the output // value. While this bit is the same as the first workgroup's flag bit, this @@ -308,14 +312,16 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // long) rows, it's actually better to perform a tree-style reduction where // multiple threads team up to reduce the same row. - // The calculations below tell you how many threads this workgroup can allocate + // The calculation below tell you how many threads this workgroup can allocate // to each row, assuming that every row gets the same number of threads. // We want the closest lower-power-of-2 to this number -- that is how many // threads can work in each row's reduction using our algorithm. - int possibleThreadsRed = get_local_size(0)/(stop_row - row); - int numThreadsForRed = lowerPowerOf2(possibleThreadsRed); - - unsigned int local_row = row + lid; + // We want the closest lower (or equal) power-of-2 to this number -- + // that is how many threads can work in each row's reduction using our algorithm. + // For instance, with workgroup size 256, 2 rows = 128 threads, 3 rows = 64 + // threads, 4 rows = 64 threads, 5 rows = 32 threads, etc. + //int numThreadsForRed = get_local_size(0) >> ((CHAR_BIT*sizeof(unsigned int))-clz(num_rows-1)); + int numThreadsForRed = wg; // Same calculation as above, done on host. // Stream all of this row block's matrix values into local memory. // Perform the matvec in parallel with this work. @@ -419,6 +425,7 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // However, this reduction is also much faster than CSR-Scalar, because local memory // is designed for scatter-gather operations. // We need a while loop because there may be more rows than threads in the WG. 
+ unsigned int local_row = row + lid; while(local_row < stop_row) { int local_first_val = (rowPtrs[local_row] - rowPtrs[row]); From 021d3d63c2961a39b90fccd56d0042beecdd4914 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Fri, 31 Jul 2015 19:19:20 -0500 Subject: [PATCH 09/21] Removing a function that is no longer in use. --- src/library/kernels/csrmv_adaptive.cl | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index 2b411ec..13ba114 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -16,23 +16,6 @@ R"( #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable -int lowerPowerOf2(int num) -{ - num --; - - num |= num >> 1; - num |= num >> 2; - num |= num >> 4; - num |= num >> 8; - num |= num >> 16; - - num ++; - - num >>= 1; - - return num; -} - FPTYPE atomic_add_float_extended( global FPTYPE *ptr, FPTYPE temp, FPTYPE *old_sum ) { #ifdef DOUBLE From 25b573789ba916e191db887f5263e1ed5ef2f281 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Fri, 31 Jul 2015 19:19:53 -0500 Subject: [PATCH 10/21] Change row block generation function to break row blocks that are a series of short rows and then a new long-ish row. CSR-Stream runs into performance issues when trying to reduce multiple rows of extremely varying lengths, so we just put these rows into different workgroups. --- src/library/internal/computeRowBlocks.hpp | 142 ++++++++++++++++----- src/library/io/mm-reader.cpp | 9 +- src/library/kernels/csrmv_adaptive.cl | 30 ++++- src/library/transform/clsparse-coo2csr.cpp | 2 +- 4 files changed, 140 insertions(+), 43 deletions(-) diff --git a/src/library/internal/computeRowBlocks.hpp b/src/library/internal/computeRowBlocks.hpp index 6a01f4c..bd3342d 100644 --- a/src/library/internal/computeRowBlocks.hpp +++ b/src/library/internal/computeRowBlocks.hpp @@ -72,13 +72,17 @@ static inline rowBlockType numThreadsForReduction(rowBlockType num_rows) // rowBlockType is currently instantiated as ulong template< typename rowBlockType > -void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* rowDelimiters, int nRows, int blkSize, int blkMultiplier ) +void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* rowDelimiters, int nRows, int blkSize, int blkMultiplier, int rows_for_vector, bool allocate_row_blocks = true ) { rowBlockType* rowBlocksBase; + int total_row_blocks = 1; // Start at one because of rowBlock[0] - rowBlocksBase = rowBlocks; - *rowBlocks = 0; - rowBlocks++; + if (allocate_row_blocks) + { + rowBlocksBase = rowBlocks; + *rowBlocks = 0; + rowBlocks++; + } rowBlockType sum = 0; rowBlockType i, last_i = 0; @@ -89,77 +93,151 @@ void ComputeRowBlocks( rowBlockType* rowBlocks, size_t& rowBlockSize, const int* return; } + int consecutive_long_rows = 0; for( i = 1; i <= nRows; i++ ) { int row_length = ( rowDelimiters[ i ] - rowDelimiters[ i - 1 ] ); sum += row_length; + // The following section of code calculates whether you're moving between + // a series of "short" rows and a series of "long" rows. + // This is because the reduction in CSR-Adaptive likes things to be + // roughly the same length. Long rows can be reduced horizontally. + // Short rows can be reduced one-thread-per-row. Try not to mix them. 
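Before the hunk below, a quick standalone trace of the transition rule it adds may help. The 128 and 32 thresholds are the ones used in the code that follows; the blkSize accounting and last_i bookkeeping are deliberately left out, so this is only an illustration of the state machine, not the real ComputeRowBlocks logic:

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    // Row lengths for a toy matrix: a run of short rows, two long rows, then short again.
    std::vector<int> row_length = { 8, 12, 9, 200, 250, 10, 7 };
    int consecutive_long_rows = 0;

    for (std::size_t i = 0; i < row_length.size(); ++i)
    {
        int len = row_length[i];
        if (len > 128)                        // "long" row
            consecutive_long_rows++;
        else if (consecutive_long_rows > 0)   // medium rows keep a long run alive,
            consecutive_long_rows = (len < 32) ? -1 : consecutive_long_rows + 1; // short rows end it

        if (consecutive_long_rows == 1)       // just flipped short -> long: fence off the short rows
            std::printf("force a row-block boundary before row %zu\n", i);
        else if (consecutive_long_rows == -1) // just flipped long -> short: fence off the long rows
        {
            std::printf("force a row-block boundary before row %zu\n", i);
            consecutive_long_rows = 0;
        }
    }
    return 0;
}

With the lengths above, boundaries land before rows 3 and 5, so the three leading short rows, the two long rows, and the trailing short rows each end up in separate row blocks, which is the split this patch is after.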
+ if ( row_length > 128 ) + consecutive_long_rows++; + else if ( consecutive_long_rows > 0 ) + { + // If it turns out we WERE in a long-row region, cut if off now. + if (row_length < 32) // Now we're in a short-row region + consecutive_long_rows = -1; + else + consecutive_long_rows++; + } + + // If you just entered into a "long" row from a series of short rows, + // then we need to make sure we cut off those short rows. Put them in + // their own workgroup. + if ( consecutive_long_rows == 1 ) + { + // Assuming there *was* a previous workgroup. If not, nothing to do here. + if( i - last_i > 1 ) + { + if (allocate_row_blocks) + { + *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); + // If this row fits into CSR-Stream, calculate how many rows + // can be used to do a parallel reduction. + // Fill in the low-order bits with the numThreadsForRed + if (((i-1) - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction((i - 1) - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i-1; + sum = row_length; + } + } + else if (consecutive_long_rows == -1) + { + // We see the first short row after some long ones that + // didn't previously fill up a row block. + if (allocate_row_blocks) + { + *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); + if (((i-1) - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction((i - 1) - last_i); + rowBlocks++; + } + total_row_blocks++; + last_i = i-1; + sum = row_length; + consecutive_long_rows = 0; + } + + // Now, what's up with this row? What did it do? + // exactly one row results in non-zero elements to be greater than blockSize // This is csr-vector case; bottom WG_BITS == workgroup ID if( ( i - last_i == 1 ) && sum > blkSize ) { - int numWGReq = static_cast< int >( ceil( (double)sum / (blkMultiplier*blkSize) ) ); + int numWGReq = static_cast< int >( ceil( (double)row_length / (blkMultiplier*blkSize) ) ); // Check to ensure #workgroups can fit in WG_BITS bits, if not // then the last workgroup will do all the remaining work numWGReq = ( numWGReq < (int)pow( 2, WG_BITS ) ) ? numWGReq : (int)pow( 2, WG_BITS ); - for( int w = 1; w < numWGReq; w++ ) + if (allocate_row_blocks) { - *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); - *rowBlocks |= static_cast< rowBlockType >( w ); + for( int w = 1; w < numWGReq; w++ ) + { + *rowBlocks = ( (i - 1) << (64 - ROW_BITS) ); + *rowBlocks |= static_cast< rowBlockType >( w ); + rowBlocks++; + } + *rowBlocks = ( i << (64 - ROW_BITS) ); rowBlocks++; } - - *rowBlocks = ( i << (64 -ROW_BITS) ); - rowBlocks++; - + total_row_blocks += numWGReq; last_i = i; sum = 0; + consecutive_long_rows = 0; } // more than one row results in non-zero elements to be greater than blockSize // This is csr-stream case; bottom WG_BITS = number of parallel reduction threads else if( ( i - last_i > 1 ) && sum > blkSize ) { i--; // This row won't fit, so back off one. 
- *rowBlocks = ( i << (64 - ROW_BITS) ); - // Fill in the low-order bits with the numThreadsForRed - if ((i - last_i) > 2) - *(rowBlocks-1) |= numThreadsForReduction(i - last_i); - rowBlocks++; + if (allocate_row_blocks) + { + *rowBlocks = ( i << (64 - ROW_BITS) ); + if ((i - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; last_i = i; sum = 0; + consecutive_long_rows = 0; } // This is csr-stream case; bottom WG_BITS = number of parallel reduction threads else if( sum == blkSize ) { - *rowBlocks = ( i << (64 - ROW_BITS) ); - // Fill in the low-order bits with the numThreadsForRed - if ((i - last_i) > 2) - *(rowBlocks-1) |= numThreadsForReduction(i - last_i); - rowBlocks++; + if (allocate_row_blocks) + { + *rowBlocks = ( i << (64 - ROW_BITS) ); + if ((i - last_i) > rows_for_vector) + *(rowBlocks-1) |= numThreadsForReduction(i - last_i); + rowBlocks++; + } + total_row_blocks++; last_i = i; sum = 0; + consecutive_long_rows = 0; } - } // If we didn't fill a row block with the last row, make sure we don't lose it. - if ( (*(rowBlocks-1) >> (64 - ROW_BITS)) != static_cast< rowBlockType>(nRows) ) + if ( allocate_row_blocks && (*(rowBlocks-1) >> (64 - ROW_BITS)) != static_cast< rowBlockType>(nRows) ) { *rowBlocks = ( static_cast< rowBlockType >( nRows ) << (64 - ROW_BITS) ); - // Fill in the low-order bits with the numThreadsForRed - if ((nRows - last_i) > 2) + if ((nRows - last_i) > rows_for_vector) *(rowBlocks-1) |= numThreadsForReduction(i - last_i); rowBlocks++; } + total_row_blocks++; - size_t dist = std::distance( rowBlocksBase, rowBlocks ); - assert( (2 * dist) <= rowBlockSize ); - // Update the size of rowBlocks to reflect the actual amount of memory used - // We're multiplying the size by two because the extended precision form of - // CSR-Adaptive requires more space for the final global reduction. - rowBlockSize = 2 * dist; + if (allocate_row_blocks) + { + size_t dist = std::distance( rowBlocksBase, rowBlocks ); + assert( (2 * dist) <= rowBlockSize ); + // Update the size of rowBlocks to reflect the actual amount of memory used + // We're multiplying the size by two because the extended precision form of + // CSR-Adaptive requires more space for the final global reduction. 
+ rowBlockSize = 2 * dist; + } + else + rowBlockSize = 2 * total_row_blocks; } #endif diff --git a/src/library/io/mm-reader.cpp b/src/library/io/mm-reader.cpp index eb56f7f..ea4cbeb 100644 --- a/src/library/io/mm-reader.cpp +++ b/src/library/io/mm-reader.cpp @@ -596,9 +596,10 @@ clsparseSCsrMatrixfromFile(clsparseCsrMatrix* csrMatx, const char* filePath, cls if( pCsrMatx->rowBlockSize ) { clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); + ComputeRowBlocks( (cl_ulong*)NULL, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, false ); cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); - ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER ); + ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, true ); } return clsparseSuccess; @@ -666,9 +667,10 @@ clsparseDCsrMatrixfromFile(clsparseCsrMatrix* csrMatx, const char* filePath, cls if( pCsrMatx->rowBlockSize ) { clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); + ComputeRowBlocks( (cl_ulong*)NULL, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, false ); cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); - ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER ); + ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, true ); } return clsparseSuccess; @@ -734,9 +736,10 @@ clsparseDCsrMatrixfromFile(clsparseCsrMatrix* csrMatx, const char* filePath, cls // if( pCsrMatx->rowBlockSize ) // { // clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); +// ComputeRowBlocks( (cl_ulong*)NULL, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, false ); // cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); -// ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER ); +// ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, iCsrRowOffsets, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, true ); // } // return clsparseSuccess; diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index 13ba114..78ff48b 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -295,10 +295,8 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // long) rows, it's actually better to perform a tree-style reduction where // multiple threads team up to reduce the same row. - // The calculation below tell you how many threads this workgroup can allocate + // The calculation below tells you how many threads this workgroup can allocate // to each row, assuming that every row gets the same number of threads. - // We want the closest lower-power-of-2 to this number -- that is how many - // threads can work in each row's reduction using our algorithm. 
// We want the closest lower (or equal) power-of-2 to this number -- // that is how many threads can work in each row's reduction using our algorithm. // For instance, with workgroup size 256, 2 rows = 128 threads, 3 rows = 64 @@ -520,7 +518,7 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // The first workgroup will eventually flip this bit, and you can move forward. barrier(CLK_GLOBAL_MEM_FENCE); while(gid != first_wg_in_row && - lid == 0 && + lid == 0UL && ((atom_max(&rowBlocks[first_wg_in_row], 0UL) & (1UL << WGBITS)) == compare_value)); barrier(CLK_GLOBAL_MEM_FENCE); @@ -538,8 +536,26 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, FPTYPE sumk_e = 0.; int col = vecStart + lid; - for(int j = 0; j < (int)(vecEnd - col); j += WGSIZE) - mySum = two_sum(mySum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); + if (row == stop_row) // inner thread, we can hardcode/unroll this loop + { + // Don't put BLOCK_MULTIPLIER*BLOCKSIZE as the stop point, because + // some GPU compilers will *aggressively* unroll this loop. + // That increases register pressure and reduces occupancy. + for (int j = 0; j < (int)(vecEnd - col); j += WGSIZE) + { + mySum = two_sum(mySum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); +#if 2*WGSIZE <= BLOCK_MULTIPLIER*BLOCKSIZE + // If you can, unroll this loop once. It somewhat helps performance. + j += WGSIZE; + mySum = two_sum(mySum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); +#endif + } + } + else + { + for(int j = 0; j < (int)(vecEnd - col); j += WGSIZE) + mySum = two_sum(mySum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); + } FPTYPE new_error = 0.; mySum = two_sum(mySum, sumk_e, &new_error); @@ -557,7 +573,7 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 1, 2, 3, 4); barrier(CLK_LOCAL_MEM_FENCE); - if (lid == 0) + if (lid == 0UL) { atomic_two_sum_float(&out[row], mySum, &new_error); diff --git a/src/library/transform/clsparse-coo2csr.cpp b/src/library/transform/clsparse-coo2csr.cpp index a463e14..1453ad5 100644 --- a/src/library/transform/clsparse-coo2csr.cpp +++ b/src/library/transform/clsparse-coo2csr.cpp @@ -86,7 +86,7 @@ clsparseCsrMetaCompute( clsparseCsrMatrix* csrMatx, clsparseControl control ) clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks ); cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize ); - ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, rowDelimiters, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER ); + ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, rowDelimiters, pCsrMatx->num_rows, BLKSIZE, BLOCK_MULTIPLIER, ROWS_FOR_VECTOR, true ); return clsparseSuccess; } From ebf357c397d11d3ec05b37e90c40b12df823f7f1 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Mon, 3 Aug 2015 15:23:01 -0500 Subject: [PATCH 11/21] Code cleanup and modifications to improve readability and performance on SpM-DV algorithms. 
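The main readability change is that both the CSR-Stream and the CSR-Vector/LongRows paths now drive a single sum2_reduce helper from a halving loop, for (int i = (WGSIZE >> 1); i > 0; i >>= 1), instead of four hard-coded reduction calls per path. A rough single-threaded C++ model of that pattern is sketched below; two_sum mirrors the device helper, a plain err array stands in for the LDS reuse that passes error terms down, the lane loop stands in for the work-items and barriers, and reduce_model is a name invented for this sketch. It illustrates the arithmetic only and is not the kernel code:

#include <cstddef>
#include <cstdio>
#include <vector>

// Knuth 2Sum, as in the kernel: returns x + y and accumulates the rounding
// error of that addition into *err.
static double two_sum(double x, double y, double *err)
{
    double s = x + y;
    double bp = s - x;
    *err += (x - (s - bp)) + (y - bp);
    return s;
}

// Serial model of the halving reduction: fold partial[lane + stride] into
// partial[lane] with 2Sum, and carry the upper lane's error term along.
static double reduce_model(std::vector<double> partial)
{
    const std::size_t n = partial.size();          // a power of two, e.g. WGSIZE
    std::vector<double> err(n, 0.0);
    for (std::size_t stride = n >> 1; stride > 0; stride >>= 1)
        for (std::size_t lane = 0; lane < stride; ++lane)
        {
            partial[lane] = two_sum(partial[lane], partial[lane + stride], &err[lane]);
            err[lane] += err[lane + stride];       // take over the idle lane's error
            err[lane + stride] = 0.0;
        }
    return partial[0] + err[0];                    // what lane 0 writes out
}

int main()
{
    std::vector<double> partialSums(256, 0.1);     // stand-in for the LDS partial sums
    std::printf("compensated total = %.17g\n", reduce_model(partialSums));
    return 0;
}

The loop form also makes the kernels independent of the particular 64/128/192/256 split the old four-way sum2_reduce was written around, which previously had to change whenever the work-items-per-workgroup count changed.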
--- src/library/kernels/csrmv_adaptive.cl | 223 +++++++++----------------- src/library/kernels/csrmv_general.cl | 20 +-- 2 files changed, 79 insertions(+), 164 deletions(-) diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index 78ff48b..02e9a42 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -16,7 +16,9 @@ R"( #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable -FPTYPE atomic_add_float_extended( global FPTYPE *ptr, FPTYPE temp, FPTYPE *old_sum ) +FPTYPE atomic_add_float_extended( global FPTYPE * restrict const ptr, + const FPTYPE temp, + FPTYPE * restrict const old_sum ) { #ifdef DOUBLE unsigned long newVal; @@ -43,7 +45,7 @@ FPTYPE atomic_add_float_extended( global FPTYPE *ptr, FPTYPE temp, FPTYPE *old_s #endif } -void atomic_add_float( global void *ptr, FPTYPE temp ) +void atomic_add_float( global void * const ptr, const FPTYPE temp ) { atomic_add_float_extended(ptr, temp, 0); } @@ -57,7 +59,7 @@ void atomic_add_float( global void *ptr, FPTYPE temp ) // Returns: The non-corrected sum of inputs x and y. FPTYPE two_sum( FPTYPE x, FPTYPE y, - FPTYPE *sumk_err ) + FPTYPE * restrict const sumk_err ) { FPTYPE sumk_s = x + y; #ifdef EXTENDED_PRECISION @@ -93,9 +95,9 @@ FPTYPE two_sum( FPTYPE x, return sumk_s; } -FPTYPE atomic_two_sum_float( global FPTYPE *x_ptr, +FPTYPE atomic_two_sum_float( global FPTYPE * restrict const x_ptr, FPTYPE y, - FPTYPE *sumk_err ) + FPTYPE * restrict const sumk_err ) { // Have to wait until the return from the atomic op to know what X was. FPTYPE sumk_s = 0.; @@ -123,60 +125,6 @@ FPTYPE atomic_two_sum_float( global FPTYPE *x_ptr, // CPU. It is based on the PSumK algorithm (with K==2) from Yamanaka, Ogita, // Rump, and Oishi, "A Parallel Algorithm of Accurate Dot Product," in // J. Parallel Computing, 34(6-8), pp. 392-410, Jul. 2008. -// Inputs: cur_sum: the input from which our sum starts -// err: the current running cascade error for this final summation -// partial: the local memory which holds the values to sum -// (we eventually use it to pass down temp. err vals as well) -// lid: local ID of the work item calling this function. -// integers: This parallel summation method is meant to take values -// from four different points. See the blow comment for usage. -// Don't inline this function. It reduces performance. -FPTYPE sum2_reduce( FPTYPE cur_sum, - FPTYPE *err, - __local FPTYPE *partial, - size_t lid, - int first, - int second, - int third, - int last ) -{ - // A subset of the threads in this workgroup add three sets - // of values together using the 2Sum method. - // For instance, threads 0-63 would each load four values - // from the upper 192 locations. - // Example: Thread 0 loads 0, 64, 128, and 192. - if (lid < first) - { - cur_sum = two_sum(cur_sum, partial[lid + first], err); - cur_sum = two_sum(cur_sum, partial[lid + second], err); - cur_sum = two_sum(cur_sum, partial[lid + third], err); - } -#ifdef EXTENDED_PRECISION - // We reuse the LDS entries to move the error values down into lower - // threads. This saves LDS space, allowing higher occupancy, but requires - // more barriers, which can reduce performance. - barrier(CLK_LOCAL_MEM_FENCE); - // Have all of those upper threads pass their temporary errors - // into a location that the lower threads can read. - if (lid >= first && lid < last) - partial[lid] = *err; - barrier(CLK_LOCAL_MEM_FENCE); - if (lid < first) // Add those errors in. 
- { - cur_sum = two_sum(cur_sum, partial[lid + first], err); - cur_sum = two_sum(cur_sum, partial[lid + second], err); - cur_sum = two_sum(cur_sum, partial[lid + third], err); - partial[lid] = cur_sum; - } -#endif - return cur_sum; -} - -// A simpler reduction than the one above, also built to use EXTENDED_PRECISION -// Rather than working on large widths, this simply loads a single value from -// an upper entry of the LDS and drops it into the local region. -// It is meant to be called in parallel on an LDS buffer, over enough iterations, -// reduce multiple rows into a small series of output rows. // A version of this method is also used in csrmv_general. // Inputs: cur_sum: the input from which our sum starts // err: the current running cascade error for this final summation @@ -186,15 +134,15 @@ FPTYPE sum2_reduce( FPTYPE cur_sum, // thread_lane: The lane within this thread's row. // max_size: This parallel summation method operates in multiple rounds // to do a parallel reduction. This is the length of each row. -// round: As you reduce data down, this tells you how many output values -// you won't during each round. -FPTYPE simple_sum2_reduce( FPTYPE cur_sum, - FPTYPE *err, - volatile __local FPTYPE *partial, - size_t lid, - int thread_lane, - int max_size, - int reduc_size ) +// reduc_size: As you reduce data down, this tells you how far away +// you will grab a value and add to your own local sum value. +FPTYPE sum2_reduce( FPTYPE cur_sum, + FPTYPE * restrict const err, + volatile __local FPTYPE * restrict const partial, + const size_t lid, + const int thread_lane, + const int max_size, + const int reduc_size ) { if ( max_size > reduc_size ) { @@ -225,18 +173,18 @@ FPTYPE simple_sum2_reduce( FPTYPE cur_sum, } __kernel void -csrmv_adaptive(__global const FPTYPE * restrict vals, - __global const int * restrict cols, - __global const int * restrict rowPtrs, - __global const FPTYPE * restrict vec, - __global FPTYPE * restrict out, - __global unsigned long * restrict rowBlocks, - __global FPTYPE * restrict pAlpha, - __global FPTYPE * restrict pBeta) +csrmv_adaptive(__global const FPTYPE * restrict const vals, + __global const int * restrict const cols, + __global const int * restrict const rowPtrs, + __global const FPTYPE * restrict const vec, + __global FPTYPE * restrict const out, + __global unsigned long * restrict const rowBlocks, + __global const FPTYPE * restrict const pAlpha, + __global const FPTYPE * restrict const pBeta) { __local FPTYPE partialSums[BLOCKSIZE]; - size_t gid = get_group_id(0); - size_t lid = get_local_id(0); + const size_t gid = get_group_id(0); + const size_t lid = get_local_id(0); const FPTYPE alpha = *pAlpha; const FPTYPE beta = *pBeta; @@ -273,6 +221,10 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, unsigned int vecStart = rowPtrs[row] + (wg * BLOCK_MULTIPLIER*BLOCKSIZE); unsigned int vecEnd = (rowPtrs[row + 1] > vecStart + BLOCK_MULTIPLIER*BLOCKSIZE) ? vecStart + BLOCK_MULTIPLIER*BLOCKSIZE : rowPtrs[row + 1]; + FPTYPE temp_sum = 0.; + FPTYPE sumk_e = 0.; + FPTYPE new_error = 0.; + // If the next row block starts more than 2 rows away, then we choose CSR-Stream. // If this is zero (long rows) or one (final workgroup in a long row, or a single // row in a row block), we want to use the CSR-Vector algorithm(s). @@ -334,8 +286,6 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // a row into {numThreadsForRed} locations in local memory. 
// After that, the entire workgroup does a parallel reduction, and each // row ends up with an individual answer. - FPTYPE temp = 0.; - FPTYPE sumk_e = 0.; // {numThreadsForRed} adjacent threads all work on the same row, so their // start and end values are the same. @@ -354,40 +304,28 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, { // only works when numThreadsForRed is a power of 2 for(int i = 0; i < workForEachThread; i++) - temp = two_sum(temp, partialSums[local_first_val + i*numThreadsForRed + threadInBlock], &sumk_e); + temp_sum = two_sum(temp_sum, partialSums[local_first_val + i*numThreadsForRed + threadInBlock], &sumk_e); // The last few values (the remainder of this row) also need to be aded in. int local_cur_val = local_first_val + numThreadsForRed*workForEachThread; if(threadInBlock < local_last_val - local_cur_val) - temp = two_sum(temp, partialSums[local_cur_val + threadInBlock], &sumk_e); + temp_sum = two_sum(temp_sum, partialSums[local_cur_val + threadInBlock], &sumk_e); } barrier(CLK_LOCAL_MEM_FENCE); - FPTYPE new_error = 0.; - temp = two_sum(temp, sumk_e, &new_error); - partialSums[lid] = temp; + temp_sum = two_sum(temp_sum, sumk_e, &new_error); + partialSums[lid] = temp_sum; // Step one of this two-stage reduction is done. Now each row has {numThreadsForRed} // values sitting in the local memory. This means that, roughly, the beginning of // LDS is full up to {workgroup size} entries. // Now we perform a parallel reduction that sums together the answers for each - // row in parallel, leaving us an answer in 'temp' for each row. - barrier(CLK_LOCAL_MEM_FENCE); - temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 128); - barrier( CLK_LOCAL_MEM_FENCE ); - temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 64); - barrier( CLK_LOCAL_MEM_FENCE ); - temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 32); - barrier( CLK_LOCAL_MEM_FENCE ); - temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 16); - barrier( CLK_LOCAL_MEM_FENCE ); - temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 8); - barrier( CLK_LOCAL_MEM_FENCE ); - temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 4); - barrier( CLK_LOCAL_MEM_FENCE ); - temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 2); - barrier( CLK_LOCAL_MEM_FENCE ); - temp = simple_sum2_reduce(temp, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, 1); + // row in parallel, leaving us an answer in 'temp_sum' for each row. + for (int i = (WGSIZE >> 1); i > 0; i >>= 1) + { + barrier( CLK_LOCAL_MEM_FENCE); + temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, i); + } if (threadInBlock == 0 && st < num_rows) { @@ -395,8 +333,8 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // If so, just do a write rather than a read-write. Measured to be a slight (~5%) // performance improvement. if (beta != 0.) 
- temp = two_sum(beta * out[row+st], temp, &new_error); - out[row+st] = temp + new_error; + temp_sum = two_sum(beta * out[row+st], temp_sum, &new_error); + out[row+st] = temp_sum + new_error; } } else @@ -411,16 +349,16 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, { int local_first_val = (rowPtrs[local_row] - rowPtrs[row]); int local_last_val = rowPtrs[local_row + 1] - rowPtrs[row]; - FPTYPE temp = 0.; - FPTYPE sumk_e = 0.; + temp_sum = 0.; + sumk_e = 0.; for (int local_cur_val = local_first_val; local_cur_val < local_last_val; local_cur_val++) - temp = two_sum(temp, partialSums[local_cur_val], &sumk_e); + temp_sum = two_sum(temp_sum, partialSums[local_cur_val], &sumk_e); - // After you've done the reduction into the temp register, + // After you've done the reduction into the temp_sum register, // put that into the output for each row. if (beta != 0.) - temp = two_sum(beta * out[local_row], temp, &sumk_e); - out[local_row] = temp + sumk_e; + temp_sum = two_sum(beta * out[local_row], temp_sum, &sumk_e); + out[local_row] = temp_sum + sumk_e; local_row += WGSIZE; } } @@ -442,43 +380,38 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, { // Any workgroup only calculates, at most, BLOCKSIZE items in this row. // If there are more items in this row, we use CSR-LongRows. + temp_sum = 0.; + sumk_e = 0.; + new_error = 0.; vecStart = rowPtrs[myRow]; vecEnd = rowPtrs[myRow+1]; // Load in a bunch of partial results into your register space, rather than LDS (no contention) // Then dump the partially reduced answers into the LDS for inter-work-item reduction. // Using a long induction variable to make sure unsigned int overflow doesn't break things. - FPTYPE mySum = 0.; - FPTYPE sumk_e = 0.; for (long j = vecStart + lid; j < vecEnd; j+=WGSIZE) { unsigned int col = cols[(unsigned int)j]; - mySum = two_sum(mySum, alpha * vals[(unsigned int)j] * vec[col], &sumk_e); + temp_sum = two_sum(temp_sum, alpha * vals[(unsigned int)j] * vec[col], &sumk_e); } - FPTYPE new_error = 0.; - mySum = two_sum(mySum, sumk_e, &new_error); - partialSums[lid] = mySum; - barrier(CLK_LOCAL_MEM_FENCE); + temp_sum = two_sum(temp_sum, sumk_e, &new_error); + partialSums[lid] = temp_sum; // Reduce partial sums - // These numbers need to change if the # of work-items/WG changes. - mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 64, 128, 192, 256); - barrier(CLK_LOCAL_MEM_FENCE); - mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 16, 32, 48, 64); - barrier(CLK_LOCAL_MEM_FENCE); - mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 4, 8, 12, 16); - barrier(CLK_LOCAL_MEM_FENCE); - mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 1, 2, 3, 4); - barrier(CLK_LOCAL_MEM_FENCE); + for (int i = (WGSIZE >> 1); i > 0; i >>= 1) + { + barrier( CLK_LOCAL_MEM_FENCE); + temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, lid, WGSIZE, i); + } if (lid == 0UL) { if (beta != 0.) - mySum = two_sum(beta * out[myRow], mySum, &new_error); - out[myRow] = mySum + new_error; + temp_sum = two_sum(beta * out[myRow], temp_sum, &new_error); + out[myRow] = temp_sum + new_error; } - // CSR-VECTOR on workgroups for two rows which are inefficient for CSR-Stream + // CSR-Vector can possibly work on multiple rows one after the other. myRow++; } } @@ -531,10 +464,6 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // and stop_row. They only run for one iteration. 
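For reference, a minimal host-side C++ sketch (not part of this patch) of the compensated-summation pattern the kernel uses above: accumulate a row's products with two_sum while carrying a running error, then fold that error in once at the end. It collapses the kernel's two error accumulators into one and uses illustrative names (two_sum_host, csr_row_dot).

#include <cmath>

static double two_sum_host(double x, double y, double* err)
{
    // Fast2Sum: exact once |x| >= |y|, hence the swap (same trick as the kernel).
    double s = x + y;
    if (std::fabs(x) < std::fabs(y)) { double t = x; x = y; y = t; }
    *err += y - (s - x);
    return s;
}

// One CSR row of y = beta*y + alpha*A*x, with compensated accumulation.
static double csr_row_dot(const double* vals, const int* cols,
                          int row_start, int row_end,
                          const double* x, double alpha, double beta, double y_old)
{
    double sum = 0.0, err = 0.0;
    for (int j = row_start; j < row_end; ++j)
        sum = two_sum_host(alpha * vals[j] * x[cols[j]], sum, &err);
    if (beta != 0.0)
        sum = two_sum_host(beta * y_old, sum, &err);
    return sum + err;   // fold the accumulated rounding error in once
}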
// Load in a bunch of partial results into your register space, rather than LDS (no contention) // Then dump the partially reduced answers into the LDS for inter-work-item reduction. - // Using a long induction variable to make sure unsigned int overflow doesn't break things. - FPTYPE mySum = 0.; - FPTYPE sumk_e = 0.; - int col = vecStart + lid; if (row == stop_row) // inner thread, we can hardcode/unroll this loop { @@ -543,39 +472,33 @@ csrmv_adaptive(__global const FPTYPE * restrict vals, // That increases register pressure and reduces occupancy. for (int j = 0; j < (int)(vecEnd - col); j += WGSIZE) { - mySum = two_sum(mySum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); + temp_sum = two_sum(temp_sum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); #if 2*WGSIZE <= BLOCK_MULTIPLIER*BLOCKSIZE // If you can, unroll this loop once. It somewhat helps performance. j += WGSIZE; - mySum = two_sum(mySum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); + temp_sum = two_sum(temp_sum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); #endif } } else { for(int j = 0; j < (int)(vecEnd - col); j += WGSIZE) - mySum = two_sum(mySum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); + temp_sum = two_sum(temp_sum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); } - FPTYPE new_error = 0.; - mySum = two_sum(mySum, sumk_e, &new_error); - partialSums[lid] = mySum; - barrier(CLK_LOCAL_MEM_FENCE); + temp_sum = two_sum(temp_sum, sumk_e, &new_error); + partialSums[lid] = temp_sum; // Reduce partial sums - // Needs to be modified if there is a change in the # of work-items per workgroup. - mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 64, 128, 192, 256); - barrier(CLK_LOCAL_MEM_FENCE); - mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 16, 32, 48, 64); - barrier(CLK_LOCAL_MEM_FENCE); - mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 4, 8, 12, 16); - barrier(CLK_LOCAL_MEM_FENCE); - mySum = sum2_reduce(mySum, &new_error, partialSums, lid, 1, 2, 3, 4); - barrier(CLK_LOCAL_MEM_FENCE); + for (int i = (WGSIZE >> 1); i > 0; i >>= 1) + { + barrier( CLK_LOCAL_MEM_FENCE); + temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, lid, WGSIZE, i); + } if (lid == 0UL) { - atomic_two_sum_float(&out[row], mySum, &new_error); + atomic_two_sum_float(&out[row], temp_sum, &new_error); #ifdef EXTENDED_PRECISION // The last half of the rowBlocks buffer is used to hold errors. diff --git a/src/library/kernels/csrmv_general.cl b/src/library/kernels/csrmv_general.cl index 0ce6208..b5d6065 100644 --- a/src/library/kernels/csrmv_general.cl +++ b/src/library/kernels/csrmv_general.cl @@ -172,7 +172,7 @@ void csrmv_general ( const INDEX_TYPE num_rows, { const int row_start = row_offset[row]; const int row_end = row_offset[row+1]; - VALUE_TYPE sum = (VALUE_TYPE) 0; + VALUE_TYPE sum = 0.; VALUE_TYPE sumk_e = 0.; // It is about 5% faster to always multiply by alpha, rather than to @@ -184,7 +184,6 @@ void csrmv_general ( const INDEX_TYPE num_rows, // Parallel reduction in shared memory. sdata[local_id] = sum; - barrier( CLK_LOCAL_MEM_FENCE ); // This compensated summation reduces cummulative rounding errors, // which can become a problem on GPUs because our reduction order is @@ -193,18 +192,11 @@ void csrmv_general ( const INDEX_TYPE num_rows, // Yamanaka, Ogita, Rump, and Oishi, "A Parallel Algorithm of // Accurate Dot Product," in the Journal of Parallel Computing, // 34(6-8), pp. 392-410, Jul. 2008. 
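The unrolled sum2_reduce calls below are replaced by a single stride-halving loop. As a sketch only, here is a serial C++ model of the same shape for the case where each work item's lane equals its local id (barriers omitted, every "work item" handled by a plain loop, wg_size assumed to be a power of two as in the kernel); sum2_reduce additionally folds the running error values, which is omitted here for brevity.

#include <cstddef>
#include <vector>

static void reduce_rounds(std::vector<double>& partial /* one slot per work item */)
{
    const std::size_t wg_size = partial.size();
    for (std::size_t stride = wg_size >> 1; stride > 0; stride >>= 1)
    {
        // barrier(CLK_LOCAL_MEM_FENCE) would sit here in the kernel.
        for (std::size_t lid = 0; lid < stride; ++lid)
            partial[lid] += partial[lid + stride];
    }
    // partial[0] now holds the reduced value.
}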
- sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, 32); - barrier( CLK_LOCAL_MEM_FENCE ); - sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, 16); - barrier( CLK_LOCAL_MEM_FENCE ); - sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, 8); - barrier( CLK_LOCAL_MEM_FENCE ); - sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, 4); - barrier( CLK_LOCAL_MEM_FENCE ); - sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, 2); - barrier( CLK_LOCAL_MEM_FENCE ); - sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, 1); - barrier( CLK_LOCAL_MEM_FENCE ); + for (int i = (WG_SIZE >> 1); i > 0; i >>= 1) + { + barrier( CLK_LOCAL_MEM_FENCE ); + sum = sum2_reduce(sum, &new_error, sdata, local_id, thread_lane, i); + } if (thread_lane == 0) { From b23c29c3a8084a3a7e1ae29c9d226fa0b8eb8cbb Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Mon, 3 Aug 2015 15:23:40 -0500 Subject: [PATCH 12/21] Changing the csrmv_adaptive configuration parameter from sending 2 rows at a time to CSR-Vector to only 1. It turns out that, after recent modifications to CSR-Stream, it is more efficient to use CSR-Stream for this case. --- src/library/include/clSPARSE-private.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/library/include/clSPARSE-private.hpp b/src/library/include/clSPARSE-private.hpp index 9db9930..c531bc4 100644 --- a/src/library/include/clSPARSE-private.hpp +++ b/src/library/include/clSPARSE-private.hpp @@ -27,6 +27,6 @@ const cl_uint WG_BITS = 24; const cl_uint ROW_BITS = 32; const cl_uint BLKSIZE = 1024; const cl_uint BLOCK_MULTIPLIER = 3; -const cl_uint ROWS_FOR_VECTOR = 2; +const cl_uint ROWS_FOR_VECTOR = 1; #endif From 5ae446837f8a644820d16b78602e773c434a3485 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Mon, 10 Aug 2015 20:02:51 -0500 Subject: [PATCH 13/21] Adding consts on a bunch of constant variables. Changing some data types from size_t to unsigned int, since we currently do not work on extremely large data structures. Changing around some other data types. In general, all of this results in some minor performance gains on Fiji GPUs. --- src/library/kernels/csrmv_adaptive.cl | 105 ++++++++++++-------------- 1 file changed, 50 insertions(+), 55 deletions(-) diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index 02e9a42..85d73e7 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -61,7 +61,7 @@ FPTYPE two_sum( FPTYPE x, FPTYPE y, FPTYPE * restrict const sumk_err ) { - FPTYPE sumk_s = x + y; + const FPTYPE sumk_s = x + y; #ifdef EXTENDED_PRECISION // We use this 2Sum algorithm to perform a compensated summation, // which can reduce the cummulative rounding errors in our SpMV summation. @@ -80,16 +80,15 @@ FPTYPE two_sum( FPTYPE x, // and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher // Summen (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51, // 1974), respectively. - FPTYPE swap; if (fabs(x) < fabs(y)) { - swap = x; + const FPTYPE swap = x; x = y; y = swap; } (*sumk_err) += (y - (sumk_s - x)); // Original 6 FLOP 2Sum algorithm. - //FPTYPE bp = sumk_s - x; + //const FPTYPE bp = sumk_s - x; //(*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); #endif return sumk_s; @@ -102,11 +101,11 @@ FPTYPE atomic_two_sum_float( global FPTYPE * restrict const x_ptr, // Have to wait until the return from the atomic op to know what X was. 
FPTYPE sumk_s = 0.; #ifdef EXTENDED_PRECISION - FPTYPE x, swap; + FPTYPE x; sumk_s = atomic_add_float_extended(x_ptr, y, &x); if (fabs(x) < fabs(y)) { - swap = x; + const FPTYPE swap = x; x = y; y = swap; } @@ -138,30 +137,29 @@ FPTYPE atomic_two_sum_float( global FPTYPE * restrict const x_ptr, // you will grab a value and add to your own local sum value. FPTYPE sum2_reduce( FPTYPE cur_sum, FPTYPE * restrict const err, - volatile __local FPTYPE * restrict const partial, - const size_t lid, - const int thread_lane, - const int max_size, - const int reduc_size ) + __local FPTYPE * restrict const partial, + const unsigned int lid, + const unsigned int thread_lane, + const unsigned int max_size, + const unsigned int reduc_size ) { if ( max_size > reduc_size ) { #ifdef EXTENDED_PRECISION + const unsigned int partial_dest = lid + reduc_size; if (thread_lane < reduc_size) - cur_sum = two_sum(cur_sum, partial[lid + reduc_size], err); - + cur_sum = two_sum(cur_sum, partial[partial_dest], err); // We reuse the LDS entries to move the error values down into lower // threads. This saves LDS space, allowing higher occupancy, but requires // more barriers, which can reduce performance. barrier(CLK_LOCAL_MEM_FENCE); // Have all of those upper threads pass their temporary errors // into a location that the lower threads can read. - if (thread_lane >= reduc_size) - partial[lid] = *err; + partial[lid] = (thread_lane >= reduc_size) ? *err : partial[lid]; barrier(CLK_LOCAL_MEM_FENCE); if (thread_lane < reduc_size) // Add those errors in. { - *err += partial[lid + reduc_size]; + *err += partial[partial_dest]; partial[lid] = cur_sum; } #else @@ -174,8 +172,8 @@ FPTYPE sum2_reduce( FPTYPE cur_sum, __kernel void csrmv_adaptive(__global const FPTYPE * restrict const vals, - __global const int * restrict const cols, - __global const int * restrict const rowPtrs, + __global const unsigned int * restrict const cols, + __global const unsigned int * restrict const rowPtrs, __global const FPTYPE * restrict const vec, __global FPTYPE * restrict const out, __global unsigned long * restrict const rowBlocks, @@ -183,8 +181,8 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, __global const FPTYPE * restrict const pBeta) { __local FPTYPE partialSums[BLOCKSIZE]; - const size_t gid = get_group_id(0); - const size_t lid = get_local_id(0); + const unsigned int gid = get_group_id(0); + const unsigned int lid = get_local_id(0); const FPTYPE alpha = *pAlpha; const FPTYPE beta = *pBeta; @@ -210,11 +208,11 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // value. While this bit is the same as the first workgroup's flag bit, this // workgroup will spin-loop. unsigned int row = ((rowBlocks[gid] >> (64-ROWBITS)) & ((1UL << ROWBITS) - 1UL)); - unsigned int stop_row = ((rowBlocks[gid + 1] >> (64-ROWBITS)) & ((1UL << ROWBITS) - 1UL)); - unsigned int num_rows = stop_row - row; + const unsigned int stop_row = ((rowBlocks[gid + 1] >> (64-ROWBITS)) & ((1UL << ROWBITS) - 1UL)); + const unsigned int num_rows = stop_row - row; // Get the "workgroup within this long row" ID out of the bottom bits of the row block. - unsigned int wg = rowBlocks[gid] & ((1 << WGBITS) - 1); + const unsigned int wg = rowBlocks[gid] & ((1 << WGBITS) - 1); // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. // If there are more items in this row, we assign more workgroups. 
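The rowBlocks decode used above packs two fields into one 64-bit entry: the starting row in the top ROWBITS bits and the "workgroup within a long row" counter in the low WGBITS bits, with bit WGBITS itself serving as the CSR-LongRows sync flag. A small C++ sketch of that packing, using the ROW_BITS = 32 / WG_BITS = 24 values from clSPARSE-private.hpp (helper names are illustrative):

#include <cstdint>

constexpr unsigned ROW_BITS = 32;
constexpr unsigned WG_BITS  = 24;

constexpr std::uint64_t pack_row_block(std::uint64_t row, std::uint64_t wg)
{
    return (row << (64 - ROW_BITS)) | (wg & ((1ULL << WG_BITS) - 1));
}

constexpr std::uint64_t unpack_row(std::uint64_t rb)
{
    return (rb >> (64 - ROW_BITS)) & ((1ULL << ROW_BITS) - 1);
}

constexpr std::uint64_t unpack_wg(std::uint64_t rb)
{
    return rb & ((1ULL << WG_BITS) - 1);
}

static_assert(unpack_row(pack_row_block(12345, 6)) == 12345, "row survives packing");
static_assert(unpack_wg(pack_row_block(12345, 6)) == 6, "wg survives packing");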
@@ -254,11 +252,11 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // For instance, with workgroup size 256, 2 rows = 128 threads, 3 rows = 64 // threads, 4 rows = 64 threads, 5 rows = 32 threads, etc. //int numThreadsForRed = get_local_size(0) >> ((CHAR_BIT*sizeof(unsigned int))-clz(num_rows-1)); - int numThreadsForRed = wg; // Same calculation as above, done on host. + const unsigned int numThreadsForRed = wg; // Same calculation as above, done on host. // Stream all of this row block's matrix values into local memory. // Perform the matvec in parallel with this work. - int col = rowPtrs[row] + lid; + const unsigned int col = rowPtrs[row] + lid; if (gid != (get_num_groups(0) - 1)) { for(int i = 0; i < BLOCKSIZE; i += WGSIZE) @@ -273,7 +271,7 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // However, this may change in the future (e.g. with shared virtual memory.) // This causes a minor performance loss because this is the last workgroup // to be launched, and this loop can't be unrolled. - int max_to_load = rowPtrs[stop_row] - rowPtrs[row]; + const unsigned int max_to_load = rowPtrs[stop_row] - rowPtrs[row]; for(int i = 0; i < max_to_load; i += WGSIZE) partialSums[lid + i] = alpha * vals[col + i] * vec[cols[col + i]]; } @@ -292,15 +290,14 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // numThreadsForRed guaranteed to be a power of two, so the clz code below // avoids an integer divide. ~2% perf gain in EXTRA_PRECISION. //size_t st = lid/numThreadsForRed; - size_t st = lid >> (31 - clz(numThreadsForRed)); - int local_first_val = (rowPtrs[row + st] - rowPtrs[row]); - int local_last_val = rowPtrs[row + st + 1] - rowPtrs[row]; - int workForEachThread = (local_last_val - local_first_val) >> (31 - clz(numThreadsForRed)); - size_t threadInBlock = lid & (numThreadsForRed - 1); + const unsigned int local_row = row + (lid >> (31 - clz(numThreadsForRed))); + const unsigned int local_first_val = rowPtrs[local_row] - rowPtrs[row]; + const unsigned int local_last_val = rowPtrs[local_row + 1] - rowPtrs[row]; + const unsigned int threadInBlock = lid & (numThreadsForRed - 1); // Not all row blocks are full -- they may have an odd number of rows. As such, // we need to ensure that adjacent-groups only work on real data for this rowBlock. - if(st < num_rows) + if(local_row < stop_row) { // only works when numThreadsForRed is a power of 2 for(int i = 0; i < workForEachThread; i++) @@ -323,18 +320,18 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // row in parallel, leaving us an answer in 'temp_sum' for each row. for (int i = (WGSIZE >> 1); i > 0; i >>= 1) { - barrier( CLK_LOCAL_MEM_FENCE); + barrier( CLK_LOCAL_MEM_FENCE ); temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, i); } - if (threadInBlock == 0 && st < num_rows) + if (threadInBlock == 0 && local_row < stop_row) { // All of our write-outs check to see if the output vector should first be zeroed. // If so, just do a write rather than a read-write. Measured to be a slight (~5%) // performance improvement. if (beta != 0.) - temp_sum = two_sum(beta * out[row+st], temp_sum, &new_error); - out[row+st] = temp_sum + new_error; + temp_sum = two_sum(beta * out[local_row], temp_sum, &new_error); + out[local_row] = temp_sum + new_error; } } else @@ -370,28 +367,26 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // done with CSR-LongRows, then all of its workgroups (except the last one) will have the // same stop_row and row. 
The final workgroup in a LongRow will have stop_row and row // different, but the internal "wg" number will be non-zero. - unsigned int myRow = row; // Adding another variable for row as it is getting updated below - // CSR-Vector will always do at least one iteration. // If this workgroup is operating on multiple rows (because CSR-Stream is poor for small - // numberss of rows), then it needs to iterate until it reaches the stop_row. + // numbers of rows), then it needs to iterate until it reaches the stop_row. // We don't check <= stop_row because of the potential for unsigned overflow. - while (myRow < stop_row) + while (row < stop_row) { // Any workgroup only calculates, at most, BLOCKSIZE items in this row. // If there are more items in this row, we use CSR-LongRows. temp_sum = 0.; sumk_e = 0.; new_error = 0.; - vecStart = rowPtrs[myRow]; - vecEnd = rowPtrs[myRow+1]; + vecStart = rowPtrs[row]; + vecEnd = rowPtrs[row+1]; // Load in a bunch of partial results into your register space, rather than LDS (no contention) // Then dump the partially reduced answers into the LDS for inter-work-item reduction. // Using a long induction variable to make sure unsigned int overflow doesn't break things. for (long j = vecStart + lid; j < vecEnd; j+=WGSIZE) { - unsigned int col = cols[(unsigned int)j]; + const unsigned int col = cols[(unsigned int)j]; temp_sum = two_sum(temp_sum, alpha * vals[(unsigned int)j] * vec[col], &sumk_e); } @@ -408,11 +403,10 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, if (lid == 0UL) { if (beta != 0.) - temp_sum = two_sum(beta * out[myRow], temp_sum, &new_error); - out[myRow] = temp_sum + new_error; + temp_sum = two_sum(beta * out[row], temp_sum, &new_error); + out[row] = temp_sum + new_error; } - // CSR-Vector can possibly work on multiple rows one after the other. - myRow++; + row++; } } else @@ -430,8 +424,8 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // First, figure out which workgroup you are in the row. Bottom 24 bits. // You can use that to find the global ID for the first workgroup calculating // this long row. - size_t first_wg_in_row = gid - (rowBlocks[gid] & ((1UL << WGBITS) - 1UL)); - unsigned int compare_value = rowBlocks[gid] & (1UL << WGBITS); + const unsigned int first_wg_in_row = gid - (rowBlocks[gid] & ((1UL << WGBITS) - 1UL)); + const unsigned int compare_value = rowBlocks[gid] & (1UL << WGBITS); // Bit 24 in the first workgroup is the flag that everyone waits on. if(gid == first_wg_in_row && lid == 0UL) @@ -464,7 +458,7 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // and stop_row. They only run for one iteration. // Load in a bunch of partial results into your register space, rather than LDS (no contention) // Then dump the partially reduced answers into the LDS for inter-work-item reduction. - int col = vecStart + lid; + const unsigned int col = vecStart + lid; if (row == stop_row) // inner thread, we can hardcode/unroll this loop { // Don't put BLOCK_MULTIPLIER*BLOCKSIZE as the stop point, because @@ -501,8 +495,9 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, atomic_two_sum_float(&out[row], temp_sum, &new_error); #ifdef EXTENDED_PRECISION + unsigned int error_loc = get_num_groups(0) + first_wg_in_row + 1; // The last half of the rowBlocks buffer is used to hold errors. 
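The atomic accumulation used by atomic_add_float / atomic_two_sum_float in this kernel is a compare-exchange loop over the value's bit pattern, followed by a Fast2Sum on the returned old value to keep the rounding error. A host-side C++ analogue, assuming the double is stored behind a std::atomic<uint64_t> slot (illustrative only, not clSPARSE code):

#include <atomic>
#include <cmath>
#include <cstdint>
#include <cstring>

static double atomic_add_double(std::atomic<std::uint64_t>& slot, double y, double* old_out)
{
    std::uint64_t prev = slot.load();
    std::uint64_t next;
    double old_val, new_val;
    do {
        std::memcpy(&old_val, &prev, sizeof old_val);
        new_val = old_val + y;
        std::memcpy(&next, &new_val, sizeof next);
    } while (!slot.compare_exchange_weak(prev, next));  // prev reloads on failure
    if (old_out) *old_out = old_val;
    return new_val;
}

// Mirrors atomic_two_sum_float: add atomically, then Fast2Sum(old value, y)
// to accumulate the rounding error of that add.
static double atomic_two_sum_double(std::atomic<std::uint64_t>& slot, double y, double* err)
{
    double x;
    double s = atomic_add_double(slot, y, &x);
    if (std::fabs(x) < std::fabs(y)) { double t = x; x = y; y = t; }
    *err += y - (s - x);
    return s;
}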
- atomic_add_float(&(rowBlocks[get_num_groups(0)+first_wg_in_row+1]), new_error); + atomic_add_float(&(rowBlocks[error_loc]), new_error); // Coordinate across all of the workgroups in this coop in order to have // the last workgroup fix up the error values. // If this workgroup's row is different than the next workgroup's row @@ -517,14 +512,14 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, while((atom_max(&rowBlocks[first_wg_in_row], 0UL) & ((1UL << WGBITS) - 1)) != wg); #ifdef DOUBLE - new_error = as_double(rowBlocks[get_num_groups(0)+first_wg_in_row+1]); + new_error = as_double(rowBlocks[error_loc]); #else - new_error = as_float((int)rowBlocks[get_num_groups(0)+first_wg_in_row+1]); + new_error = as_float((int)rowBlocks[error_loc]); #endif // Don't need to work atomically here, because this is the only workgroup // left working on this row. out[row] += new_error; - rowBlocks[get_num_groups(0)+first_wg_in_row+1] = 0UL; + rowBlocks[error_loc] = 0UL; // Reset the rowBlocks low order bits for next time. rowBlocks[first_wg_in_row] = rowBlocks[gid] - wg; @@ -534,7 +529,7 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // Otherwise, increment the low order bits of the first thread in this // coop. We're using this to tell how many workgroups in a coop are done. // Do this with an atomic, since other threads may be doing this too. - unsigned long no_warn = atom_inc(&rowBlocks[first_wg_in_row]); + const unsigned long no_warn = atom_inc(&rowBlocks[first_wg_in_row]); } #endif } From 7daac8879cb78b506b6c82734e398334de4d00e0 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Mon, 10 Aug 2015 20:13:48 -0500 Subject: [PATCH 14/21] Adding floating point FMA into CSR-Adaptive. This results in minor performance increases on DPFP-starved GPUs when working in double precision mode. --- src/library/kernels/csrmv_adaptive.cl | 45 ++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index 85d73e7..787d1d0 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -94,6 +94,35 @@ FPTYPE two_sum( FPTYPE x, return sumk_s; } +// Performs (x_vals * x_vec) + y using an FMA. +// Ideally, we would perform an error-free transformation here and return the +// appropriate error. However, the EFT of an FMA is very expensive. As such, +// if we are in EXTENDED_PRECISION mode, this function devolves into two_sum +// with x_vals and x_vec inputs multiplied separately from the compensated add. +FPTYPE two_fma( const FPTYPE x_vals, + const FPTYPE x_vec, + FPTYPE y, + FPTYPE * restrict const sumk_err ) +{ +#ifdef EXTENDED_PRECISION + FPTYPE x = x_vals * x_vec; + const FPTYPE sumk_s = x + y; + if (fabs(x) < fabs(y)) + { + const FPTYPE swap = x; + x = y; + y = swap; + } + (*sumk_err) += (y - (sumk_s - x)); + // 2Sum in the FMA case. Poor performance on low-DPFP GPUs. + //const FPTYPE bp = fma(-x_vals, x_vec, sumk_s); + //(*sumk_err) += (fma(x_vals, x_vec, -(sumk_s - bp)) + (y - bp)); + return sumk_s; +#else + return fma(x_vals, x_vec, y); +#endif +} + FPTYPE atomic_two_sum_float( global FPTYPE * restrict const x_ptr, FPTYPE y, FPTYPE * restrict const sumk_err ) @@ -330,7 +359,7 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // If so, just do a write rather than a read-write. Measured to be a slight (~5%) // performance improvement. if (beta != 0.) 
- temp_sum = two_sum(beta * out[local_row], temp_sum, &new_error); + temp_sum = two_fma(beta, out[local_row], temp_sum, &new_error); out[local_row] = temp_sum + new_error; } } @@ -349,12 +378,12 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, temp_sum = 0.; sumk_e = 0.; for (int local_cur_val = local_first_val; local_cur_val < local_last_val; local_cur_val++) - temp_sum = two_sum(temp_sum, partialSums[local_cur_val], &sumk_e); + temp_sum = two_sum(partialSums[local_cur_val], temp_sum, &sumk_e); // After you've done the reduction into the temp_sum register, // put that into the output for each row. if (beta != 0.) - temp_sum = two_sum(beta * out[local_row], temp_sum, &sumk_e); + temp_sum = two_fma(beta, out[local_row], temp_sum, &sumk_e); out[local_row] = temp_sum + sumk_e; local_row += WGSIZE; } @@ -387,7 +416,7 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, for (long j = vecStart + lid; j < vecEnd; j+=WGSIZE) { const unsigned int col = cols[(unsigned int)j]; - temp_sum = two_sum(temp_sum, alpha * vals[(unsigned int)j] * vec[col], &sumk_e); + temp_sum = two_fma(alpha*vals[(unsigned int)j], vec[col], temp_sum, &sumk_e); } temp_sum = two_sum(temp_sum, sumk_e, &new_error); @@ -403,7 +432,7 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, if (lid == 0UL) { if (beta != 0.) - temp_sum = two_sum(beta * out[row], temp_sum, &new_error); + temp_sum = two_fma(beta, out[row], temp_sum, &new_error); out[row] = temp_sum + new_error; } row++; @@ -466,18 +495,18 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // That increases register pressure and reduces occupancy. for (int j = 0; j < (int)(vecEnd - col); j += WGSIZE) { - temp_sum = two_sum(temp_sum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); + temp_sum = two_fma(alpha*vals[col + j], vec[cols[col + j]], temp_sum, &sumk_e); #if 2*WGSIZE <= BLOCK_MULTIPLIER*BLOCKSIZE // If you can, unroll this loop once. It somewhat helps performance. j += WGSIZE; - temp_sum = two_sum(temp_sum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); + temp_sum = two_fma(alpha*vals[col + j], vec[cols[col + j]], temp_sum, &sumk_e); #endif } } else { for(int j = 0; j < (int)(vecEnd - col); j += WGSIZE) - temp_sum = two_sum(temp_sum, alpha * vals[col + j] * vec[cols[col + j]], &sumk_e); + temp_sum = two_fma(alpha*vals[col + j], vec[cols[col + j]], temp_sum, &sumk_e); } temp_sum = two_sum(temp_sum, sumk_e, &new_error); From 0e773e2e71a20f1ed4cc59ce1da952e9800d7591 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Mon, 10 Aug 2015 20:15:15 -0500 Subject: [PATCH 15/21] Changing around how CSR-Stream reduction is performed. Removing a lot of multiplications that snuck their way into the code. 32-bit integer multiply is slow (up to 16x slower than addition on AMD GPUs), so replaced with full-speed addition or full-speed 24-bit multiply when required. Results in major performance gains on Fiji GPUs. --- src/library/kernels/csrmv_adaptive.cl | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index 787d1d0..59a1220 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -245,7 +245,7 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. // If there are more items in this row, we assign more workgroups. 
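The CSR-Stream change that follows replaces the "workForEachThread plus remainder" pair of loops with one strided loop: each of the numThreadsForRed threads assigned to a row starts at its own offset and walks that row's partial sums with stride numThreadsForRed. (The same patch also swaps the 32-bit multiply in vecStart for mad24, which is safe because wg is masked to WG_BITS = 24 bits and BLOCK_MULTIPLIER*BLOCKSIZE = 3072 also fits in 24 bits.) A serial C++ model of the values one thread accumulates, with illustrative names and the two_sum compensation elided:

#include <vector>

static double stream_thread_partial(const std::vector<double>& partial_sums,
                                    unsigned local_first_val,
                                    unsigned local_last_val,
                                    unsigned thread_in_block,
                                    unsigned num_threads_for_red)
{
    double sum = 0.0;
    for (unsigned cur = local_first_val + thread_in_block;
         cur < local_last_val;
         cur += num_threads_for_red)
        sum += partial_sums[cur];        // the kernel uses two_sum() here
    return sum;
}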
- unsigned int vecStart = rowPtrs[row] + (wg * BLOCK_MULTIPLIER*BLOCKSIZE); + unsigned int vecStart = mad24(wg, (unsigned int)(BLOCK_MULTIPLIER*BLOCKSIZE), rowPtrs[row]); unsigned int vecEnd = (rowPtrs[row + 1] > vecStart + BLOCK_MULTIPLIER*BLOCKSIZE) ? vecStart + BLOCK_MULTIPLIER*BLOCKSIZE : rowPtrs[row + 1]; FPTYPE temp_sum = 0.; @@ -328,14 +328,13 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // we need to ensure that adjacent-groups only work on real data for this rowBlock. if(local_row < stop_row) { - // only works when numThreadsForRed is a power of 2 - for(int i = 0; i < workForEachThread; i++) - temp_sum = two_sum(temp_sum, partialSums[local_first_val + i*numThreadsForRed + threadInBlock], &sumk_e); - - // The last few values (the remainder of this row) also need to be aded in. - int local_cur_val = local_first_val + numThreadsForRed*workForEachThread; - if(threadInBlock < local_last_val - local_cur_val) - temp_sum = two_sum(temp_sum, partialSums[local_cur_val + threadInBlock], &sumk_e); + // This is dangerous -- will infinite loop if your last value is within + // numThreadsForRed of MAX_UINT. Noticable performance gain to avoid a + // long induction variable here, though. + for(unsigned int local_cur_val = local_first_val + threadInBlock; + local_cur_val < local_last_val; + local_cur_val += numThreadsForRed) + temp_sum = two_sum(partialSums[local_cur_val], temp_sum, &sumk_e); } barrier(CLK_LOCAL_MEM_FENCE); From 30f32f880c3d53975b5ee6c321d9ae07be5ea06c Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Tue, 11 Aug 2015 02:59:04 -0500 Subject: [PATCH 16/21] Made changes to preprocessor macros to align with csrmv_general. Basic work to better decide whether our target hardware supports appropriate atomics. Currently does not work with targets that support fp64 but not 64-bit integer atomics. 
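As an illustration of the capability check this commit message refers to (not code from this patch set), the host side could inspect the device's extension string before relying on the kernel's 64-bit atomic paths; clGetDeviceInfo with CL_DEVICE_EXTENSIONS returns the space-separated list the kernel's ATOM64 guard keys off:

#include <CL/cl.h>
#include <string>
#include <vector>

// Returns true if the device advertises the 64-bit integer atomic extensions
// that the 64-bit data paths in csrmv_adaptive check for. Illustrative only.
static bool device_has_int64_atomics(cl_device_id device)
{
    size_t size = 0;
    if (clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, nullptr, &size) != CL_SUCCESS)
        return false;
    std::vector<char> buf(size, '\0');
    if (clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, size, buf.data(), nullptr) != CL_SUCCESS)
        return false;
    const std::string ext(buf.data());
    return ext.find("cl_khr_int64_base_atomics") != std::string::npos &&
           ext.find("cl_khr_int64_extended_atomics") != std::string::npos;
}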
--- src/library/blas2/csrmv-adaptive.hpp | 45 ++++-- src/library/kernels/csrmv_adaptive.cl | 199 ++++++++++++++++++-------- 2 files changed, 173 insertions(+), 71 deletions(-) diff --git a/src/library/blas2/csrmv-adaptive.hpp b/src/library/blas2/csrmv-adaptive.hpp index 77ff9d0..4d77fae 100644 --- a/src/library/blas2/csrmv-adaptive.hpp +++ b/src/library/blas2/csrmv-adaptive.hpp @@ -22,20 +22,31 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha, const cl_uint group_size = 256; std::string params = std::string( ) + + " -DINDEX_TYPE=uint" + " -DROWBITS=" + std::to_string( ROW_BITS ) + " -DWGBITS=" + std::to_string( WG_BITS ) - + " -DWGSIZE=" + std::to_string( group_size ) + + " -DWG_SIZE=" + std::to_string( group_size ) + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ) + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER ) + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR ) + " -DEXTENDED_PRECISION"; + std::string options; if(typeid(T) == typeid(cl_double)) - { - std::string options = std::string() + " -DDOUBLE"; - params.append(options); - } - + options = std::string() + " -DVALUE_TYPE=double -DDOUBLE"; + else if(typeid(T) == typeid(cl_float)) + options = std::string() + " -DVALUE_TYPE=float"; + else if(typeid(T) == typeid(cl_uint)) + options = std::string() + " -DVALUE_TYPE=uint"; + else if(typeid(T) == typeid(cl_int)) + options = std::string() + " -DVALUE_TYPE=int"; + else if(typeid(T) == typeid(cl_ulong)) + options = std::string() + " -DVALUE_TYPE=ulong -DLONG"; + else if(typeid(T) == typeid(cl_long)) + options = std::string() + " -DVALUE_TYPE=long -DLONG"; + else + return clsparseInvalidKernelArgs; + params.append(options); cl::Kernel kernel = KernelCache::get( control->queue, "csrmv_adaptive", @@ -89,19 +100,31 @@ csrmv_adaptive( const clsparse::array_base& pAlpha, const cl_uint group_size = 256; std::string params = std::string( ) + + " -DINDEX_TYPE=uint" + " -DROWBITS=" + std::to_string( ROW_BITS ) + " -DWGBITS=" + std::to_string( WG_BITS ) - + " -DWGSIZE=" + std::to_string( group_size ) + + " -DWG_SIZE=" + std::to_string( group_size ) + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ) + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER ) + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR ) + " -DEXTENDED_PRECISION"; + std::string options; if(typeid(T) == typeid(cl_double)) - { - std::string options = std::string() + " -DDOUBLE"; - params.append(options); - } + options = std::string() + " -DVALUE_TYPE=double -DDOUBLE"; + else if(typeid(T) == typeid(cl_float)) + options = std::string() + " -DVALUE_TYPE=float"; + else if(typeid(T) == typeid(cl_uint)) + options = std::string() + " -DVALUE_TYPE=uint"; + else if(typeid(T) == typeid(cl_int)) + options = std::string() + " -DVALUE_TYPE=int"; + else if(typeid(T) == typeid(cl_ulong)) + options = std::string() + " -DVALUE_TYPE=ulong -DLONG"; + else if(typeid(T) == typeid(cl_long)) + options = std::string() + " -DVALUE_TYPE=long -DLONG"; + else + return clsparseInvalidKernelArgs; + params.append(options); cl::Kernel kernel = KernelCache::get( control->queue, "csrmv_adaptive", diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index 59a1220..d84b0e7 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -1,28 +1,79 @@ R"( -#ifdef DOUBLE - #define FPTYPE double +#ifndef VALUE_TYPE +#error VALUE_TYPE undefined! +#endif +// No reason to include these beyond version 1.2, where double is not an extension. 
+#if defined(DOUBLE) && __OPENCL_VERSION__ < CL_VERSION_1_2 #ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable + #pragma OPENCL EXTENSION cl_khr_fp64 : enable #elif defined(cl_amd_fp64) - #pragma OPENCL EXTENSION cl_amd_fp64 : enable + #pragma OPENCL EXTENSION cl_amd_fp64 : enable #else - #error "Double precision floating point not supported by OpenCL implementation." + #error "Double precision floating point not supported by OpenCL implementation." #endif +#endif + +#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) + #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable + #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable + #define ATOM64 +#endif + +#if __OPENCL_VERSION__ > CL_VERSION_1_0 + #define ATOM32 +#elif defined(cl_khr_global_int32_base_atomics) && defined(cl_khr_global_int32_extended_atomics) + #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : require + #pragma OPENCL_EXTENSION cl_khr_global_int32_extended_atomics : require + #define ATOM32 #else - #define FPTYPE float + #error "Required integer atomics not supported by this OpenCL implemenation." +#endif + +#if (defined(DOUBLE) || defined(LONG)) && !defined(ATOM64) +#error "Requesting 64-bit type but this OpenCL implementation does not support 64-bit atomics." #endif -#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable -#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable +#ifndef INDEX_TYPE +#error "INDEX_TYPE undefined!" +#endif + +#ifndef VALUE_TYPE +#error "VALUE_TYPE undefined!" +#endif + +#ifndef ROWBITS +#error "ROWBITS undefined!" +#endif + +#ifndef WGBITS +#error "WGBITS undefined!" +#endif + +#ifndef WG_SIZE +#error "WG_SIZE undefined!" +#endif -FPTYPE atomic_add_float_extended( global FPTYPE * restrict const ptr, - const FPTYPE temp, - FPTYPE * restrict const old_sum ) +#ifndef BLOCKSIZE +#error "BLOCKSIZE undefined!" +#endif + +#ifndef BLOCK_MULTIPLIER +#error "BLOCK_MULTIPLIER undefined!" +#endif + +#ifndef ROWS_FOR_VECTOR +#error "ROWS_FOR_VECTOR undefined!" +#endif + +VALUE_TYPE atomic_add_float_extended( global VALUE_TYPE * restrict const ptr, + const VALUE_TYPE temp, + VALUE_TYPE * restrict const old_sum ) { #ifdef DOUBLE unsigned long newVal; unsigned long prevVal; + #ifdef ATOM64 do { prevVal = as_ulong(*ptr); @@ -30,10 +81,14 @@ FPTYPE atomic_add_float_extended( global FPTYPE * restrict const ptr, } while (atom_cmpxchg((global unsigned long *)ptr, prevVal, newVal) != prevVal); if (old_sum != 0) *old_sum = as_double(prevVal); + #else + newVal = 0; + #endif return as_double(newVal); #else unsigned int newVal; unsigned int prevVal; + #ifdef ATOM32 do { prevVal = as_uint(*ptr); @@ -41,11 +96,14 @@ FPTYPE atomic_add_float_extended( global FPTYPE * restrict const ptr, } while (atomic_cmpxchg((global unsigned int *)ptr, prevVal, newVal) != prevVal); if (old_sum != 0) *old_sum = as_float(prevVal); + #else + newVal = 0; + #endif return as_float(newVal); #endif } -void atomic_add_float( global void * const ptr, const FPTYPE temp ) +void atomic_add_float( global void * const ptr, const VALUE_TYPE temp ) { atomic_add_float_extended(ptr, temp, 0); } @@ -57,11 +115,11 @@ void atomic_add_float( global void * const ptr, const FPTYPE temp ) // In/Out: *sumk_err, which is incremented (by reference) -- holds the // error value as a result of the 2sum calculation. // Returns: The non-corrected sum of inputs x and y. 
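A quick host-side C++ check of the contract described just above: the returned sum plus the accumulated error reproduces x + y exactly, even when the straight addition loses the small operand. The helper mirrors the kernel's Fast2Sum variant; names are illustrative.

#include <cassert>
#include <cmath>

static double two_sum_check(double x, double y, double* err)
{
    double s = x + y;
    if (std::fabs(x) < std::fabs(y)) { double t = x; x = y; y = t; }
    *err += y - (s - x);          // Fast2Sum: exact once |x| >= |y|
    return s;
}

int main()
{
    double err = 0.0;
    const double x = 1.0, y = 1e-16;   // y is below half an ulp of x, so x + y rounds to 1.0
    const double s = two_sum_check(x, y, &err);
    assert(s == 1.0);
    assert(err == 1e-16);              // the discarded bits are recovered exactly
    return 0;
}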
-FPTYPE two_sum( FPTYPE x, - FPTYPE y, - FPTYPE * restrict const sumk_err ) +VALUE_TYPE two_sum( VALUE_TYPE x, + VALUE_TYPE y, + VALUE_TYPE * restrict const sumk_err ) { - const FPTYPE sumk_s = x + y; + const VALUE_TYPE sumk_s = x + y; #ifdef EXTENDED_PRECISION // We use this 2Sum algorithm to perform a compensated summation, // which can reduce the cummulative rounding errors in our SpMV summation. @@ -82,13 +140,13 @@ FPTYPE two_sum( FPTYPE x, // 1974), respectively. if (fabs(x) < fabs(y)) { - const FPTYPE swap = x; + const VALUE_TYPE swap = x; x = y; y = swap; } (*sumk_err) += (y - (sumk_s - x)); // Original 6 FLOP 2Sum algorithm. - //const FPTYPE bp = sumk_s - x; + //const VALUE_TYPE bp = sumk_s - x; //(*sumk_err) += ((x - (sumk_s - bp)) + (y - bp)); #endif return sumk_s; @@ -99,23 +157,23 @@ FPTYPE two_sum( FPTYPE x, // appropriate error. However, the EFT of an FMA is very expensive. As such, // if we are in EXTENDED_PRECISION mode, this function devolves into two_sum // with x_vals and x_vec inputs multiplied separately from the compensated add. -FPTYPE two_fma( const FPTYPE x_vals, - const FPTYPE x_vec, - FPTYPE y, - FPTYPE * restrict const sumk_err ) +VALUE_TYPE two_fma( const VALUE_TYPE x_vals, + const VALUE_TYPE x_vec, + VALUE_TYPE y, + VALUE_TYPE * restrict const sumk_err ) { #ifdef EXTENDED_PRECISION - FPTYPE x = x_vals * x_vec; - const FPTYPE sumk_s = x + y; + VALUE_TYPE x = x_vals * x_vec; + const VALUE_TYPE sumk_s = x + y; if (fabs(x) < fabs(y)) { - const FPTYPE swap = x; + const VALUE_TYPE swap = x; x = y; y = swap; } (*sumk_err) += (y - (sumk_s - x)); // 2Sum in the FMA case. Poor performance on low-DPFP GPUs. - //const FPTYPE bp = fma(-x_vals, x_vec, sumk_s); + //const VALUE_TYPE bp = fma(-x_vals, x_vec, sumk_s); //(*sumk_err) += (fma(x_vals, x_vec, -(sumk_s - bp)) + (y - bp)); return sumk_s; #else @@ -123,18 +181,18 @@ FPTYPE two_fma( const FPTYPE x_vals, #endif } -FPTYPE atomic_two_sum_float( global FPTYPE * restrict const x_ptr, - FPTYPE y, - FPTYPE * restrict const sumk_err ) +VALUE_TYPE atomic_two_sum_float( global VALUE_TYPE * restrict const x_ptr, + VALUE_TYPE y, + VALUE_TYPE * restrict const sumk_err ) { // Have to wait until the return from the atomic op to know what X was. - FPTYPE sumk_s = 0.; + VALUE_TYPE sumk_s = 0.; #ifdef EXTENDED_PRECISION - FPTYPE x; + VALUE_TYPE x; sumk_s = atomic_add_float_extended(x_ptr, y, &x); if (fabs(x) < fabs(y)) { - const FPTYPE swap = x; + const VALUE_TYPE swap = x; x = y; y = swap; } @@ -164,9 +222,9 @@ FPTYPE atomic_two_sum_float( global FPTYPE * restrict const x_ptr, // to do a parallel reduction. This is the length of each row. // reduc_size: As you reduce data down, this tells you how far away // you will grab a value and add to your own local sum value. 
-FPTYPE sum2_reduce( FPTYPE cur_sum, - FPTYPE * restrict const err, - __local FPTYPE * restrict const partial, +VALUE_TYPE sum2_reduce( VALUE_TYPE cur_sum, + VALUE_TYPE * restrict const err, + __local VALUE_TYPE * restrict const partial, const unsigned int lid, const unsigned int thread_lane, const unsigned int max_size, @@ -200,20 +258,20 @@ FPTYPE sum2_reduce( FPTYPE cur_sum, } __kernel void -csrmv_adaptive(__global const FPTYPE * restrict const vals, +csrmv_adaptive(__global const VALUE_TYPE * restrict const vals, __global const unsigned int * restrict const cols, __global const unsigned int * restrict const rowPtrs, - __global const FPTYPE * restrict const vec, - __global FPTYPE * restrict const out, + __global const VALUE_TYPE * restrict const vec, + __global VALUE_TYPE * restrict const out, __global unsigned long * restrict const rowBlocks, - __global const FPTYPE * restrict const pAlpha, - __global const FPTYPE * restrict const pBeta) + __global const VALUE_TYPE * restrict const pAlpha, + __global const VALUE_TYPE * restrict const pBeta) { - __local FPTYPE partialSums[BLOCKSIZE]; + __local VALUE_TYPE partialSums[BLOCKSIZE]; const unsigned int gid = get_group_id(0); const unsigned int lid = get_local_id(0); - const FPTYPE alpha = *pAlpha; - const FPTYPE beta = *pBeta; + const VALUE_TYPE alpha = *pAlpha; + const VALUE_TYPE beta = *pBeta; // The row blocks buffer holds a packed set of information used to inform each // workgroup about how to do its work: @@ -248,9 +306,9 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, unsigned int vecStart = mad24(wg, (unsigned int)(BLOCK_MULTIPLIER*BLOCKSIZE), rowPtrs[row]); unsigned int vecEnd = (rowPtrs[row + 1] > vecStart + BLOCK_MULTIPLIER*BLOCKSIZE) ? vecStart + BLOCK_MULTIPLIER*BLOCKSIZE : rowPtrs[row + 1]; - FPTYPE temp_sum = 0.; - FPTYPE sumk_e = 0.; - FPTYPE new_error = 0.; + VALUE_TYPE temp_sum = 0.; + VALUE_TYPE sumk_e = 0.; + VALUE_TYPE new_error = 0.; // If the next row block starts more than 2 rows away, then we choose CSR-Stream. // If this is zero (long rows) or one (final workgroup in a long row, or a single @@ -288,7 +346,7 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, const unsigned int col = rowPtrs[row] + lid; if (gid != (get_num_groups(0) - 1)) { - for(int i = 0; i < BLOCKSIZE; i += WGSIZE) + for(int i = 0; i < BLOCKSIZE; i += WG_SIZE) partialSums[lid + i] = alpha * vals[col + i] * vec[cols[col + i]]; } else @@ -301,7 +359,7 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // This causes a minor performance loss because this is the last workgroup // to be launched, and this loop can't be unrolled. const unsigned int max_to_load = rowPtrs[stop_row] - rowPtrs[row]; - for(int i = 0; i < max_to_load; i += WGSIZE) + for(int i = 0; i < max_to_load; i += WG_SIZE) partialSums[lid + i] = alpha * vals[col + i] * vec[cols[col + i]]; } barrier(CLK_LOCAL_MEM_FENCE); @@ -346,7 +404,7 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // LDS is full up to {workgroup size} entries. // Now we perform a parallel reduction that sums together the answers for each // row in parallel, leaving us an answer in 'temp_sum' for each row. - for (int i = (WGSIZE >> 1); i > 0; i >>= 1) + for (int i = (WG_SIZE >> 1); i > 0; i >>= 1) { barrier( CLK_LOCAL_MEM_FENCE ); temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, threadInBlock, numThreadsForRed, i); @@ -384,7 +442,7 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, if (beta != 0.) 
temp_sum = two_fma(beta, out[local_row], temp_sum, &sumk_e); out[local_row] = temp_sum + sumk_e; - local_row += WGSIZE; + local_row += WG_SIZE; } } } @@ -412,7 +470,7 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // Load in a bunch of partial results into your register space, rather than LDS (no contention) // Then dump the partially reduced answers into the LDS for inter-work-item reduction. // Using a long induction variable to make sure unsigned int overflow doesn't break things. - for (long j = vecStart + lid; j < vecEnd; j+=WGSIZE) + for (long j = vecStart + lid; j < vecEnd; j+=WG_SIZE) { const unsigned int col = cols[(unsigned int)j]; temp_sum = two_fma(alpha*vals[(unsigned int)j], vec[col], temp_sum, &sumk_e); @@ -422,10 +480,10 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, partialSums[lid] = temp_sum; // Reduce partial sums - for (int i = (WGSIZE >> 1); i > 0; i >>= 1) + for (int i = (WG_SIZE >> 1); i > 0; i >>= 1) { barrier( CLK_LOCAL_MEM_FENCE); - temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, lid, WGSIZE, i); + temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, lid, WG_SIZE, i); } if (lid == 0UL) @@ -454,6 +512,9 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // this long row. const unsigned int first_wg_in_row = gid - (rowBlocks[gid] & ((1UL << WGBITS) - 1UL)); const unsigned int compare_value = rowBlocks[gid] & (1UL << WGBITS); +#ifdef ATOM32 + __global unsigned long * const temp_ptr = &rowBlocks[first_wg_in_row]; +#endif // Bit 24 in the first workgroup is the flag that everyone waits on. if(gid == first_wg_in_row && lid == 0UL) @@ -466,15 +527,25 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, #ifdef EXTENDED_PRECISION rowBlocks[get_num_groups(0) + gid + 1] = 0UL; #endif +#ifdef ATOM64 atom_xor(&rowBlocks[first_wg_in_row], (1UL << WGBITS)); // Release other workgroups. +#elif ATOM32 + atomic_xor((__global unsigned int *)temp_ptr, (1UL << WGBITS)); // Release other workgroups. +#endif } // For every other workgroup, bit 24 holds the value they wait on. // If your bit 24 == first_wg's bit 24, you spin loop. // The first workgroup will eventually flip this bit, and you can move forward. barrier(CLK_GLOBAL_MEM_FENCE); +#ifdef ATOM64 while(gid != first_wg_in_row && - lid == 0UL && + lid == 0U && ((atom_max(&rowBlocks[first_wg_in_row], 0UL) & (1UL << WGBITS)) == compare_value)); +#elif ATOM32 + while(gid != first_wg_in_row && + lid == 0U && + ((atomic_max((_global unsigned int *)temp_ptr, 0UL) & (1UL << WGBITS)) == compare_value)); +#endif barrier(CLK_GLOBAL_MEM_FENCE); // After you've passed the barrier, update your local flag to make sure that @@ -492,19 +563,19 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // Don't put BLOCK_MULTIPLIER*BLOCKSIZE as the stop point, because // some GPU compilers will *aggressively* unroll this loop. // That increases register pressure and reduces occupancy. - for (int j = 0; j < (int)(vecEnd - col); j += WGSIZE) + for (int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) { temp_sum = two_fma(alpha*vals[col + j], vec[cols[col + j]], temp_sum, &sumk_e); -#if 2*WGSIZE <= BLOCK_MULTIPLIER*BLOCKSIZE +#if 2*WG_SIZE <= BLOCK_MULTIPLIER*BLOCKSIZE // If you can, unroll this loop once. It somewhat helps performance. 
- j += WGSIZE; + j += WG_SIZE; temp_sum = two_fma(alpha*vals[col + j], vec[cols[col + j]], temp_sum, &sumk_e); #endif } } else { - for(int j = 0; j < (int)(vecEnd - col); j += WGSIZE) + for(int j = 0; j < (int)(vecEnd - col); j += WG_SIZE) temp_sum = two_fma(alpha*vals[col + j], vec[cols[col + j]], temp_sum, &sumk_e); } @@ -512,10 +583,10 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, partialSums[lid] = temp_sum; // Reduce partial sums - for (int i = (WGSIZE >> 1); i > 0; i >>= 1) + for (int i = (WG_SIZE >> 1); i > 0; i >>= 1) { barrier( CLK_LOCAL_MEM_FENCE); - temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, lid, WGSIZE, i); + temp_sum = sum2_reduce(temp_sum, &new_error, partialSums, lid, lid, WG_SIZE, i); } if (lid == 0UL) @@ -537,7 +608,11 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // coop's first workgroup. That value will be used to store the number // of threads that have completed so far. Once all the previous threads // are done, it's time to send out the errors! +#ifdef ATOM64 while((atom_max(&rowBlocks[first_wg_in_row], 0UL) & ((1UL << WGBITS) - 1)) != wg); +#elif ATOM32 + while((atomic_max((_global unsigned int *)temp_ptr, 0UL) & ((1UL << WGBITS) - 1)) != wg); +#endif #ifdef DOUBLE new_error = as_double(rowBlocks[error_loc]); @@ -557,7 +632,11 @@ csrmv_adaptive(__global const FPTYPE * restrict const vals, // Otherwise, increment the low order bits of the first thread in this // coop. We're using this to tell how many workgroups in a coop are done. // Do this with an atomic, since other threads may be doing this too. +#ifdef ATOM64 const unsigned long no_warn = atom_inc(&rowBlocks[first_wg_in_row]); +#elif + const unsigned int no_warn = atomic_inc((_global unsigned int *)temp_ptr); +#endif } #endif } From 05cfc0d440cb90a08a81de10807383f9a28a47a9 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Wed, 12 Aug 2015 15:26:03 -0500 Subject: [PATCH 17/21] Changes to allow us to use double precision even when the system does not support 64-bit atomics. Fall back to using only CSR-Vector in this case. Also made some changes to CSR-LongRows beta initialization to fix a memory consistency issue. --- src/library/kernels/csrmv_adaptive.cl | 118 ++++++++++++++++---------- 1 file changed, 72 insertions(+), 46 deletions(-) diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index d84b0e7..f09c5b6 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -30,10 +30,6 @@ R"( #error "Required integer atomics not supported by this OpenCL implemenation." #endif -#if (defined(DOUBLE) || defined(LONG)) && !defined(ATOM64) -#error "Requesting 64-bit type but this OpenCL implementation does not support 64-bit atomics." -#endif - #ifndef INDEX_TYPE #error "INDEX_TYPE undefined!" #endif @@ -66,6 +62,55 @@ R"( #error "ROWS_FOR_VECTOR undefined!" #endif +// Internal functions to wrap atomics, depending on if we support 64-bit +// atomics or not. Helps keep the code clean in the other parts of the code. +// All of the 32-bit atomics are built assuming we're on a little endian architecture. 
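That little-endian assumption is what makes the 32-bit fallbacks below workable: the fields they touch (the WG counter in bits 0..23 and the flag at bit WGBITS = 24) all live in the low 32 bits of a rowBlocks entry, and on a little-endian device those are the first four bytes at the entry's address. A small host-side C++ illustration (assumes a little-endian host such as x86):

#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
    const std::uint64_t row_block = (0x12345678ULL << 32) | ((1ULL << 24) | 7ULL);
    std::uint32_t low_word = 0;
    std::memcpy(&low_word, &row_block, sizeof low_word);   // first 4 bytes = low 32 bits (little endian)
    assert(low_word == ((1u << 24) | 7u));                 // flag bit and WG counter are both here
    return 0;
}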
+inline unsigned long clsparse_atomic_xor(__global unsigned long * restrict const ptr, + const unsigned long xor_val) +{ +#ifdef ATOM64 + return atom_xor(ptr, xor_val); +#else + return atomic_max((__global unsigned int*)ptr, (unsigned int)xor_val); +#endif +} + +inline unsigned long clsparse_atomic_max(__global unsigned long * restrict const ptr, + const unsigned long compare) +{ +#ifdef ATOM64 + return atom_max(ptr, compare); +#else + return atomic_max((__global unsigned int*)ptr, (unsigned int)compare); +#endif +} + +inline unsigned long clsparse_atomic_inc(__global unsigned long * restrict const inc_this) +{ +#ifdef ATOM64 + return atom_inc(inc_this); +#else + return atomic_inc((__global unsigned int *)inc_this); +#endif +} + +inline unsigned long clsparse_atomic_cmpxchg(__global unsigned long * restrict const ptr, + const unsigned long compare, + const unsigned long val) +{ +#ifdef DOUBLE + #ifdef ATOM64 + return atom_cmpxchg(ptr, compare, val); + #else + // Should never run this. Don't use a path that requires cmpxchg for doubles + // if you don't support 64-bit atomics. + return compare; + #endif +#else + return atomic_cmpxchg((__global unsigned int*)ptr, compare, val); +#endif +} + VALUE_TYPE atomic_add_float_extended( global VALUE_TYPE * restrict const ptr, const VALUE_TYPE temp, VALUE_TYPE * restrict const old_sum ) @@ -73,32 +118,24 @@ VALUE_TYPE atomic_add_float_extended( global VALUE_TYPE * restrict const ptr, #ifdef DOUBLE unsigned long newVal; unsigned long prevVal; - #ifdef ATOM64 do { prevVal = as_ulong(*ptr); newVal = as_ulong(temp + *ptr); - } while (atom_cmpxchg((global unsigned long *)ptr, prevVal, newVal) != prevVal); + } while (clsparse_atomic_cmpxchg((__global unsigned long *)ptr, prevVal, newVal) != prevVal); if (old_sum != 0) *old_sum = as_double(prevVal); - #else - newVal = 0; - #endif return as_double(newVal); #else unsigned int newVal; unsigned int prevVal; - #ifdef ATOM32 do { prevVal = as_uint(*ptr); newVal = as_uint(temp + *ptr); - } while (atomic_cmpxchg((global unsigned int *)ptr, prevVal, newVal) != prevVal); + } while (clsparse_atomic_cmpxchg((__global unsigned long *)ptr, prevVal, newVal) != prevVal); if (old_sum != 0) *old_sum = as_float(prevVal); - #else - newVal = 0; - #endif return as_float(newVal); #endif } @@ -295,17 +332,29 @@ csrmv_adaptive(__global const VALUE_TYPE * restrict const vals, // value. While this bit is the same as the first workgroup's flag bit, this // workgroup will spin-loop. unsigned int row = ((rowBlocks[gid] >> (64-ROWBITS)) & ((1UL << ROWBITS) - 1UL)); - const unsigned int stop_row = ((rowBlocks[gid + 1] >> (64-ROWBITS)) & ((1UL << ROWBITS) - 1UL)); - const unsigned int num_rows = stop_row - row; + unsigned int stop_row = ((rowBlocks[gid + 1] >> (64-ROWBITS)) & ((1UL << ROWBITS) - 1UL)); + unsigned int num_rows = stop_row - row; // Get the "workgroup within this long row" ID out of the bottom bits of the row block. - const unsigned int wg = rowBlocks[gid] & ((1 << WGBITS) - 1); + unsigned int wg = rowBlocks[gid] & ((1 << WGBITS) - 1); // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. // If there are more items in this row, we assign more workgroups. unsigned int vecStart = mad24(wg, (unsigned int)(BLOCK_MULTIPLIER*BLOCKSIZE), rowPtrs[row]); unsigned int vecEnd = (rowPtrs[row + 1] > vecStart + BLOCK_MULTIPLIER*BLOCKSIZE) ? 
vecStart + BLOCK_MULTIPLIER*BLOCKSIZE : rowPtrs[row + 1]; +#if (defined(DOUBLE) || defined(LONG)) && !defined(ATOM64) + // In here because we don't support 64-bit atomics while working on 64-bit data. + // As such, we can't use CSR-LongRows. Time to do a fixup -- first WG does the + // entire row with CSR-Vector. Other rows immediately exit. + if (num_rows == 0 || (num_rows == 1 && wg)) // CSR-LongRows case + { + num_rows = ROWS_FOR_VECTOR; + stop_row = wg ? row : (row + 1); + wg = 0; + } +#endif + VALUE_TYPE temp_sum = 0.; VALUE_TYPE sumk_e = 0.; VALUE_TYPE new_error = 0.; @@ -512,40 +561,25 @@ csrmv_adaptive(__global const VALUE_TYPE * restrict const vals, // this long row. const unsigned int first_wg_in_row = gid - (rowBlocks[gid] & ((1UL << WGBITS) - 1UL)); const unsigned int compare_value = rowBlocks[gid] & (1UL << WGBITS); -#ifdef ATOM32 - __global unsigned long * const temp_ptr = &rowBlocks[first_wg_in_row]; -#endif // Bit 24 in the first workgroup is the flag that everyone waits on. if(gid == first_wg_in_row && lid == 0UL) { // The first workgroup handles the output initialization. - if (beta != 0.) - out[row] *= beta; - else - out[row] = 0.; + volatile VALUE_TYPE out_val = out[row]; + temp_sum = (beta - 1.) * out_val; #ifdef EXTENDED_PRECISION rowBlocks[get_num_groups(0) + gid + 1] = 0UL; #endif -#ifdef ATOM64 - atom_xor(&rowBlocks[first_wg_in_row], (1UL << WGBITS)); // Release other workgroups. -#elif ATOM32 - atomic_xor((__global unsigned int *)temp_ptr, (1UL << WGBITS)); // Release other workgroups. -#endif + clsparse_atomic_xor(&rowBlocks[first_wg_in_row], (1UL << WGBITS)); // Release other workgroups. } // For every other workgroup, bit 24 holds the value they wait on. // If your bit 24 == first_wg's bit 24, you spin loop. // The first workgroup will eventually flip this bit, and you can move forward. barrier(CLK_GLOBAL_MEM_FENCE); -#ifdef ATOM64 while(gid != first_wg_in_row && lid == 0U && - ((atom_max(&rowBlocks[first_wg_in_row], 0UL) & (1UL << WGBITS)) == compare_value)); -#elif ATOM32 - while(gid != first_wg_in_row && - lid == 0U && - ((atomic_max((_global unsigned int *)temp_ptr, 0UL) & (1UL << WGBITS)) == compare_value)); -#endif + ((clsparse_atomic_max(&rowBlocks[first_wg_in_row], 0UL) & (1UL << WGBITS)) == compare_value)); barrier(CLK_GLOBAL_MEM_FENCE); // After you've passed the barrier, update your local flag to make sure that @@ -608,11 +642,7 @@ csrmv_adaptive(__global const VALUE_TYPE * restrict const vals, // coop's first workgroup. That value will be used to store the number // of threads that have completed so far. Once all the previous threads // are done, it's time to send out the errors! -#ifdef ATOM64 - while((atom_max(&rowBlocks[first_wg_in_row], 0UL) & ((1UL << WGBITS) - 1)) != wg); -#elif ATOM32 - while((atomic_max((_global unsigned int *)temp_ptr, 0UL) & ((1UL << WGBITS) - 1)) != wg); -#endif + while((clsparse_atomic_max(&rowBlocks[first_wg_in_row], 0UL) & ((1UL << WGBITS) - 1)) != wg); #ifdef DOUBLE new_error = as_double(rowBlocks[error_loc]); @@ -632,11 +662,7 @@ csrmv_adaptive(__global const VALUE_TYPE * restrict const vals, // Otherwise, increment the low order bits of the first thread in this // coop. We're using this to tell how many workgroups in a coop are done. // Do this with an atomic, since other threads may be doing this too. 
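On the CSR-LongRows beta fix in this patch, as I read the change: instead of the first workgroup overwriting out[row] with beta*out[row] before releasing the others, it now folds (beta - 1)*out[row] into its own partial sum, so out[row] is only ever updated by atomic adds. The algebra checks out, since out_old + (beta - 1)*out_old + sum_of_partials equals beta*out_old + alpha*(A*x)[row]. A trivial numeric check of that identity:

#include <cassert>

int main()
{
    const double out_old  = 2.0;    // value already in the output vector
    const double beta     = 0.5;
    const double partials = 7.0;    // alpha * (A*x)[row], summed across workgroups

    const double reference   = beta * out_old + partials;                  // 8.0
    const double atomic_only = out_old + (beta - 1.0) * out_old + partials;
    assert(atomic_only == reference);
    return 0;
}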
-#ifdef ATOM64 - const unsigned long no_warn = atom_inc(&rowBlocks[first_wg_in_row]); -#elif - const unsigned int no_warn = atomic_inc((_global unsigned int *)temp_ptr); -#endif + const unsigned long no_warn = clsparse_atomic_inc(&rowBlocks[first_wg_in_row]); } #endif } From dd85239fedfc904db05e8a1832e39383caf48c61 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Wed, 12 Aug 2015 16:24:41 -0500 Subject: [PATCH 18/21] Adding FMAs to CSR-General and some consts and other type correctness. --- src/library/kernels/csrmv_general.cl | 73 +++++++++++++++++++--------- 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/src/library/kernels/csrmv_general.cl b/src/library/kernels/csrmv_general.cl index b5d6065..b6bd750 100644 --- a/src/library/kernels/csrmv_general.cl +++ b/src/library/kernels/csrmv_general.cl @@ -46,9 +46,9 @@ R"( // Returns: The non-corrected sum of inputs x and y. VALUE_TYPE two_sum( VALUE_TYPE x, VALUE_TYPE y, - VALUE_TYPE *sumk_err) + VALUE_TYPE * restrict const sumk_err) { - VALUE_TYPE sumk_s = x + y; + const VALUE_TYPE sumk_s = x + y; #ifdef EXTENDED_PRECISION // We use this 2Sum algorithm to perform a compensated summation, // which can reduce the cummulative rounding errors in our SpMV summation. @@ -68,10 +68,9 @@ VALUE_TYPE two_sum( VALUE_TYPE x, // and "Rundungsfehleranalyse einiger Verfahren zur Summation endlicher // Summen (ZAMM Z. Angewandte Mathematik und Mechanik 54(1) pp. 39-51, // 1974), respectively. - VALUE_TYPE swap; if (fabs(x) < fabs(y)) { - swap = x; + const VALUE_TYPE swap = x; x = y; y = swap; } @@ -83,6 +82,35 @@ VALUE_TYPE two_sum( VALUE_TYPE x, return sumk_s; } +// Performs (x_vals * x_vec) + y using an FMA. +// Ideally, we would perform an error-free transformation here and return the +// appropriate error. However, the EFT of an FMA is very expensive. As such, +// if we are in EXTENDED_PRECISION mode, this function devolves into two_sum +// with x_vals and x_vec inputs multiplied separately from the compensated add. +VALUE_TYPE two_fma( const VALUE_TYPE x_vals, + const VALUE_TYPE x_vec, + VALUE_TYPE y, + VALUE_TYPE * restrict const sumk_err ) +{ +#ifdef EXTENDED_PRECISION + VALUE_TYPE x = x_vals * x_vec; + const VALUE_TYPE sumk_s = x + y; + if (fabs(x) < fabs(y)) + { + const VALUE_TYPE swap = x; + x = y; + y = swap; + } + (*sumk_err) += (y - (sumk_s - x)); + // 2Sum in the FMA case. Poor performance on low-DPFP GPUs. + //const VALUE_TYPE bp = fma(-x_vals, x_vec, sumk_s); + //(*sumk_err) += (fma(x_vals, x_vec, -(sumk_s - bp)) + (y - bp)); + return sumk_s; +#else + return fma(x_vals, x_vec, y); +#endif +} + // A method of doing the final reduction without having to copy and paste // it a bunch of times. // The EXTENDED_PRECISION section is done as part of the PSum2 addition, @@ -97,18 +125,18 @@ VALUE_TYPE two_sum( VALUE_TYPE x, // round: This parallel summation method operates in multiple rounds // to do a parallel reduction. See the blow comment for usage. 
VALUE_TYPE sum2_reduce( VALUE_TYPE cur_sum, - VALUE_TYPE *err, - volatile __local VALUE_TYPE *partial, - size_t lid, - int thread_lane, - int round) + VALUE_TYPE * restrict const err, + volatile __local VALUE_TYPE * restrict const partial, + const INDEX_TYPE lid, + const INDEX_TYPE thread_lane, + const INDEX_TYPE round) { if (SUBWAVE_SIZE > round) { #ifdef EXTENDED_PRECISION + const unsigned int partial_dest = lid + round; if (thread_lane < round) - cur_sum = two_sum(cur_sum, partial[lid + round], err); - + cur_sum = two_sum(cur_sum, partial[partial_dest], err); // We reuse the LDS entries to move the error values down into lower // threads. This saves LDS space, allowing higher occupancy, but requires // more barriers, which can reduce performance. @@ -119,7 +147,7 @@ VALUE_TYPE sum2_reduce( VALUE_TYPE cur_sum, partial[lid] = *err; barrier(CLK_LOCAL_MEM_FENCE); if (thread_lane < round) { // Add those errors in. - *err += partial[lid + round]; + *err += partial[partial_dest]; partial[lid] = cur_sum; } #else @@ -158,27 +186,27 @@ void csrmv_general ( const INDEX_TYPE num_rows, local volatile VALUE_TYPE sdata [WG_SIZE + SUBWAVE_SIZE / 2]; //const int vectors_per_block = WG_SIZE/SUBWAVE_SIZE; - const int global_id = get_global_id(0); // global workitem id - const int local_id = get_local_id(0); // local workitem id - const int thread_lane = local_id & (SUBWAVE_SIZE - 1); - const int vector_id = global_id / SUBWAVE_SIZE; // global vector id + const INDEX_TYPE global_id = get_global_id(0); // global workitem id + const INDEX_TYPE local_id = get_local_id(0); // local workitem id + const INDEX_TYPE thread_lane = local_id & (SUBWAVE_SIZE - 1); + const INDEX_TYPE vector_id = global_id / SUBWAVE_SIZE; // global vector id //const int vector_lane = local_id / SUBWAVE_SIZE; // vector id within the workgroup - const int num_vectors = get_global_size(0) / SUBWAVE_SIZE; + const INDEX_TYPE num_vectors = get_global_size(0) / SUBWAVE_SIZE; const VALUE_TYPE _alpha = alpha[off_alpha]; const VALUE_TYPE _beta = beta[off_beta]; for(INDEX_TYPE row = vector_id; row < num_rows; row += num_vectors) { - const int row_start = row_offset[row]; - const int row_end = row_offset[row+1]; + const INDEX_TYPE row_start = row_offset[row]; + const INDEX_TYPE row_end = row_offset[row+1]; VALUE_TYPE sum = 0.; VALUE_TYPE sumk_e = 0.; // It is about 5% faster to always multiply by alpha, rather than to // check whether alpha is 0, 1, or other and do different code paths. - for(int j = row_start + thread_lane; j < row_end; j += SUBWAVE_SIZE) - sum = two_sum(sum, (_alpha * val[j] * x[off_x + col[j]]), &sumk_e); + for(INDEX_TYPE j = row_start + thread_lane; j < row_end; j += SUBWAVE_SIZE) + sum = two_fma(_alpha * val[j], x[off_x + col[j]], sum, &sumk_e); VALUE_TYPE new_error = 0.; sum = two_sum(sum, sumk_e, &new_error); @@ -192,6 +220,7 @@ void csrmv_general ( const INDEX_TYPE num_rows, // Yamanaka, Ogita, Rump, and Oishi, "A Parallel Algorithm of // Accurate Dot Product," in the Journal of Parallel Computing, // 34(6-8), pp. 392-410, Jul. 2008. 
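Putting this hunk's pieces together (the parallel reduction itself follows right below): each row multiplies through by alpha, feeds every product into a compensated accumulator via two_fma, folds the carried error back in with two_sum, and applies beta the same way. A serial CPU sketch of that flow follows; the types and single-threaded structure are illustrative and make no attempt to mirror the subwave layout or the test harness.

#include <cmath>
#include <cstdio>
#include <vector>

// Branch-based 2Sum, as in the kernel (double-precision host port).
static double two_sum(double x, double y, double* err)
{
    const double s = x + y;
    if (std::fabs(x) < std::fabs(y)) { const double t = x; x = y; y = t; }
    *err += y - (s - x);
    return s;
}

// One CSR row of y = alpha*A*x + beta*y with compensated accumulation.
// Follows the EXTENDED_PRECISION path of two_fma: the product is formed
// normally and only the addition is compensated.
static double csr_row(const std::vector<double>& vals, const std::vector<int>& cols,
                      int row_start, int row_end, const std::vector<double>& x,
                      double alpha, double beta, double y_old)
{
    double sum = 0.0, sumk_e = 0.0;
    for (int j = row_start; j < row_end; ++j)
        sum = two_sum(sum, alpha * vals[j] * x[cols[j]], &sumk_e);

    double new_error = 0.0;
    sum = two_sum(sum, sumk_e, &new_error);       // fold in the carried error
    sum = two_sum(sum, beta * y_old, &new_error); // beta handling mirrors the final two_fma
    return sum + new_error;
}

int main()
{
    // Tiny single-row example: A = [1e8, 1, -1e8, 1], x = ones, alpha = beta = 1, y_old = 0.5.
    std::vector<double> vals{1e8, 1.0, -1e8, 1.0};
    std::vector<int>    cols{0, 1, 2, 3};
    std::vector<double> x(4, 1.0);
    std::printf("row result = %g\n", csr_row(vals, cols, 0, 4, x, 1.0, 1.0, 0.5));  // prints 2.5
}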
+ #pragma unroll for (int i = (WG_SIZE >> 1); i > 0; i >>= 1) { barrier( CLK_LOCAL_MEM_FENCE ); @@ -204,7 +233,7 @@ void csrmv_general ( const INDEX_TYPE num_rows, y[off_y + row] = sum + new_error; else { - sum = two_sum(sum, _beta * y[off_y + row], &new_error); + sum = two_fma(_beta, y[off_y + row], sum, &new_error); y[off_y + row] = sum + new_error; } } From 508426867dad9863b5f8f05de849e9dfdbb070d9 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Thu, 13 Aug 2015 01:46:33 -0500 Subject: [PATCH 19/21] Making extended precision (compensated summation) part of the clSparse opaque command structure. Adding it as a command-line option for the test-blas2 program. --- src/include/clSPARSE.h | 3 +++ src/library/blas2/csrmv-adaptive.hpp | 12 ++++++++---- src/library/blas2/csrmv-vector.hpp | 8 ++++++-- src/library/internal/clsparse-control.cpp | 13 +++++++++++++ src/library/internal/clsparse-control.hpp | 3 +++ src/library/kernels/csrmv_general.cl | 2 -- src/tests/test-blas2.cpp | 12 +++++++++--- 7 files changed, 42 insertions(+), 11 deletions(-) diff --git a/src/include/clSPARSE.h b/src/include/clSPARSE.h index 3d5c86a..38c2dd1 100644 --- a/src/include/clSPARSE.h +++ b/src/include/clSPARSE.h @@ -90,6 +90,9 @@ extern "C" { CLSPARSE_EXPORT clsparseStatus clsparseEnableAsync( clsparseControl control, cl_bool async ); + //enable/disable the use of compensated summation + CLSPARSE_EXPORT clsparseStatus + clsparseEnableExtendedPrecision( clsparseControl control, cl_bool async ); //setup events to sync //TODO:: NOT WORKING! NDRange throws Failure diff --git a/src/library/blas2/csrmv-adaptive.hpp b/src/library/blas2/csrmv-adaptive.hpp index 12601d0..95ff105 100644 --- a/src/library/blas2/csrmv-adaptive.hpp +++ b/src/library/blas2/csrmv-adaptive.hpp @@ -45,8 +45,7 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha, + " -DWG_SIZE=" + std::to_string( group_size ) + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ) + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER ) - + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR ) - + " -DEXTENDED_PRECISION"; + + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR ); std::string options; if(typeid(T) == typeid(cl_double)) @@ -63,6 +62,9 @@ csrmv_adaptive( const clsparseScalarPrivate* pAlpha, options = std::string() + " -DVALUE_TYPE=long -DLONG"; else return clsparseInvalidKernelArgs; + + if(control->extended_precision) + options += " -DEXTENDED_PRECISION"; params.append(options); cl::Kernel kernel = KernelCache::get( control->queue, @@ -123,8 +125,7 @@ csrmv_adaptive( const clsparse::array_base& pAlpha, + " -DWG_SIZE=" + std::to_string( group_size ) + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ) + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER ) - + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR ) - + " -DEXTENDED_PRECISION"; + + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR ); std::string options; if(typeid(T) == typeid(cl_double)) @@ -141,6 +142,9 @@ csrmv_adaptive( const clsparse::array_base& pAlpha, options = std::string() + " -DVALUE_TYPE=long -DLONG"; else return clsparseInvalidKernelArgs; + + if(control->extended_precision) + options += " -DEXTENDED_PRECISION"; params.append(options); cl::Kernel kernel = KernelCache::get( control->queue, diff --git a/src/library/blas2/csrmv-vector.hpp b/src/library/blas2/csrmv-vector.hpp index 2b8b637..406933f 100644 --- a/src/library/blas2/csrmv-vector.hpp +++ b/src/library/blas2/csrmv-vector.hpp @@ -48,7 +48,7 @@ csrmv_vector(const clsparseScalarPrivate* pAlpha, if 
(nnz_per_row < 8) { subwave_size = 4; } if (nnz_per_row < 4) { subwave_size = 2; } - const std::string params = std::string() + + std::string params = std::string() + "-DINDEX_TYPE=" + OclTypeTraits::type + " -DVALUE_TYPE=" + OclTypeTraits::type + " -DSIZE_TYPE=" + OclTypeTraits::type @@ -56,6 +56,8 @@ csrmv_vector(const clsparseScalarPrivate* pAlpha, + " -DWAVE_SIZE=" + std::to_string(wave_size) + " -DSUBWAVE_SIZE=" + std::to_string(subwave_size); + if(control->extended_precision) + params += " -DEXTENDED_PRECISION"; cl::Kernel kernel = KernelCache::get(control->queue, "csrmv_general", @@ -124,7 +126,7 @@ csrmv_vector(const clsparse::array_base& pAlpha, if (nnz_per_row < 8) { subwave_size = 4; } if (nnz_per_row < 4) { subwave_size = 2; } - const std::string params = std::string() + + std::string params = std::string() + "-DINDEX_TYPE=" + OclTypeTraits::type + " -DVALUE_TYPE=" + OclTypeTraits::type + " -DSIZE_TYPE=" + OclTypeTraits::type @@ -132,6 +134,8 @@ csrmv_vector(const clsparse::array_base& pAlpha, + " -DWAVE_SIZE=" + std::to_string(wave_size) + " -DSUBWAVE_SIZE=" + std::to_string(subwave_size); + if(control->extended_precision) + params += " -DEXTENDED_PRECISION"; cl::Kernel kernel = KernelCache::get(control->queue, "csrmv_general", diff --git a/src/library/internal/clsparse-control.cpp b/src/library/internal/clsparse-control.cpp index 36234b1..e3a6a25 100644 --- a/src/library/internal/clsparse-control.cpp +++ b/src/library/internal/clsparse-control.cpp @@ -104,6 +104,7 @@ clsparseCreateControl( cl_command_queue queue, clsparseStatus *status ) control->wavefront_size = 0; control->max_wg_size = 0; control->async = false; + control->extended_precision = false; collectEnvParams( control ); @@ -144,6 +145,18 @@ clsparseEnableAsync( clsparseControl control, cl_bool async ) return clsparseSuccess; } +clsparseStatus +clsparseEnableExtendedPrecision( clsparseControl control, cl_bool extended_precision ) +{ + if( control == NULL ) + { + return clsparseInvalidControlObject; + } + + control->extended_precision = extended_precision; + return clsparseSuccess; +} + clsparseStatus clsparseReleaseControl( clsparseControl control ) { diff --git a/src/library/internal/clsparse-control.hpp b/src/library/internal/clsparse-control.hpp index 4b44602..5a722c7 100644 --- a/src/library/internal/clsparse-control.hpp +++ b/src/library/internal/clsparse-control.hpp @@ -54,6 +54,9 @@ struct _clsparseControl size_t wavefront_size; size_t max_wg_size; + // Should we attempt to perform compensated summation? + cl_bool extended_precision; + // current device max compute units; cl_uint max_compute_units; diff --git a/src/library/kernels/csrmv_general.cl b/src/library/kernels/csrmv_general.cl index 1948d02..8908aab 100644 --- a/src/library/kernels/csrmv_general.cl +++ b/src/library/kernels/csrmv_general.cl @@ -51,8 +51,6 @@ R"( # error SUBWAVE_SIZE is not a power of two! #endif -#define EXTENDED_PRECISION - // Knuth's Two-Sum algorithm, which allows us to add together two floating // point numbers and exactly tranform the answer into a sum and a // rounding error. 
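With the flag now carried by the control object, compensated summation becomes a one-call opt-in on the host side. The sketch below shows how a caller might use it; the vector/matrix type names and the clsparseScsrmv argument order are assumptions based on the headers and tests of this series, and all buffer setup and error handling are omitted.

#include <clSPARSE.h>

// Hypothetical helper: run single-precision SpMV with compensated summation enabled.
clsparseStatus spmv_extended(cl_command_queue queue,
                             const clsparseScalar* alpha,
                             const clsparseCsrMatrix* A,
                             const cldenseVector* x,
                             const clsparseScalar* beta,
                             cldenseVector* y)
{
    clsparseStatus status;
    clsparseControl control = clsparseCreateControl(queue, &status);
    if (status != clsparseSuccess)
        return status;

    clsparseEnableExtendedPrecision(control, CL_TRUE);  // the knob added by this patch
    clsparseEnableAsync(control, CL_FALSE);

    status = clsparseScsrmv(alpha, A, x, beta, y, control);

    clsparseReleaseControl(control);
    return status;
}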
diff --git a/src/tests/test-blas2.cpp b/src/tests/test-blas2.cpp index d43b25d..92aaba8 100644 --- a/src/tests/test-blas2.cpp +++ b/src/tests/test-blas2.cpp @@ -33,7 +33,7 @@ clsparseControl ClSparseEnvironment::control = NULL; cl_command_queue ClSparseEnvironment::queue = NULL; cl_context ClSparseEnvironment::context = NULL; //cl_uint ClSparseEnvironment::N = 1024; - +static cl_bool extended_precision = false; namespace po = boost::program_options; namespace uBLAS = boost::numeric::ublas; @@ -132,6 +132,9 @@ class Blas2 : public ::testing::Test clsparseStatus status; cl_int cl_status; + if (extended_precision) + clsparseEnableExtendedPrecision(CLSE::control, true); + if (typeid(T) == typeid(cl_float) ) { status = clsparseScsrmv(&gAlpha, &CSRE::csrSMatrix, &gX, @@ -376,8 +379,8 @@ int main (int argc, char* argv[]) ("alpha,a", po::value(&alpha)->default_value(1.0), "Alpha parameter for eq: \n\ty = alpha * M * x + beta * y") ("beta,b", po::value(&beta)->default_value(1.0), - "Beta parameter for eq: \n\ty = alpha * M * x + beta * y"); - + "Beta parameter for eq: \n\ty = alpha * M * x + beta * y") + ("precision,e", po::bool_switch()->default_value(false), "Use compensated summation to improve accuracy."); po::variables_map vm; po::parsed_options parsed = @@ -421,6 +424,9 @@ int main (int argc, char* argv[]) } + if (vm["precision"].as()) + extended_precision = true; + ::testing::InitGoogleTest(&argc, argv); //order does matter! ::testing::AddGlobalTestEnvironment( new CLSE(pID, dID)); From 690025414c71a9198e094babe9a28ed3339ac8d2 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Thu, 13 Aug 2015 02:11:37 -0500 Subject: [PATCH 20/21] Extending clsparse-bench to allow command-line control of compensated summation. Fixed naming convention for this in test-blas2. 
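The test program above and the benchmark this patch extends both expose the switch through a boost::program_options bool_switch. A minimal, self-contained sketch of that pattern, using the -e/--extended spelling the series settles on; the help text and defaults are illustrative only.

#include <boost/program_options.hpp>
#include <iostream>

namespace po = boost::program_options;

int main(int argc, char* argv[])
{
    po::options_description desc("Options");
    desc.add_options()
        ("help,h", "produce this help message")
        ("extended,e", po::bool_switch()->default_value(false),
         "Use compensated summation to improve accuracy by emulating extended precision");

    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, desc), vm);
    po::notify(vm);

    if (vm.count("help")) { std::cout << desc << "\n"; return 0; }

    const bool extended_precision = vm["extended"].as<bool>();  // note the explicit <bool>
    std::cout << "extended precision: " << std::boolalpha << extended_precision << "\n";
}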
--- .../clsparse-bench/functions/clfunc_xSpMdV.hpp | 3 ++- src/benchmarks/clsparse-bench/src/main.cpp | 14 +++++++++----- src/library/kernels/csrmv_adaptive.cl | 1 + src/library/kernels/csrmv_general.cl | 1 + src/tests/test-blas2.cpp | 4 ++-- 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp b/src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp index ba30512..bf80607 100644 --- a/src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp +++ b/src/benchmarks/clsparse-bench/functions/clfunc_xSpMdV.hpp @@ -25,7 +25,7 @@ template class xSpMdV: public clsparseFunc { public: - xSpMdV( PFCLSPARSETIMER sparseGetTimer, size_t profileCount, cl_device_type devType ): clsparseFunc( devType, CL_QUEUE_PROFILING_ENABLE ), gpuTimer( nullptr ), cpuTimer( nullptr ) + xSpMdV( PFCLSPARSETIMER sparseGetTimer, size_t profileCount, cl_bool extended_precision, cl_device_type devType ): clsparseFunc( devType, CL_QUEUE_PROFILING_ENABLE ), gpuTimer( nullptr ), cpuTimer( nullptr ) { // Create and initialize our timer class, if the external timer shared library loaded if( sparseGetTimer ) @@ -42,6 +42,7 @@ class xSpMdV: public clsparseFunc cpuTimerID = cpuTimer->getUniqueID( "CPU xSpMdV", 0 ); } + clsparseEnableExtendedPrecision( control, extended_precision ); clsparseEnableAsync( control, false ); } diff --git a/src/benchmarks/clsparse-bench/src/main.cpp b/src/benchmarks/clsparse-bench/src/main.cpp index b73b73b..62efa6a 100644 --- a/src/benchmarks/clsparse-bench/src/main.cpp +++ b/src/benchmarks/clsparse-bench/src/main.cpp @@ -120,14 +120,15 @@ int main( int argc, char *argv[ ] ) desc.add_options( ) ( "help,h", "produces this help message" ) ( "dirpath,d", po::value( &root_dir ), "Matrix directory" ) - ( "alpha", po::value( &alpha )->default_value( 1.0f ), "specifies the scalar alpha" ) - ( "beta", po::value( &beta )->default_value( 0.0f ), "specifies the scalar beta" ) + ( "alpha,a", po::value( &alpha )->default_value( 1.0f ), "specifies the scalar alpha" ) + ( "beta,b", po::value( &beta )->default_value( 0.0f ), "specifies the scalar beta" ) ( "rows", po::value( &rows )->default_value( 16 ), "specifies the number of rows for matrix data" ) ( "columns", po::value( &columns )->default_value( 16 ), "specifies the number of columns for matrix data" ) ( "function,f", po::value( &function )->default_value( "SpMdV" ), "Sparse functions to test. 
Options: " "SpMdV, SpMdM, CG, BiCGStab, Csr2Dense, Dense2Csr, Csr2Coo, Coo2Csr" ) ( "precision,r", po::value( &precision )->default_value( "s" ), "Options: s,d,c,z" ) - ( "profile,p", po::value( &profileCount )->default_value( 20 ), "Time and report the kernel speed (default: profiling off)" ) + ( "profile,p", po::value( &profileCount )->default_value( 20 ), "Number of times to run the desired test function" ) + ( "extended,e", po::bool_switch()->default_value(false), "Use compensated summation to improve accuracy by emulating extended precision" ) ; po::variables_map vm; @@ -160,6 +161,9 @@ int main( int argc, char *argv[ ] ) std::cerr << "Could not find the external timing library; timings disabled" << std::endl; } + cl_bool extended_precision = false; + if (vm["extended"].as()) + extended_precision = true; // Timer module discovered and loaded successfully // Initialize function pointers to call into the shared module @@ -170,9 +174,9 @@ int main( int argc, char *argv[ ] ) if( boost::iequals( function, "SpMdV" ) ) { if( precision == "s" ) - my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< float >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU ) ); + my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< float >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU ) ); else if( precision == "d" ) - my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< double >( sparseGetTimer, profileCount, CL_DEVICE_TYPE_GPU ) ); + my_function = std::unique_ptr< clsparseFunc >( new xSpMdV< double >( sparseGetTimer, profileCount, extended_precision, CL_DEVICE_TYPE_GPU ) ); else { std::cerr << "Unknown spmdv precision" << std::endl; diff --git a/src/library/kernels/csrmv_adaptive.cl b/src/library/kernels/csrmv_adaptive.cl index 3018215..2a1f04e 100644 --- a/src/library/kernels/csrmv_adaptive.cl +++ b/src/library/kernels/csrmv_adaptive.cl @@ -300,6 +300,7 @@ VALUE_TYPE sum2_reduce( VALUE_TYPE cur_sum, } #else cur_sum += partial[lid + reduc_size]; + barrier( CLK_LOCAL_MEM_FENCE ); partial[lid] = cur_sum; #endif } diff --git a/src/library/kernels/csrmv_general.cl b/src/library/kernels/csrmv_general.cl index 8908aab..59d36c9 100644 --- a/src/library/kernels/csrmv_general.cl +++ b/src/library/kernels/csrmv_general.cl @@ -170,6 +170,7 @@ VALUE_TYPE sum2_reduce( VALUE_TYPE cur_sum, // with low double-precision calculation rates), but can result in // numerical inaccuracies, especially in single precision. 
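The barrier( CLK_LOCAL_MEM_FENCE ) calls added above and in the csrmv_general.cl hunk that continues just below sit between reading a partner lane's partial value and overwriting one's own slot; without them, a lane can clobber partial[lid] before the lane `round` (or `reduc_size`) positions below it has consumed that value. The sketch below is a host-side analogue of the hazard using std::barrier (C++20); the workgroup size and data are illustrative, and this is not the kernel code.

#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

int main()
{
    constexpr int WG_SIZE = 8;
    std::vector<double> partial(WG_SIZE);
    for (int i = 0; i < WG_SIZE; ++i) partial[i] = i + 1;   // exact sum is 36

    std::barrier sync(WG_SIZE);
    auto lane = [&](int lid) {
        double cur_sum = partial[lid];
        for (int round = WG_SIZE / 2; round > 0; round >>= 1) {
            sync.arrive_and_wait();               // everyone sees the previous round's writes
            if (lid < round)
                cur_sum += partial[lid + round];  // read the partner's value
            sync.arrive_and_wait();               // the barrier this patch adds: don't
                                                  // overwrite until the partner has read
            if (lid < round)
                partial[lid] = cur_sum;
        }
        if (lid == 0) std::printf("reduced sum = %g\n", cur_sum);
    };

    std::vector<std::thread> lanes;
    for (int lid = 0; lid < WG_SIZE; ++lid) lanes.emplace_back(lane, lid);
    for (auto& t : lanes) t.join();
}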
cur_sum += partial[lid + round]; + barrier( CLK_LOCAL_MEM_FENCE ); partial[lid] = cur_sum; #endif } diff --git a/src/tests/test-blas2.cpp b/src/tests/test-blas2.cpp index 92aaba8..2ad19ff 100644 --- a/src/tests/test-blas2.cpp +++ b/src/tests/test-blas2.cpp @@ -380,7 +380,7 @@ int main (int argc, char* argv[]) "Alpha parameter for eq: \n\ty = alpha * M * x + beta * y") ("beta,b", po::value(&beta)->default_value(1.0), "Beta parameter for eq: \n\ty = alpha * M * x + beta * y") - ("precision,e", po::bool_switch()->default_value(false), "Use compensated summation to improve accuracy."); + ("extended,e", po::bool_switch()->default_value(false), "Use compensated summation to improve accuracy by emulating extended precision."); po::variables_map vm; po::parsed_options parsed = @@ -424,7 +424,7 @@ int main (int argc, char* argv[]) } - if (vm["precision"].as()) + if (vm["extended"].as()) extended_precision = true; ::testing::InitGoogleTest(&argc, argv); From ee78e57344f32d77055b1b37ae223a33a73f3f10 Mon Sep 17 00:00:00 2001 From: Joseph Greathouse Date: Thu, 13 Aug 2015 13:54:25 -0500 Subject: [PATCH 21/21] Minor modifications to test-blas2 so that it works OK even though GPU single-precision compensated summation can have errors due to lack of denorm support. --- src/tests/test-blas2.cpp | 58 ++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/src/tests/test-blas2.cpp b/src/tests/test-blas2.cpp index 2ad19ff..25c12c3 100644 --- a/src/tests/test-blas2.cpp +++ b/src/tests/test-blas2.cpp @@ -184,14 +184,29 @@ class Blas2 : public ::testing::Test //printf("\tFloat hY[%d] = %.*e (0x%08" PRIx32 "), ", i, 9, hY[i], *(uint32_t *)&hY[i]); //printf("host_result[%d] = %.*e (0x%08" PRIx32 ")\n", i, 9, host_result[i], *(uint32_t *)&host_result[i]); } - printf("Float Min ulps: %" PRIu64 "\n", min_ulps); - printf("Float Max ulps: %" PRIu64 "\n", max_ulps); - printf("Float Total ulps: %" PRIu64 "\n", total_ulps); - printf("Float Average ulps: %f (Size: %lu)\n", (double)total_ulps/(double)hY.size(), hY.size()); + if (extended_precision) + { + printf("Float Min ulps: %" PRIu64 "\n", min_ulps); + printf("Float Max ulps: %" PRIu64 "\n", max_ulps); + printf("Float Total ulps: %" PRIu64 "\n", total_ulps); + printf("Float Average ulps: %f (Size: %lu)\n", (double)total_ulps/(double)hY.size(), hY.size()); + } for (int i = 0; i < hY.size(); i++) { - double compare_val = fabs(hY[i]*1e-5); + double compare_val = 0.; + if (extended_precision) + { + // The limit here is somewhat weak because some GPUs don't + // support correctly rounded denorms in SPFP mode. 
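The single-precision checks that follow implement a three-part tolerance: a tighter relative bound when compensated summation is enabled, no relative component at all when the reference value is denormal (the isnormal guard, because of the SPFP denorm caveat above), and a floor of a few machine epsilons. A condensed, hypothetical helper capturing that rule is sketched below; the double-precision path in the same test uses 1e-14 and DBL_EPSILON analogously.

#include <cfloat>
#include <cmath>
#include <cstdio>

// Hypothetical condensation of the single-precision tolerance rule used below.
double float_tolerance(float reference, bool extended_precision)
{
    double tol = 0.0;
    if (std::isnormal(reference))
        tol = std::fabs(reference) * (extended_precision ? 1e-3 : 0.1);
    if (tol < 10 * FLT_EPSILON)
        tol = 10 * FLT_EPSILON;
    return tol;
}

int main()
{
    std::printf("tol(1.0f, extended) = %g\n", float_tolerance(1.0f, true));
    std::printf("tol(1.0f, standard) = %g\n", float_tolerance(1.0f, false));
    std::printf("tol(denormal)       = %g\n", float_tolerance(1e-40f, true));
}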
+ if (boost::math::isnormal(hY[i])) + compare_val = fabs(hY[i]*1e-3); + } + else + { + if (boost::math::isnormal(hY[i])) + compare_val = fabs(hY[i]*0.1); + } if (compare_val < 10*FLT_EPSILON) compare_val = 10*FLT_EPSILON; ASSERT_NEAR(hY[i], host_result[i], compare_val); @@ -255,17 +270,32 @@ class Blas2 : public ::testing::Test //printf("\tDouble hY[%d] = %.*e (0x%016" PRIx64 "), ", i, 17, hY[i], *(uint64_t *)&hY[i]); //printf("host_result[%d] = %.*e (0x%016" PRIx64 ")\n", i, 17, host_result[i], *(uint64_t *)&host_result[i]); } - printf("Double Min ulps: %" PRIu64 "\n", min_ulps); - printf("Double Max ulps: %" PRIu64 "\n", max_ulps); - printf("Double Total ulps: %" PRIu64 "\n", total_ulps); - printf("Double Average ulps: %f (Size: %lu)\n", (double)total_ulps/(double)hY.size(), hY.size()); + if (extended_precision) + { + printf("Double Min ulps: %" PRIu64 "\n", min_ulps); + printf("Double Max ulps: %" PRIu64 "\n", max_ulps); + printf("Double Total ulps: %" PRIu64 "\n", total_ulps); + printf("Double Average ulps: %f (Size: %lu)\n", (double)total_ulps/(double)hY.size(), hY.size()); - for (int i = 0; i < hY.size(); i++) + for (int i = 0; i < hY.size(); i++) + { + double compare_val = fabs(hY[i]*1e-14); + if (compare_val < 10*DBL_EPSILON) + compare_val = 10*DBL_EPSILON; + ASSERT_NEAR(hY[i], host_result[i], compare_val); + } + } + else { - double compare_val = fabs(hY[i]*1e-14); - if (compare_val < 10*DBL_EPSILON) - compare_val = 10*DBL_EPSILON; - ASSERT_NEAR(hY[i], host_result[i], compare_val); + for (int i = 0; i < hY.size(); i++) + { + double compare_val = 0.; + if (boost::math::isnormal(hY[i])) + compare_val = fabs(hY[i]*0.1); + if (compare_val < 10*DBL_EPSILON) + compare_val = 10*DBL_EPSILON; + ASSERT_NEAR(hY[i], host_result[i], compare_val); + } } cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values,