 from typing import Tuple
 from torch import Tensor
 
-from .cextension import COMPILED_WITH_CUDA, lib
+from .cextension import COMPILED_WITH_CUDA, lib, HIP_ENVIRONMENT
 
 # Remark: for AMD GPU we need to disable blocksize == 64
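The new HIP_ENVIRONMENT flag gates every ROCm-specific branch in this patch. Its actual definition lives in .cextension and is not shown in this diff; a minimal sketch of how such a flag can be derived, assuming a ROCm build of PyTorch (where torch.version.hip is a version string rather than None):

    import torch

    # Hypothetical stand-in for the flag imported from .cextension above:
    # ROCm builds of PyTorch expose a HIP version string, CUDA builds expose None.
    HIP_ENVIRONMENT = getattr(torch.version, "hip", None) is not None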
@@ -458,7 +458,11 @@ def get_transform_buffer(
     state = (shape[::-1], to_order)
 
     if to_order == "row" or to_order == "col":
-        return init_func(shape, dtype=dtype, device=device), state
+        if HIP_ENVIRONMENT and to_order == "col":
+            # row to col transformation transposes the output shape, so change the buffer allocation accordingly
+            return init_func(shape[::-1], dtype=dtype, device=device), state
+        else:
+            return init_func(shape, dtype=dtype, device=device), state
     elif to_order == "col32":
         # blocks of 32 columns (padded)
         cols = 32 * ((cols + 31) // 32)
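The hunk above only changes the allocation: under HIP a row-to-col transform writes its output transposed, so the buffer must be allocated with the flipped shape while state keeps recording (shape[::-1], to_order) as before. A toy illustration of the shape bookkeeping, with torch.empty standing in for init_func:

    import torch

    shape = (4, 16)                       # logical (rows, cols) of the source tensor
    state = (shape[::-1], "col")          # state stores the transposed shape plus target order

    buf_cuda = torch.empty(shape)         # CUDA path: buffer keeps the original shape
    buf_hip = torch.empty(shape[::-1])    # HIP path: buffer is allocated transposed
    print(buf_cuda.shape, buf_hip.shape)  # torch.Size([4, 16]) torch.Size([16, 4])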
@@ -486,6 +490,10 @@ def nvidia_transform(
     state=None,
     ld=None,
 ):
+    if HIP_ENVIRONMENT:
+        to_order = "col" if to_order in ["col32", "col_turing", "col_ampere"] else to_order
+        from_order = "col" if from_order in ["col32", "col_turing", "col_ampere"] else from_order
+
     if state is None:
         state = (A.shape, from_order)
     else:
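ROCm has no equivalent of the tiled CUDA layouts, so any of col32, col_turing, or col_ampere collapses to plain column-major before the rest of nvidia_transform runs. The same normalization as a standalone helper, for clarity only (the name normalize_order is ours, not library API):

    _TILED_ORDERS = {"col32", "col_turing", "col_ampere"}

    def normalize_order(order: str) -> str:
        # Map any CUDA tiled layout onto plain column-major for HIP.
        return "col" if order in _TILED_ORDERS else order

    assert normalize_order("col32") == "col"
    assert normalize_order("row") == "row"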
@@ -1715,23 +1723,38 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16)
 
     if dimsA == 2 and out is None:
-        out, Sout = get_transform_buffer(
-            (shapeA[0], shapeB[0]), dtype, A.device, "col32", "row"
-        )
+        if HIP_ENVIRONMENT:
+            out, Sout = get_transform_buffer(
+                (shapeA[0], shapeB[0]), dtype, A.device, "col", "row"
+            )
+        else:
+            out, Sout = get_transform_buffer(
+                (shapeA[0], shapeB[0]), dtype, A.device, "col32", "row"
+            )
     elif dimsA == 3 and out is None:
-        out, Sout = get_transform_buffer(
-            (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row"
-        )
+        if HIP_ENVIRONMENT:
+            out, Sout = get_transform_buffer(
+                (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col", "row"
+            )
+        else:
+            out, Sout = get_transform_buffer(
+                (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row"
+            )
 
     assert dimsB != 3, "len(B.shape)==3 not supported"
     assert A.device.type == "cuda"
     assert B.device.type == "cuda"
     assert A.dtype == torch.int8
     assert B.dtype == torch.int8
     assert out.dtype == dtype
-    assert SA[1] == "col32"
-    assert SB[1] in ["col_turing", "col_ampere"]
-    assert Sout[1] == "col32"
+    if HIP_ENVIRONMENT:
+        assert SA[1] == "col"
+        assert SB[1] == "col"
+        assert Sout[1] == "col"
+    else:
+        assert SA[1] == "col32"
+        assert SB[1] in ["col_turing", "col_ampere"]
+        assert Sout[1] == "col32"
     assert (
         shapeA[-1] == shapeB[-1]
     ), f"Matmullt only supports A @ B^T. Inner matrix dimensions do not match: A @ B = {shapeA} @ {shapeB}"
@@ -1745,25 +1768,29 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
     ptrC = get_ptr(out)
 
     k = shapeA[-1]
-    lda = ct.c_int32(m * 32)
-    if formatB == "col_turing":
-        # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
-        # n = rows
-        ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
+    if HIP_ENVIRONMENT:
+        lda = ct.c_int32(m)
+        ldb = ct.c_int32(shapeB[0])
+        ldc = ct.c_int32(m)
     else:
-        # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
-        # n = rows
-        ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
-
-    ldc = ct.c_int32(m * 32)
+        lda = ct.c_int32(m * 32)
+        if formatB == "col_turing":
+            # turing: tiles with rows filled up to multiple of 8 rows by 32 columns
+            # n = rows
+            ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32)
+        else:
+            # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns
+            # n = rows
+            ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32)
+        ldc = ct.c_int32(m * 32)
     m = ct.c_int32(m)
     n = ct.c_int32(n)
     k = ct.c_int32(k)
 
     has_error = 0
     ptrRowScale = get_ptr(None)
     is_on_gpu([A, B, out])
-    if formatB == 'col_turing':
+    if formatB == 'col_turing' or HIP_ENVIRONMENT:
         if dtype == torch.int32:
             has_error = lib.cigemmlt_turing_32(
                 ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc
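The leading dimensions mirror the physical layouts: plain column-major under HIP (lda = m, ldb = shapeB[0], ldc = m), versus the CUDA tiled formats, where rows are padded up to the tile height and then multiplied by the 32-column tile width. A worked check of the padding arithmetic for rows = 13:

    rows = 13

    # turing: pad rows to a multiple of 8; tiles are 8 rows x 32 columns
    ldb_turing = ((rows + 7) // 8) * 8 * 32     # (20 // 8) * 8 * 32 = 2 * 8 * 32 = 512

    # ampere: pad rows to a multiple of 32; tiles are 32 rows x 32 columns
    ldb_ampere = ((rows + 31) // 32) * 32 * 32  # (44 // 32) * 32 * 32 = 1 * 32 * 32 = 1024

    print(ldb_turing, ldb_ampere)  # 512 1024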
@@ -2072,6 +2099,9 @@ def double_quant(
 
 
 def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None):
+    if HIP_ENVIRONMENT:
+        return nvidia_transform(A, to_order, from_order, out, transpose, state, ld)
+
     prev_device = pre_call(A.device)
     if state is None: state = (A.shape, from_order)
     else: from_order = state[1]
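With this early return, every transform call on ROCm is routed through nvidia_transform, which (per the hunk at line 490 above) first collapses the tiled orders to plain col, so callers can keep passing CUDA layout names unchanged. A hedged usage sketch, assuming a ROCm build where HIP_ENVIRONMENT is True (ROCm PyTorch still uses the "cuda" device string):

    import torch
    from bitsandbytes.functional import transform

    A = torch.randint(-128, 127, (8, 32), dtype=torch.int8, device="cuda")

    # Caller passes the CUDA layout name; under HIP the chain is
    # transform -> nvidia_transform, which rewrites "col32" to "col".
    out, Sout = transform(A, to_order="col32")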