
Commit 95a915b

support slicing/indexing of shared/global tensors
Signed-off-by: Yaoyao Ding <[email protected]>
Parent commit: 10e41b1

16 files changed: +256 additions, -20 deletions


python/tilus/backends/codegen.py

Lines changed: 2 additions & 2 deletions
@@ -472,8 +472,8 @@ def visit_Function(self, func: Function) -> IRModule:
         if self.smem_workspace:
             self.free_shared_value(self.smem_workspace)
             self.smem_workspace = None
-        if self.smem_allocator.allocated != 0:
-            raise ValueError("Shared memory is not properly allocated/freed")
+        # if self.smem_allocator.allocated != 0:
+        #     raise ValueError("Shared memory is not properly allocated/freed")
         if self.smem_allocator.maximum_allocated > get_current_target().properties.shared_memory_per_block:
             raise CodeGenerationFailed(
                 "Request shared memory {} bytes, but the device only allows {} bytes.".format(

python/tilus/backends/emitters/gmem.py

Lines changed: 22 additions & 1 deletion
@@ -15,7 +15,8 @@
 from hidet.ir.expr import Expr

 from tilus.backends.codegen import BaseInstEmitter, register_emitter
-from tilus.ir.instructions import AllocateGlobalInst, GlobalViewInst
+from tilus.ir import GlobalTensor
+from tilus.ir.instructions import AllocateGlobalInst, GlobalIndexInst, GlobalSliceInst, GlobalViewInst
 from tilus.utils import cdiv


@@ -34,3 +35,23 @@ def emit(self, inst: AllocateGlobalInst) -> None:
         )
         var = self.get_or_allocate_var(tensor)
         self.assign(var, ptr)
+
+
+@register_emitter(GlobalIndexInst)
+class GlobalIndexInstEmitter(BaseInstEmitter):
+    def emit(self, inst: GlobalIndexInst) -> None:
+        dst = inst.dst
+        tensor = inst.inputs[0].as_global_tensor()
+        var = self.get_or_allocate_var(tensor)
+        offset = tensor.layout(*inst.indices)
+        self.assign(dst, value=var[offset])
+
+
+@register_emitter(GlobalSliceInst)
+class GlobalSliceInstEmitter(BaseInstEmitter):
+    def emit(self, inst: GlobalSliceInst) -> None:
+        input_tensor: GlobalTensor = inst.global_input
+        output_tensor: GlobalTensor = inst.global_output
+        slice_offset = input_tensor.layout(*inst.offsets)
+        output_var = self.get_or_allocate_var(output_tensor)
+        self.assign(output_var, ~self.tensor2var[input_tensor][slice_offset])
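
In plain terms, GlobalIndexInst reads one element at layout(*indices), while GlobalSliceInst rebinds the output tensor's variable to the address of input[layout(*offsets)]. A minimal, self-contained sketch of that semantics in plain Python (a list stands in for global memory and a lambda for the layout; none of the names below are the tilus API):

    # Illustrative only: model a 4x8 row-major global tensor as a flat Python list.
    buffer = list(range(32))
    layout = lambda i, j: i * 8 + j   # offset function, analogous to tensor.layout(*indices)

    # GlobalIndexInst: dst receives the element at layout(*indices).
    dst = buffer[layout(2, 3)]        # element (2, 3) -> 19

    # GlobalSliceInst: the output variable points at base + layout(*offsets);
    # here the "pointer" is modeled as the starting offset of the slice.
    slice_start = layout(1, 0)        # slice selecting row 1
    row1 = buffer[slice_start:slice_start + 8]

    assert dst == 19 and row1 == list(range(8, 16))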

python/tilus/backends/emitters/smem.py

Lines changed: 11 additions & 1 deletion
@@ -18,7 +18,7 @@
 from hidet.ir.type import tensor_pointer_type

 from tilus.backends.codegen import BaseInstEmitter, register_emitter
-from tilus.ir.instructions import AllocateSharedInst, FreeSharedInst, SharedSliceInst
+from tilus.ir.instructions import AllocateSharedInst, FreeSharedInst, SharedIndexInst, SharedSliceInst
 from tilus.ir.tensor import SharedTensor


@@ -62,3 +62,13 @@ def emit(self, inst: SharedSliceInst) -> None:
             tp=int32,
             init=self.shared_tensor_shared_space_addr[shared_input] + slice_offset * shared_input.dtype.nbytes,
         )
+
+
+@register_emitter(SharedIndexInst)
+class SharedIndexInstEmitter(BaseInstEmitter):
+    def emit(self, inst: SharedIndexInst) -> None:
+        dst = inst.dst
+        tensor = inst.shared_input
+        var = self.get_or_allocate_var(tensor)
+        offset = tensor.layout(*inst.indices)
+        self.assign(dst, value=var[offset])

python/tilus/ir/builders/stmt_builder.py

Lines changed: 45 additions & 3 deletions
@@ -51,6 +51,8 @@
     ExitInst,
     FormatPrintInst,
     FreeSharedInst,
+    GlobalIndexInst,
+    GlobalSliceInst,
     GlobalViewInst,
     LoadGlobalGenericInst,
     LoadGlobalInst,
@@ -62,6 +64,7 @@
     ReduceInst,
     RepeatInst,
     RepeatInterleaveInst,
+    SharedIndexInst,
     SharedSliceInst,
     SqueezeInst,
     StoreGlobalGenericInst,
@@ -285,8 +288,10 @@ def brk(self):
         stmt = BreakStmt()
         self._stack[-1].append(stmt)

-    def declare(self, type: BaseType, init: Optional[Expr | float | int] = None) -> Var:
-        var = Var("v", type=type)
+    def declare(self, type: BaseType, init: Optional[Expr | float | int] = None, hint: Optional[str] = None) -> Var:
+        if hint is None:
+            hint = "v"
+        var = Var(hint, type=type)
         self.append(DeclareStmt(var, as_expr(init) if init is not None else None))
         return var

@@ -364,6 +369,33 @@ def allocate_global(
         self.append(inst)
         return inst.global_output

+    def slice_global(
+        self,
+        tensor: GlobalTensor,
+        offsets: Sequence[Expr | int],
+        slice_dims: Sequence[int],
+        slice_shape: Sequence[Expr | int],
+    ) -> GlobalTensor:
+        offsets_ = [as_expr(offset) for offset in offsets]
+        inst = GlobalSliceInst.create(
+            tensor=tensor,
+            offsets=offsets_,
+            dims=slice_dims,
+            shape=slice_shape,
+        )
+        self.append(inst)
+        return inst.global_output
+
+    def index_global(
+        self,
+        dst: Var,
+        tensor: GlobalTensor,
+        indices: Sequence[Expr | int],
+    ) -> None:
+        indices_ = [as_expr(index) for index in indices]
+        inst = GlobalIndexInst.create(dst=dst, tensor=tensor, indices=indices_)
+        self.append(inst)
+
     def assign_register(self, output: RegisterTensor, x: RegisterTensor) -> None:
         inst = AssignInst.create(output, x)
         self.append(inst)
@@ -722,7 +754,7 @@ def free_shared(self, shared_value: SharedTensor) -> None:
         inst = FreeSharedInst.create(shared_value)
         self.append(inst)

-    def shared_slice(
+    def slice_shared(
         self,
         tensor: SharedTensor,
         offsets: Sequence[Expr | int],
@@ -739,6 +771,16 @@ def shared_slice(
         self.append(inst)
         return inst.shared_output

+    def index_shared(
+        self,
+        dst: Var,
+        tensor: SharedTensor,
+        indices: Sequence[Expr | int],
+    ) -> None:
+        indices_ = [as_expr(index) for index in indices]
+        inst = SharedIndexInst.create(dst=dst, tensor=tensor, indices=indices_)
+        self.append(inst)
+
     def load_shared(
         self,
         src: SharedTensor,
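
The two entry points follow different conventions: slice_global/slice_shared return a new tensor view, while index_global/index_shared write the loaded scalar into a caller-supplied dst variable (typically created with declare). The calling pattern is sketched below with a mock builder standing in for the real statement builder; nothing in this snippet is the tilus API, it only records instructions to show the shape of the interaction:

    from dataclasses import dataclass, field


    @dataclass
    class MockBuilder:
        """Stand-in for the statement builder; records instructions instead of emitting IR."""

        program: list = field(default_factory=list)

        def slice_global(self, tensor, offsets, slice_dims, slice_shape):
            view = f"{tensor}[{offsets}; dims={slice_dims}; shape={slice_shape}]"
            self.program.append(("GlobalSliceInst", tensor, view))
            return view  # slicing returns the new tensor view

        def index_global(self, dst, tensor, indices):
            self.program.append(("GlobalIndexInst", tensor, tuple(indices), dst))
            # indexing returns nothing: the scalar is assigned to `dst`


    sb = MockBuilder()
    row = sb.slice_global("g", offsets=[2, 0], slice_dims=[1], slice_shape=[128])
    sb.index_global(dst="v0", tensor="g", indices=[2, 5])
    for inst in sb.program:
        print(inst)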

python/tilus/ir/inst.py

Lines changed: 7 additions & 0 deletions
@@ -60,6 +60,13 @@ def shared_input(self) -> SharedTensor:
         assert isinstance(x, SharedTensor)
         return x

+    @property
+    def global_input(self) -> GlobalTensor:
+        assert len(self.inputs) == 1
+        x = self.inputs[0]
+        assert isinstance(x, GlobalTensor)
+        return x
+
     @property
     def attributes(self) -> dict[str, Any]:
         attrs = {}

python/tilus/ir/instructions/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -43,6 +43,8 @@
     ExitInst,
     FormatPrintInst,
     FreeSharedInst,
+    GlobalIndexInst,
+    GlobalSliceInst,
     GlobalViewInst,
     LoadGlobalGenericInst,
     LoadGlobalInst,
@@ -54,6 +56,7 @@
     ReduceInst,
     RepeatInst,
     RepeatInterleaveInst,
+    SharedIndexInst,
     SharedSliceInst,
     ShuffleDownInst,
     ShuffleUpInst,

python/tilus/ir/instructions/generic.py

Lines changed: 62 additions & 1 deletion
@@ -72,6 +72,48 @@ def create(dst: GlobalTensor, x: RegisterTensor, offsets: Sequence[Expr], dims:
         return StoreGlobalInst(output=None, inputs=(dst, x), offsets=tuple(offsets), dims=tuple(dims))


+@dataclass(frozen=True, eq=False)
+class GlobalSliceInst(Instruction):
+    offsets: tuple[Expr, ...]
+    dims: Optional[tuple[int, ...]]
+
+    @staticmethod
+    def create(
+        tensor: GlobalTensor,
+        offsets: Sequence[Expr],
+        dims: Sequence[int],
+        shape: Sequence[Expr | int],
+    ) -> GlobalSliceInst:
+        from tilus.ir.layout.global_layout import global_slice
+
+        output = GlobalTensor.create(dtype=tensor.dtype, layout=global_slice(tensor.layout, offsets, dims, shape))
+        return GlobalSliceInst(
+            output=output,
+            inputs=(tensor,),
+            offsets=tuple(offsets),
+            dims=tuple(dims) if len(dims) < len(tensor.shape) else None,
+        )
+
+
+@dataclass(frozen=True, eq=False)
+class GlobalIndexInst(Instruction):
+    dst: Var
+    indices: tuple[Expr, ...]
+
+    @staticmethod
+    def create(
+        dst: Var,
+        tensor: GlobalTensor,
+        indices: Sequence[Expr],
+    ) -> GlobalIndexInst:
+        return GlobalIndexInst(
+            output=None,
+            inputs=(tensor,),
+            dst=dst,
+            indices=tuple(indices),
+        )
+
+
 @dataclass(frozen=True, eq=False)
 class LoadSharedInst(Instruction):
     @staticmethod
@@ -103,7 +145,26 @@ def create(
             output=output,
             inputs=(tensor,),
             offsets=tuple(offsets),
-            dims=tuple(dims) if len(dims) < len(tensor.shape) else None,
+            dims=tuple(dims) if len(dims) < len(tensor.shape) else tuple(range(len(tensor.shape))),
+        )
+
+
+@dataclass(frozen=True, eq=False)
+class SharedIndexInst(Instruction):
+    dst: Var
+    indices: tuple[Expr, ...]
+
+    @staticmethod
+    def create(
+        dst: Var,
+        tensor: SharedTensor,
+        indices: Sequence[Expr],
+    ) -> SharedIndexInst:
+        return SharedIndexInst(
+            output=None,
+            inputs=(tensor,),
+            dst=dst,
+            indices=tuple(indices),
         )


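A small behavioral change rides along in SharedSliceInst.create: when every dimension is sliced, dims is now stored as the full index range instead of None. The new normalization, isolated in plain Python (not the tilus types):

    def normalize_dims(dims, rank):
        # Mirrors the updated SharedSliceInst.create: always a tuple, never None.
        return tuple(dims) if len(dims) < rank else tuple(range(rank))


    assert normalize_dims([1], rank=2) == (1,)       # partial slice: unchanged behavior
    assert normalize_dims([0, 1], rank=2) == (0, 1)  # full slice: previously stored as None
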
python/tilus/ir/layout/global_layout.py

Lines changed: 40 additions & 0 deletions
@@ -218,3 +218,43 @@ def f_offset(axes: Sequence[Var]) -> Expr:
         return sum([axes[i] * strides[i] for i in range(len(shape))], start=int32.zero)

     return GlobalLayout.create(shape=shape, size=prod(shape), f_offset=f_offset)
+
+
+def global_slice(
+    layout: GlobalLayout, offsets: Sequence[Expr | int], dims: Sequence[int], shape: Sequence[Expr | int]
+) -> GlobalLayout:
+    """Create a sliced global layout from an existing layout.
+
+    This function creates a new global layout by slicing an existing global layout. The slicing is defined by the
+    specified offsets, dimensions to slice, and the shape of the resulting layout. The new layout retains the mapping
+    function of the original layout, adjusted for the specified offsets and dimensions.
+
+    Parameters
+    ----------
+    layout: GlobalLayout
+        The original global layout to be sliced.
+    offsets: Sequence[Expr | int]
+        The offsets for each dimension of the original layout. It should have the same length as the original layout's
+        shape.
+    dims: Sequence[int]
+        The dimensions to be sliced from the original layout. Each dimension should be a valid index in the original
+        layout's shape.
+    shape: Sequence[Expr | int]
+        The shape of the resulting sliced global layout. It should have the same length as the number of dimensions
+        specified in `dims`.
+
+    Returns
+    -------
+    ret: GlobalLayout
+        A new global layout that represents the sliced version of the original layout, with the specified shape and
+        adjusted mapping function.
+    """
+    assert len(dims) == len(shape) <= len(layout.shape) == len(offsets)
+
+    def f_offset(axes: Sequence[Var]) -> Expr:
+        indices = list(offsets)
+        for dim, axis in zip(dims, axes):
+            indices[dim] = axis + offsets[dim]
+        return layout(*indices) - layout(*offsets)  # type: ignore[arg-type]
+
+    return GlobalLayout.create(shape=shape, size=prod(shape), f_offset=f_offset)
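
For intuition, the offset arithmetic that global_slice performs can be sketched with plain Python functions standing in for GlobalLayout (the helper names below are illustrative, not part of tilus):

    def row_major_offset(shape):
        # Offset function of a row-major layout: f(i0, i1, ...) -> linear offset.
        def f(*indices):
            offset = 0
            for extent, index in zip(shape, indices):
                offset = offset * extent + index
            return offset
        return f


    def sliced_offset(parent, offsets, dims):
        # Mirror of global_slice's f_offset: shift the sliced dims by `offsets`
        # and make the slice origin map to offset zero.
        def f(*axes):
            indices = list(offsets)
            for dim, axis in zip(dims, axes):
                indices[dim] = axis + offsets[dim]
            return parent(*indices) - parent(*offsets)
        return f


    parent = row_major_offset([4, 8])                       # a 4x8 row-major tensor
    view = sliced_offset(parent, offsets=[1, 2], dims=[1])  # 1-D slice: row 1, columns 2..7
    assert view(0) == 0   # element (1, 2) is the slice origin
    assert view(3) == 3   # element (1, 5) sits 3 elements past the origin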

python/tilus/ir/layout/inference/inference_rules/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from . import (
+    allocate_shared,
     assign,
     cp_async,
     elementwise_binary,
python/tilus/ir/layout/inference/inference_rules/allocate_shared.py

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from tilus.ir.instructions import AllocateSharedInst
+from tilus.ir.layout.inference.rule import LayoutInferenceContext, LayoutInferenceRule, register_rule
+from tilus.ir.layout.shared_layout import SharedLayout, shared_row_major
+from tilus.ir.tensor import SharedTensor
+
+
+@register_rule(AllocateSharedInst)
+class AllocateSharedRule(LayoutInferenceRule):
+    @staticmethod
+    def inference(ctx: LayoutInferenceContext, inst: AllocateSharedInst) -> dict[SharedTensor, SharedLayout]:
+        tensor = inst.shared_output
+
+        if tensor.optional_layout is not None:
+            return {}
+        else:
+            return {tensor: shared_row_major(*tensor.shape)}
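
The fallback this rule applies is shared_row_major(*tensor.shape). Assuming that names the usual row-major stride scheme, the arithmetic it implies is sketched below in plain Python (illustrative only; it is not the tilus implementation):

    def row_major_strides(shape):
        # Innermost dimension is contiguous; each outer stride is the product of the inner extents.
        strides = []
        running = 1
        for extent in reversed(shape):
            strides.append(running)
            running *= extent
        return list(reversed(strides))


    assert row_major_strides([2, 4, 8]) == [32, 8, 1]
    # offset of element (i, j, k) is then i * 32 + j * 8 + k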
