     PoolingMode,
     rounded_row_size_in_bytes,
 )
+from fbgemm_gpu.tbe.cache.kv_embedding_ops_inference import KVEmbeddingInference
 from torchrec.distributed.batched_embedding_kernel import (
     BaseBatchedEmbedding,
     BaseBatchedEmbeddingBag,
@@ -119,6 +120,32 @@ def _quantize_weight(
     return quant_weight_list


+def _get_shard_offsets_for_kv_zch(
+    config: GroupedEmbeddingConfig,
+    shard_index: int,
+) -> List[int]:
+    """
+    Given that kv zch tables are rw-sharded, get the row offset of each shard
+    to be used within the kv zch lookup kernel.
+    """
+    shard_row_offsets = []
+    for table in config.embedding_tables:
+        assert (
+            table.global_metadata is not None
+        ), f"Expected global_metadata to be populated for table {table.name} to get shard offsets for kv zch look up kernel"
+        assert (
+            len(table.global_metadata.shards_metadata) > shard_index
+        ), f"Expected table {table.name} to have more shards than shard index {shard_index}. Found {len(table.global_metadata.shards_metadata)} shards"
+        shard_row_offsets.append(
+            # pyre-ignore: Undefined attribute [16]
+            table.global_metadata.shards_metadata[shard_index].shard_offsets[0]
+        )
+    logger.info(
+        f"Shard row offsets for kv zch look up table {config.embedding_names=}: {shard_row_offsets=}"
+    )
+    return shard_row_offsets
+
+
 def _get_runtime_device(
     device: Optional[torch.device],
     config: GroupedEmbeddingConfig,
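
As a quick illustration of what the new helper returns: for a given shard index it walks every table in the group and picks up that shard's row offset (the first entry of its `shard_offsets`) from the table's global sharding metadata. The sketch below uses hypothetical stand-in dataclasses instead of torchrec's real `GroupedEmbeddingConfig`/`ShardMetadata` types, purely to show the offset lookup.

```python
# Minimal sketch of the offset lookup performed by _get_shard_offsets_for_kv_zch.
# FakeShardMetadata / FakeTable are hypothetical stand-ins for torchrec's real
# ShardMetadata and embedding-table config objects, used only for illustration.
from dataclasses import dataclass
from typing import List


@dataclass
class FakeShardMetadata:
    shard_offsets: List[int]  # [row_offset, col_offset] of this shard


@dataclass
class FakeTable:
    name: str
    shards_metadata: List[FakeShardMetadata]  # one entry per rw shard


def shard_row_offsets_for(tables: List[FakeTable], shard_index: int) -> List[int]:
    """Return the row offset of shard `shard_index` for every table in the group."""
    return [t.shards_metadata[shard_index].shard_offsets[0] for t in tables]


# Two tables, each row-wise sharded across two ranks (100 and 50 rows on rank 0).
tables = [
    FakeTable("t0", [FakeShardMetadata([0, 0]), FakeShardMetadata([100, 0])]),
    FakeTable("t1", [FakeShardMetadata([0, 0]), FakeShardMetadata([50, 0])]),
]
assert shard_row_offsets_for(tables, shard_index=1) == [100, 50]
```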
@@ -237,13 +264,16 @@ def __init__(
         super().__init__(config, pg, device)

         managed: List[EmbeddingLocation] = []
+        is_virtual_table: bool = False
         for table in config.embedding_tables:
             if device is not None and device.type == "cuda":
                 managed.append(
                     compute_kernel_to_embedding_location(table.compute_kernel)
                 )
             else:
                 managed.append(EmbeddingLocation.HOST)
+            if table.use_virtual_table:
+                is_virtual_table = True
         self._config: GroupedEmbeddingConfig = config
         self._emb_module_registered: bool = is_fused_param_register_tbe(fused_params)
         self._is_weighted: Optional[bool] = config.is_weighted
@@ -284,9 +314,21 @@ def __init__(

         if self.lengths_to_tbe:
             tbe_clazz = IntNBitTableBatchedEmbeddingBagsCodegenWithLength
+        elif is_virtual_table:
+            tbe_clazz = KVEmbeddingInference
         else:
             tbe_clazz = IntNBitTableBatchedEmbeddingBagsCodegen

+        if is_virtual_table:
+            assert (
+                shard_index is not None and shard_index >= 0
+            ), "valid shard_index must be provided for kv zch batch embedding to compute shard offsets"
+            shard_offsets_for_kv_zch = _get_shard_offsets_for_kv_zch(
+                config, shard_index
+            )
+        else:
+            shard_offsets_for_kv_zch = None
+
         self._emb_module: IntNBitTableBatchedEmbeddingBagsCodegen = tbe_clazz(
             embedding_specs=embedding_specs,
             device=device,
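
For readers skimming the hunk above, the selection order is: the lengths-based TBE variant wins first, otherwise any group containing a virtual (kv zch) table gets `KVEmbeddingInference`, and everything else falls back to `IntNBitTableBatchedEmbeddingBagsCodegen`; shard offsets are only derived on the virtual-table path. Below is a minimal sketch of that dispatch, with hypothetical stand-in classes in place of the real fbgemm_gpu kernels.

```python
# Hedged sketch of the kernel-class dispatch added above; the classes here are
# hypothetical stand-ins for the real fbgemm_gpu TBE inference modules.
from typing import Optional, Tuple


class IntNBitTBE:  # stand-in for IntNBitTableBatchedEmbeddingBagsCodegen
    pass


class IntNBitTBEWithLength(IntNBitTBE):  # stand-in for ...CodegenWithLength
    pass


class KVEmbedding:  # stand-in for KVEmbeddingInference
    pass


def pick_tbe_clazz(
    lengths_to_tbe: bool,
    is_virtual_table: bool,
    shard_index: Optional[int],
) -> Tuple[type, bool]:
    """Mirror the branch order: lengths path first, then kv zch, then default.

    Returns the chosen class and whether shard offsets must be computed
    (via _get_shard_offsets_for_kv_zch in the actual code)."""
    if lengths_to_tbe:
        clazz: type = IntNBitTBEWithLength
    elif is_virtual_table:
        clazz = KVEmbedding
    else:
        clazz = IntNBitTBE
    if is_virtual_table:
        # kv zch needs a valid shard index to derive per-table row offsets.
        assert shard_index is not None and shard_index >= 0
        return clazz, True
    return clazz, False


assert pick_tbe_clazz(False, True, shard_index=0) == (KVEmbedding, True)
assert pick_tbe_clazz(True, False, shard_index=None) == (IntNBitTBEWithLength, False)
```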
@@ -448,13 +490,16 @@ def __init__(
         super().__init__(config, pg, device)

         managed: List[EmbeddingLocation] = []
+        is_virtual_table = False
         for table in config.embedding_tables:
             if device is not None and device.type == "cuda":
                 managed.append(
                     compute_kernel_to_embedding_location(table.compute_kernel)
                 )
             else:
                 managed.append(EmbeddingLocation.HOST)
+            if table.use_virtual_table:
+                is_virtual_table = True
         self._config: GroupedEmbeddingConfig = config
         self._emb_module_registered: bool = is_fused_param_register_tbe(fused_params)
         self._quant_state_dict_split_scale_bias: bool = (
@@ -465,37 +510,52 @@ def __init__(
         )
         # 16 for CUDA, 1 for others like CPU and MTIA.
         self._tbe_row_alignment: int = 16 if self._runtime_device.type == "cuda" else 1
-        self._emb_module: IntNBitTableBatchedEmbeddingBagsCodegen = (
-            IntNBitTableBatchedEmbeddingBagsCodegen(
-                embedding_specs=[
-                    (
-                        table.name,
-                        local_rows,
-                        (
-                            local_cols
-                            if self._quant_state_dict_split_scale_bias
-                            else table.embedding_dim
-                        ),
-                        data_type_to_sparse_type(table.data_type),
-                        location,
-                    )
-                    for local_rows, local_cols, table, location in zip(
-                        self._local_rows,
-                        self._local_cols,
-                        config.embedding_tables,
-                        managed,
-                    )
-                ],
-                device=device,
-                pooling_mode=PoolingMode.NONE,
-                feature_table_map=self._feature_table_map,
-                row_alignment=self._tbe_row_alignment,
-                uvm_host_mapped=True,  # Use cudaHostAlloc for UVM CACHING to fix imbalance numa memory issue
-                feature_names_per_table=[
-                    table.feature_names for table in config.embedding_tables
-                ],
-                **(tbe_fused_params(fused_params) or {}),
+        embedding_clazz = (
+            KVEmbeddingInference
+            if is_virtual_table
+            else IntNBitTableBatchedEmbeddingBagsCodegen
+        )
+        if is_virtual_table:
+            assert (
+                shard_index is not None and shard_index >= 0
+            ), "valid shard_index must be provided for kv zch batch embedding to compute shard offsets"
+            shard_offsets_for_kv_zch = _get_shard_offsets_for_kv_zch(
+                config, shard_index
             )
+        else:
+            shard_offsets_for_kv_zch = None
+
+        self._emb_module: (
+            IntNBitTableBatchedEmbeddingBagsCodegen | KVEmbeddingInference
+        ) = embedding_clazz(
+            embedding_specs=[
+                (
+                    table.name,
+                    local_rows,
+                    (
+                        local_cols
+                        if self._quant_state_dict_split_scale_bias
+                        else table.embedding_dim
+                    ),
+                    data_type_to_sparse_type(table.data_type),
+                    location,
+                )
+                for local_rows, local_cols, table, location in zip(
+                    self._local_rows,
+                    self._local_cols,
+                    config.embedding_tables,
+                    managed,
+                )
+            ],
+            device=device,
+            pooling_mode=PoolingMode.NONE,
+            feature_table_map=self._feature_table_map,
+            row_alignment=self._tbe_row_alignment,
+            uvm_host_mapped=True,  # Use cudaHostAlloc for UVM CACHING to fix imbalance numa memory issue
+            feature_names_per_table=[
+                table.feature_names for table in config.embedding_tables
+            ],
+            **(tbe_fused_params(fused_params) or {}),
         )
         if device is not None:
             self._emb_module.initialize_weights()