[XLA:GPU] Add support for all-to-all to perf table gen.

golechwierowicz · Google-ML-Automation · commit da63f2971676 · 2025-06-05T01:36:10.000-07:00
PiperOrigin-RevId: 767489571
diff --git a/xla/service/gpu/model/collective_interpolator.cc b/xla/service/gpu/model/collective_interpolator.cc
@@ -249,6 +249,51 @@ std::unique_ptr<HloModule> AllGatherModule(
   return module;
 }
 
+// Builds a module holding one all-to-all from `profile`; nullptr on bad shape.
+std::unique_ptr<HloModule> AllToAllModule(
+    const HloInstructionProfile& profile) {
+  const auto& instr = profile.instruction();
+  auto shape = Shape::FromProto(instr.shape());
+  if (!shape.ok()) {
+    VLOG(1) << "Cannot parse shape: " << profile.DebugString();
+    return nullptr;
+  }
+  // CreateAllToAll's last argument is the split dimension (the proto keeps it
+  // in `dimensions`), not use_global_device_ids, which all-to-all lacks.
+  std::optional<int64_t> split_dimension;
+  if (instr.dimensions_size() > 0) split_dimension = instr.dimensions(0);
+  HloComputation::Builder entry_builder("entry");
+  CollectiveDeviceList collective_device_list(IotaReplicaGroupList::FromProto(
+      instr.collective_device_list().iota_replica_group_list()));
+  HloInstruction* p0 = entry_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, *shape, "p0"));
+  entry_builder.AddInstruction(HloInstruction::CreateAllToAll(
+      *shape, {p0}, collective_device_list, instr.constrain_layout(),
+      instr.channel_id(), split_dimension));
+  HloModuleConfig config;
+  auto module = std::make_unique<HloModule>("m", config);
+  module->AddEntryComputation(entry_builder.Build());
+  return module;
+}
+
+// Returns `instr`'s device list when it already carries an iota replica group
+// list; otherwise builds one from GetReplicaGroupCountAndSize, or nullopt.
+std::optional<CollectiveDeviceList> CanonicalDeviceList(
+    const HloCollectiveInstruction& instr) {
+  const auto& devices = instr.device_list();
+  if (devices.iota_replica_group_list().has_value()) return devices;
+
+  auto count_and_size = GetReplicaGroupCountAndSize(&instr);
+  if (count_and_size.ok() && count_and_size->has_value()) {
+    const auto& [group_count, group_size] = **count_and_size;
+    return CollectiveDeviceList(IotaReplicaGroupList(group_count, group_size));
+  }
+  VLOG(1) << "Failed to determine a number of devices participating in "
+             "the collective: "
+          << instr.ToString();
+  return std::nullopt;
+}
+
 HloOpcode AsyncToSyncOpcode(const HloCollectiveInstruction& instr) {
   HloOpcode opcode = instr.opcode();
   switch (opcode) {
@@ -294,6 +339,17 @@ int64_t GetBytesTransferred(const HloInstruction& instr,
   return adhoc.BytesTransferred(instr);
 }
 
+// Reports whether `opcode` is all-reduce, all-reduce-start or reduce-scatter.
+// Within this file the answer decides whether the element type becomes part
+// of the exact interpolator key.
+bool RequiresAccumulation(HloOpcode opcode) {
+  const bool is_all_reduce = opcode == HloOpcode::kAllReduceStart ||
+                             opcode == HloOpcode::kAllReduce;
+  const bool is_reduce_scatter = opcode == HloOpcode::kReduceScatter;
+
+  return is_all_reduce || is_reduce_scatter;
+}
+
 absl::StatusOr<std::unique_ptr<
     absl::flat_hash_map<CollectiveInterpolator::ExactInterpolatorKey,
                         std::unique_ptr<InterpolatorBase<int64_t, 1>>>>>
@@ -311,7 +367,9 @@ ConstructExactInterpolators(int num_devices_per_host,
     CollectiveInterpolator::ExactInterpolatorKey exact_key{
         /*opcode=*/spec.opcode,
         /*device_list=*/spec.device_list,
-        /*data_type=*/spec.data_type,
+        /*data_type=*/
+        RequiresAccumulation(spec.opcode) ? std::make_optional(spec.data_type)
+                                          : std::nullopt,
     };
     auto exact_it = exact_interpolators->find(exact_key);
     if (exact_it == exact_interpolators->end()) {
@@ -429,17 +487,6 @@ ConstructFallbackNNInterpolators(int num_devices_per_host,
   return fallback_interpolators;
 }
 
-bool RequiresAccumulation(const HloCollectiveInstruction& instr) {
-  switch (instr.opcode()) {
-    case HloOpcode::kAllReduceStart:
-    case HloOpcode::kAllReduce:
-    case HloOpcode::kReduceScatter:
-      return true;
-    default:
-      return false;
-  }
-}
-
 }  // namespace
 
 // We can get rid of `analysis` being nullptr once we get rid of stats
@@ -488,21 +535,23 @@ std::optional<absl::Duration> CollectiveInterpolator::EstimatedRuntime(
   int64_t bytes_transferred =
       GetBytesTransferred(instr, device_info_, analysis_);
 
-  ExactInterpolatorKey exact_key{
-      /*opcode=*/instr.opcode(),
-      /*device_list=*/instr.device_list(),
-      /*data_type=*/
-      RequiresAccumulation(instr)
-          ? std::make_optional(instr.shape().element_type())
-          : std::nullopt,
-  };
+  std::optional<CollectiveDeviceList> devices = CanonicalDeviceList(instr);
+  if (devices.has_value()) {
+    ExactInterpolatorKey exact_key{
+        /*opcode=*/instr.opcode(),
+        /*device_list=*/*devices,
+        /*data_type=*/
+        RequiresAccumulation(instr.opcode())
+            ? std::make_optional(instr.shape().element_type())
+            : std::nullopt,
+    };
 
-  if (exact_interpolators_->contains(exact_key)) {
-    std::array<int64_t, 1> point({bytes_transferred});
-    return absl::Seconds(1.0 * bytes_transferred /
-                         exact_interpolators_->at(exact_key)->Eval(point));
+    if (exact_interpolators_->contains(exact_key)) {
+      std::array<int64_t, 1> point({bytes_transferred});
+      return absl::Seconds(1.0 * bytes_transferred /
+                           exact_interpolators_->at(exact_key)->Eval(point));
+    }
   }
-
   // Fallback interpolation.
   auto comm = CommunicationType(num_devices_per_host_, instr,
                                 device_info_.gpu_compute_capability());
@@ -537,6 +586,8 @@ std::optional<absl::Duration> CollectiveInterpolator::EstimatedRuntime(
     case HloOpcode::kAllGather:
     case HloOpcode::kAllGatherStart:
       return AllGatherModule(profile);
+    case HloOpcode::kAllToAll:
+      return AllToAllModule(profile);
     default:
       LOG(FATAL) << "Unsupported profile instruction: "
                  << profile.DebugString();
diff --git a/xla/service/gpu/model/collective_interpolator_test.cc b/xla/service/gpu/model/collective_interpolator_test.cc
@@ -101,6 +101,7 @@ class CollectiveInterpolationTest : public TestWithParam<ParametrizedTestCase> {
     switch (opcode) {
       case HloOpcode::kAllReduce:
       case HloOpcode::kAllReduceStart:
+      case HloOpcode::kAllToAll:
         device_list = CollectiveDeviceList(CommToDeviceList(comm, num_hosts));
         shape = ShapeUtil::MakeShape(PrimitiveType::F32, {tensor_size / 4});
         break;
@@ -419,6 +420,27 @@ class CollectiveInterpolationTest : public TestWithParam<ParametrizedTestCase> {
           /*num_nodes=*/4,
           /*network_througput_bytes=*/2 * 2048,
       },
+      {
+          /*opcode=*/HloOpcode::kAllToAll,
+          /*comm=*/GPUCommunicationType::SINGLE_HOST,
+          /*tensor_size=*/1024,
+          /*num_nodes=*/1,
+          /*network_througput_bytes=*/1024,
+      },
+      {
+          /*opcode=*/HloOpcode::kAllToAll,
+          /*comm=*/GPUCommunicationType::RAIL_ALIGNED,
+          /*tensor_size=*/1024,
+          /*num_nodes=*/2,
+          /*network_througput_bytes=*/2048,
+      },
+      {
+          /*opcode=*/HloOpcode::kAllToAll,
+          /*comm=*/GPUCommunicationType::NON_RAIL_ALIGNED,
+          /*tensor_size=*/1024,
+          /*num_nodes=*/2,
+          /*network_througput_bytes=*/4096,
+      },
   };
 };
 
@@ -960,6 +982,39 @@ INSTANTIATE_TEST_SUITE_P(
             },
             /*expected_duration=*/absl::Milliseconds(625),
         },
+        {
+            /*test_name=*/"A2A_rail_aligned_exact_match",
+            {
+                /*opcode=*/HloOpcode::kAllToAll,
+                /*comm=*/
+                GPUCommunicationType::RAIL_ALIGNED,
+                /*tensor_size=*/1024,
+                /*num_nodes=*/2,
+            },
+            /*expected_duration=*/absl::Milliseconds(500),
+        },
+        {
+            /*test_name=*/"A2A_nonrail_aligned_exact_match",
+            {
+                /*opcode=*/HloOpcode::kAllToAll,
+                /*comm=*/
+                GPUCommunicationType::NON_RAIL_ALIGNED,
+                /*tensor_size=*/1024,
+                /*num_nodes=*/2,
+            },
+            /*expected_duration=*/absl::Milliseconds(250),
+        },
+        {
+            /*test_name=*/"A2A_single_host_exact_match",
+            {
+                /*opcode=*/HloOpcode::kAllToAll,
+                /*comm=*/
+                GPUCommunicationType::SINGLE_HOST,
+                /*tensor_size=*/1024,
+                /*num_nodes=*/1,
+            },
+            /*expected_duration=*/absl::Seconds(1),
+        },
     }),
     [](const TestParamInfo<CollectiveInterpolationTest::ParamType>& info) {
       return info.param.test_name;
diff --git a/xla/service/gpu/model/gpu_hlo_cost_analysis.cc b/xla/service/gpu/model/gpu_hlo_cost_analysis.cc
@@ -533,13 +533,14 @@ absl::Status GpuHloCostAnalysis::HandleAllGatherStart(
 
 absl::Status GpuHloCostAnalysis::HandleAsyncStart(const HloInstruction* hlo) {
   auto* async_start = DynCast<HloAsyncStartInstruction>(hlo);
-  if (async_start->async_wrapped_opcode() != HloOpcode::kReduceScatter) {
-    VLOG(2) << "Only Reduce Scatter is supported.";
-    return absl::OkStatus();
-  }
-
   TF_RETURN_IF_ERROR(hlo->async_wrapped_instruction()->Accept(this));
-  return HandleReduceScatter(async_start->async_wrapped_instruction());
+  if (async_start->async_wrapped_opcode() == HloOpcode::kReduceScatter) {
+    return HandleReduceScatter(async_start->async_wrapped_instruction());
+  }
+  if (async_start->async_wrapped_opcode() == HloOpcode::kAllToAll) {
+    return HandleAllToAll(async_start->async_wrapped_instruction());
+  }
+  return absl::OkStatus();  // Other wrapped opcodes get no extra handling.
 }
 
 absl::Status GpuHloCostAnalysis::HandleReduceScatter(
@@ -563,6 +564,12 @@ absl::Status GpuHloCostAnalysis::HandleReduceScatter(
   return absl::OkStatus();
 }
 
+absl::Status GpuHloCostAnalysis::HandleAllToAll(const HloInstruction* hlo) {
+  current_properties_[kCollBytesTransferred] =  // Size of the result shape.
+      ShapeSize(hlo->shape(), options_.shape_size);
+  return absl::OkStatus();
+}
+
 absl::Status GpuHloCostAnalysis::HandleElementwiseOp(
     const HloInstruction* hlo) {
   current_properties_[kFlopsKey] = GetFlopsForElementwiseOp(hlo);
diff --git a/xla/service/gpu/model/gpu_hlo_cost_analysis.h b/xla/service/gpu/model/gpu_hlo_cost_analysis.h
@@ -78,6 +78,7 @@ class GpuHloCostAnalysis : public HloCostAnalysis {
   absl::Status HandleAllGatherStart(const HloInstruction* hlo) override;
   absl::Status HandleAsyncStart(const HloInstruction* hlo) override;
   absl::Status HandleReduceScatter(const HloInstruction* hlo) override;
+  absl::Status HandleAllToAll(const HloInstruction* hlo) override;
 
   // Estimate the total size of IR accounting for both duplication
   // of producer code by consumer and the total number of basic blocks.
diff --git a/xla/tools/collective_perf_table_gen.cc b/xla/tools/collective_perf_table_gen.cc
@@ -121,15 +121,14 @@ int64_t GetInputDim(CollectivePerfTableGen::CollectiveType type,
   CHECK_EQ(tensor_size_bytes % kBytesPerElem, 0);
   switch (type) {
     case CollectivePerfTableGen::CollectiveType::ALL_REDUCE:
+    case CollectivePerfTableGen::CollectiveType::REDUCE_SCATTER:
+    case CollectivePerfTableGen::CollectiveType::ALL_TO_ALL:
       dim_size = tensor_size_bytes / kBytesPerElem;
       break;
     case CollectivePerfTableGen::CollectiveType::ALL_GATHER:
       dim_size = tensor_size_bytes /
                  (kBytesPerElem * replica_groups.num_devices_per_group());
       break;
-    case CollectivePerfTableGen::CollectiveType::REDUCE_SCATTER:
-      dim_size = tensor_size_bytes / kBytesPerElem;
-      break;
     default:
       LOG(FATAL) << "Unsupported collective type.";
   }
@@ -144,6 +143,7 @@ int64_t GetOutputDim(CollectivePerfTableGen::CollectiveType type,
   switch (type) {
     case CollectivePerfTableGen::CollectiveType::ALL_REDUCE:
     case CollectivePerfTableGen::CollectiveType::ALL_GATHER:
+    case CollectivePerfTableGen::CollectiveType::ALL_TO_ALL:
       dim_size = tensor_size_bytes / kBytesPerElem;
       break;
     case CollectivePerfTableGen::CollectiveType::REDUCE_SCATTER:
@@ -215,6 +215,19 @@ std::string GetHlo(CollectivePerfTableGen::CollectiveType type,
                              "f32", input_dim, output_dim,
                              replica_groups.ToString());
       break;
+    case CollectivePerfTableGen::CollectiveType::ALL_TO_ALL:
+      hlo = absl::Substitute(R"(
+        HloModule m
+
+        ENTRY e {
+          p0 = $0[$1] parameter(0)
+          ROOT _ = $0[$2] all-to-all(p0), replica_groups=$3, channel_id=1,
+          dimensions={0}
+        }
+      )",
+                             "f32", input_dim, output_dim,
+                             replica_groups.ToString());
+      break;
     default:
       LOG(FATAL) << "Unsupported collective type.";
   }
diff --git a/xla/tools/collective_perf_table_gen.h b/xla/tools/collective_perf_table_gen.h
@@ -53,6 +53,7 @@ class CollectivePerfTableGen {
     ALL_REDUCE,
     ALL_GATHER,
     REDUCE_SCATTER,
+    ALL_TO_ALL,
   };
 
   struct Config {
@@ -64,6 +65,7 @@ class CollectivePerfTableGen {
         CollectiveType::ALL_REDUCE,
         CollectiveType::ALL_GATHER,
         CollectiveType::REDUCE_SCATTER,
+        CollectiveType::ALL_TO_ALL,
     };
     std::vector<std::string> replica_groups_list;
 
diff --git a/xla/tools/collective_perf_table_gen_main.cc b/xla/tools/collective_perf_table_gen_main.cc
@@ -106,6 +106,10 @@ std::vector<CollectivePerfTableGen::CollectiveType> ParseCollectives(
       types.push_back(CollectivePerfTableGen::CollectiveType::REDUCE_SCATTER);
       continue;
     }
+    if (token == "ALL_TO_ALL") {
+      types.push_back(CollectivePerfTableGen::CollectiveType::ALL_TO_ALL);
+      continue;
+    }
   }
   CHECK_GT(types.size(), 0);
   return types;
@@ -160,10 +164,12 @@ int main(int argc, char* argv[]) {
   int32_t num_nodes = 1;
   int32_t num_devices_per_host = 8;
   int32_t task_id = 0;
-  std::string collectives_unparsed = "ALL_REDUCE,ALL_GATHER,REDUCE_SCATTER";
+  std::string collectives_unparsed =
+      "ALL_REDUCE,ALL_GATHER,REDUCE_SCATTER,ALL_TO_ALL";
   std::string tensor_size_bytes_spec_unparsed =
       "start=1024,stop=2147483648,factor=2";
-  std::string collective_devices_spec_unparsed;
+  std::string collective_devices_spec_unparsed =
+      "[1,8]<=[8];[2,4]<=[8];[4,2]<=[8]";
   std::string coordinator_address = std::string(kDefaultCoordinatorAddress);
   std::string output = std::string(CollectivePerfTableGen::Config::kStdout);
   std::string merge_path;
@@ -179,7 +185,8 @@ int main(int argc, char* argv[]) {
                 "across the distributed system you run it on."),
       tsl::Flag("collectives", &collectives_unparsed,
                 "Comma separated list of collectives to generate perf table "
-                "for. Allowed values: ALL_REDUCE, ALL_GATHER, REDUCE_SCATTER."),
+                "for. Allowed values: ALL_REDUCE, ALL_GATHER, REDUCE_SCATTER, "
+                "ALL_TO_ALL."),
       tsl::Flag("tensor_size_bytes_spec", &tensor_size_bytes_spec_unparsed,
                 "Spec for a search sweep over transfer sizes. Format example: "
                 "start=1,stop=8,factor=2 generates {1,2,4,8}."),
diff --git a/xla/tools/collective_perf_table_gen_test.cc b/xla/tools/collective_perf_table_gen_test.cc
@@ -85,6 +85,7 @@ TEST_F(CollectivePerfTableGenTest, FactorStepGeneratesConfigs) {
       CollectivePerfTableGen::CollectiveType::ALL_REDUCE,
       CollectivePerfTableGen::CollectiveType::ALL_GATHER,
       CollectivePerfTableGen::CollectiveType::REDUCE_SCATTER,
+      CollectivePerfTableGen::CollectiveType::ALL_TO_ALL,
   };
   cfg_.replica_groups_list.emplace_back("[1,1]<=[1]");
   CollectivePerfTableGen::StepSpec spec{
@@ -100,7 +101,7 @@ TEST_F(CollectivePerfTableGenTest, FactorStepGeneratesConfigs) {
 
   DeviceHloInstructionProfiles profiles = gen->ComputeTable();
   EXPECT_EQ(profiles.entries_size(), 1);
-  EXPECT_EQ(profiles.entries().begin()->second.entries_size(), 12);
+  EXPECT_EQ(profiles.entries().begin()->second.entries_size(), 16);
 }
 
 TEST_F(CollectivePerfTableGenTest, HappyPathWorks) {