@@ -49,6 +49,7 @@ KVCacheEventManager::KVCacheEventManager(size_t maxKVEventEntries, std::optional
     }
     // mWorkerThread = std::thread(std::bind(&KVCacheEventManager::worker, this));
     mWorkerThread = std::thread([this]() { this->worker(); });
+    mExchangeAttentionDpThread = std::thread([this]() { this->exchangeAttentionDpThread(); });
 };
 
@@ -57,6 +58,7 @@ KVCacheEventManager::~KVCacheEventManager()
     mPendingEmptyCV.notify_all();
     mEmptyCV.notify_all();
     mWorkerThread.join();
+    mExchangeAttentionDpThread.join();
 }
 
 void KVCacheEventManager::enqueueCreatedEvent(
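
Note on shutdown: the destructor now joins `mExchangeAttentionDpThread`, but the `exchangeAttentionDpThread()` loop added below runs `while (true)` and never re-checks a stop flag, so the join would block. Below is a minimal standalone sketch of the pattern the destructor depends on (an atomic stop flag observed by the background loop); the `EventExchanger` type and its members are illustrative only, not part of this change.

```cpp
#include <atomic>
#include <chrono>
#include <thread>

// Standalone sketch: a background loop must observe a stop flag,
// otherwise the destructor's join() never returns.
struct EventExchanger
{
    EventExchanger() { mThread = std::thread([this] { run(); }); }

    ~EventExchanger()
    {
        mRun = false;   // signal the loop to exit
        mThread.join(); // returns once run() observes mRun == false
    }

    void run()
    {
        while (mRun)
        {
            // ... one round of send/gather work would go here ...
            std::this_thread::sleep_for(std::chrono::milliseconds(5));
        }
    }

    std::atomic<bool> mRun{true};
    std::thread mThread;
};

int main()
{
    EventExchanger exchanger; // destructor exercises the clean shutdown path
}
```
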
@@ -129,82 +131,103 @@ std::deque<tle::KVCacheEvent> KVCacheEventManager::getEvents(std::optional<std::
     return std::exchange(mEvents, {});
 }
 
-std::vector<char> KVCacheEventManager::serializeEventQueue(std::deque<tle::KVCacheEvent> const& eventQueue)
-{
-    std::vector<char> buffer;
-    for (auto const& event : eventQueue)
-    {
-        auto serialized = event.serialize();
-        buffer.insert(buffer.end(), serialized.begin(), serialized.end());
-    }
-    return buffer;
-}
-
 void KVCacheEventManager::flush()
 {
     auto eventQueue = std::exchange(mEventQueue, {});
-
-    // In case of attention DP, we need to gather the events on rank 0
-    if (mAttentionDpSize && mAttentionDpSize.value() > 1)
-    {
-        auto packed = serializeEventQueue(eventQueue);
-
-        std::vector<std::vector<char>> rankEventQueues(mAttentionDpSize.value());
-        serializedRankEventQueues[mAttentionDpRank.value()] = std::move(packed);
-
-        // Use COMM_SESSION to fill serializedRankEventQueues on rank 0
-
-        // Deserialize the events
-        eventQueue.clear();
-        if (mAttentionDpRank == 0)
-        {
-            for (auto const& serializedRankEventQueue : serializedRankEventQueues)
-            {
-                auto rankEventQueue = deserializeEventQueue(serializedRankEventQueue);
-                eventQueue.insert(eventQueue.end(), rankEventQueue.begin(), rankEventQueue.end());
-            }
-        }
-    }
-
     std::unique_lock<std::mutex> lck(mPendingEventsMutex);
     mPendingEvents.push_back(std::move(eventQueue));
     // If we have events, we need to notify the worker thread to process them
     mPendingEmptyCV.notify_one();
 }
 
-void KVCacheEventManager::worker()
+void KVCacheEventManager::exchangeAttentionDpThread()
 {
+    int32_t pollPeriodMs = 5;
     while (true)
     {
-        std::deque<tle::KVCacheEvent> events;
+        // If we are not rank 0, send events asynchronously
+        if (mAttentionDpRank.value() != 0)
         {
-            std::unique_lock<std::mutex> pendingLock(mPendingEventsMutex);
-            mPendingEmptyCV.wait(pendingLock, [this] { return !mPendingEvents.empty() || !mRun; });
-            if (!mRun)
+            std::vector<char> serializedEvents;
             {
-                return;
+                std::unique_lock<std::mutex> lck(mEventsMutex);
+                serializedEvents = Serialization::serialize(mEvents);
+                mEvents.clear();
             }
-            events = mPendingEvents.front();
-            mPendingEvents.pop_front();
+            uint64_t vecSize = serializedEvents.size();
+            COMM_SESSION.send(&vecSize, 1, mpi::MpiType::kUINT64, 0, mpi::MpiTag::kKVCacheEventSize);
+            COMM_SESSION.send(
+                serializedEvents.data(), serializedEvents.size(), mpi::MpiType::kCHAR, 0, mpi::MpiTag::kKVCacheEvent);
         }
+        else
+        {
+            TLLM_CHECK(mAttentionDpSize.has_value());
+            // Loop until we have received events from all other ranks
+            int32_t numRecvs = 0;
+            while (numRecvs < mAttentionDpSize.value() - 1)
+            {
+                MPI_Status probeStatus;
+                if (COMM_SESSION.iprobe(MPI_ANY_SOURCE, mpi::MpiTag::kKVCacheEvent, &probeStatus))
+                {
+                    uint64_t vecSize;
+                    COMM_SESSION.recv(
+                        &vecSize, 1, mpi::MpiType::kUINT64, probeStatus.MPI_SOURCE, mpi::MpiTag::kKVCacheEventSize);
+
+                    std::vector<char> serializedEvents(vecSize);
+                    COMM_SESSION.recv(serializedEvents.data(), vecSize, mpi::MpiType::kCHAR, probeStatus.MPI_SOURCE,
+                        mpi::MpiTag::kKVCacheEvent);
+
+                    // Deserialize the events and add them to the local queue
+                    auto rankEvents = Serialization::deserializeKVCacheEvents(serializedEvents);
+                    {
+                        std::unique_lock<std::mutex> lck(mEventsMutex);
+                        mEvents.insert(mEvents.end(), rankEvents.begin(), rankEvents.end());
+                        mEmptyCV.notify_one();
+                    }
+                    numRecvs++;
+                }
+            }
+        }
+        std::this_thread::sleep_for(std::chrono::milliseconds(pollPeriodMs));
+    }
+}
+
-        std::unique_lock<std::mutex> lck(mEventsMutex);
-
-        SizeType32 elementsToRemove = mEvents.size() + events.size() - mMaxSize;
+void KVCacheEventManager::worker()
+{
-        // First, take elements from mEvents since they are the oldest.
-        if (elementsToRemove > 0)
+    while (true)
     {
-            SizeType32 numRemoved = std::min(static_cast<SizeType32>(mEvents.size()), elementsToRemove);
-            mEvents.erase(mEvents.begin(), mEvents.begin() + numRemoved);
-            elementsToRemove -= numRemoved;
-            TLLM_LOG_WARNING("The event queue has reached the max size of %d. Events have been discarded.", mMaxSize);
-        }
+        std::deque<tle::KVCacheEvent> events;
+        {
+            std::unique_lock<std::mutex> pendingLock(mPendingEventsMutex);
+            mPendingEmptyCV.wait(pendingLock, [this] { return !mPendingEvents.empty() || !mRun; });
+            if (!mRun)
+            {
+                return;
+            }
+            events = mPendingEvents.front();
+            mPendingEvents.pop_front();
+        }
 
-        // If there's still too many events, take from the front of the events queue.
-        mEvents.insert(mEvents.end(), events.begin() + std::max(0, elementsToRemove), events.end());
-        mEmptyCV.notify_one();
+        std::unique_lock<std::mutex> lck(mEventsMutex);
+
+        SizeType32 elementsToRemove = mEvents.size() + events.size() - mMaxSize;
+
+        // First, take elements from mEvents since they are the oldest.
+        if (elementsToRemove > 0)
+        {
+            SizeType32 numRemoved = std::min(static_cast<SizeType32>(mEvents.size()), elementsToRemove);
+            mEvents.erase(mEvents.begin(), mEvents.begin() + numRemoved);
+            elementsToRemove -= numRemoved;
+            TLLM_LOG_WARNING(
+                "The event queue has reached the max size of %d. Events have been discarded.", mMaxSize);
+        }
+
+        // If there's still too many events, take from the front of the events queue.
+        mEvents.insert(mEvents.end(), events.begin() + std::max(0, elementsToRemove), events.end());
+
+        // Notify the empty condition variable to wake up any waiting threads
+        mEmptyCV.notify_one();
+    }
 }
-}
 
 } // namespace tensorrt_llm::batch_manager::kv_cache_manager
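
For reference, the gather protocol introduced above is a two-message exchange per rank: each non-zero attention-DP rank sends a `uint64_t` size followed by the serialized event payload, and rank 0 probes for pending messages and receives both parts from whichever source is ready. The standalone sketch below shows the same pattern with plain MPI calls rather than TensorRT-LLM's `COMM_SESSION` wrapper; the tag values, payload contents, and the blocking `MPI_Probe` (instead of the diff's `iprobe` polling loop with a sleep) are simplifications assumed for illustration.

```cpp
#include <mpi.h>
#include <cstdint>
#include <string>
#include <vector>

constexpr int kTagSize = 100;    // stands in for MpiTag::kKVCacheEventSize
constexpr int kTagPayload = 101; // stands in for MpiTag::kKVCacheEvent

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank = 0, worldSize = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &worldSize);

    if (rank != 0)
    {
        // "Serialize" some per-rank events; here just a string for illustration.
        std::string payload = "events from rank " + std::to_string(rank);
        uint64_t size = payload.size();
        MPI_Send(&size, 1, MPI_UINT64_T, 0, kTagSize, MPI_COMM_WORLD);
        MPI_Send(payload.data(), static_cast<int>(size), MPI_CHAR, 0, kTagPayload, MPI_COMM_WORLD);
    }
    else
    {
        // Gather one round of events from every other rank, in arrival order.
        for (int received = 0; received < worldSize - 1; ++received)
        {
            MPI_Status status;
            MPI_Probe(MPI_ANY_SOURCE, kTagSize, MPI_COMM_WORLD, &status);

            uint64_t size = 0;
            MPI_Recv(&size, 1, MPI_UINT64_T, status.MPI_SOURCE, kTagSize, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

            std::vector<char> buffer(size);
            MPI_Recv(buffer.data(), static_cast<int>(size), MPI_CHAR, status.MPI_SOURCE, kTagPayload,
                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            // A real implementation would deserialize buffer into KVCacheEvent objects here.
        }
    }

    MPI_Finalize();
    return 0;
}
```

Sending the size first lets the receiver allocate an exactly sized buffer before posting the payload receive, which is why the exchange uses two tags per rank.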