Commit 24a085f

llama : remove write/read of output ids/logits/embeddings
This commit removes the write/read of output ids, logits and embeddings from the llama context state. Refs: ggml-org#18862 (comment)
1 parent 2cce9fd commit 24a085f
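For reference, the heart of the removed write path was the compaction of output_ids, a per-batch-slot table mapping slot index to output row (-1 for slots that produced no output), into a dense position table that was then serialized. Below is a minimal standalone sketch of that step, with the llama_context members replaced by hypothetical local variables:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    // Hypothetical stand-ins for the context state used by the deleted code:
    // output_ids[i] is the output row produced by batch slot i, or -1 if none.
    const std::vector<int64_t> output_ids = { -1, 0, -1, 2, 1 };
    const int32_t n_outputs = 3;

    // Build the compact representation (mirrors the deleted loop in
    // state_write_data): w_output_pos[row] = batch slot that produced row.
    std::vector<int32_t> w_output_pos(n_outputs);
    for (size_t i = 0; i < output_ids.size(); ++i) {
        const int64_t pos = output_ids[i];
        if (pos >= 0) {
            assert(pos < n_outputs);
            w_output_pos[pos] = (int32_t) i;
        }
    }

    // w_output_pos is now { 1, 4, 3 }: output rows 0, 1, 2 were produced by
    // batch slots 1, 4, 3. This table is what the deleted code wrote out,
    // followed by the raw logits and embeddings buffers.
    return 0;
}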

1 file changed

Lines changed: 0 additions & 122 deletions

File tree

src/llama-context.cpp

@@ -2491,64 +2491,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
         // TODO: add more model-specific info which should prevent loading the session file if not identical
     }

-    // write output ids
-    {
-        LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__);
-
-        const auto n_outputs = this->n_outputs;
-        const auto & output_ids = this->output_ids;
-
-        std::vector<int32_t> w_output_pos;
-
-        w_output_pos.resize(n_outputs);
-
-        // build a more compact representation of the output ids
-        for (size_t i = 0; i < n_batch(); ++i) {
-            // map an output id to a position in the batch
-            int64_t pos = output_ids[i];
-            if (pos >= 0) {
-                GGML_ASSERT(pos < n_outputs);
-                w_output_pos[pos] = i;
-            }
-        }
-
-        io.write(&n_outputs, sizeof(n_outputs));
-
-        if (n_outputs) {
-            io.write(w_output_pos.data(), n_outputs * sizeof(int32_t));
-        }
-    }
-
-    // [TAG_CONTEXT_STATE_LOGITS]
-    // write logits
-    {
-        LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
-
-        const uint64_t logits_size = std::min((uint64_t) this->logits.size, (uint64_t) n_outputs * model.vocab.n_tokens());
-
-        io.write(&logits_size, sizeof(logits_size));
-
-        if (logits_size) {
-            io.write(logits.data, logits_size * sizeof(float));
-        }
-    }
-
-    // write embeddings
-    {
-        LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__);
-
-        const uint64_t embd_size = std::min((uint64_t) this->embd.size, (uint64_t) n_outputs * model.hparams.n_embd);
-
-        io.write(&embd_size, sizeof(embd_size));
-
-        if (embd_size) {
-            io.write(embd.data, embd_size * sizeof(float));
-        }
-    }
-
-    // TODO: handle sampling buffers and samplers state ?
-    // https://github.com/ggml-org/llama.cpp/pull/17004
-
     if (memory != nullptr) {
         LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
         memory->state_write(io);
@@ -2574,70 +2516,6 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
         // TODO: add more info which needs to be identical but which is not verified otherwise
     }

-    // read output ids
-    {
-        LLAMA_LOG_DEBUG("%s: - reading output ids\n", __func__);
-
-        auto n_outputs = this->n_outputs;
-        io.read_to(&n_outputs, sizeof(n_outputs));
-
-        if (n_outputs > output_reserve(n_outputs)) {
-            throw std::runtime_error("could not reserve outputs");
-        }
-
-        std::vector<int32_t> output_pos;
-
-        if (n_outputs) {
-            output_pos.resize(n_outputs);
-            io.read_to(output_pos.data(), n_outputs * sizeof(int32_t));
-
-            for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
-                int32_t id = output_pos[i];
-                if ((uint32_t) id >= n_batch()) {
-                    throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch()));
-                }
-                this->output_ids[id] = i;
-            }
-
-            this->n_outputs = n_outputs;
-        }
-    }
-
-    // read logits
-    {
-        LLAMA_LOG_DEBUG("%s: - reading logits\n", __func__);
-
-        uint64_t logits_size;
-        io.read_to(&logits_size, sizeof(logits_size));
-
-        if (this->logits.size < logits_size) {
-            throw std::runtime_error("logits buffer too small");
-        }
-
-        if (logits_size) {
-            io.read_to(this->logits.data, logits_size * sizeof(float));
-        }
-    }
-
-    // read embeddings
-    {
-        LLAMA_LOG_DEBUG("%s: - reading embeddings\n", __func__);
-
-        uint64_t embd_size;
-        io.read_to(&embd_size, sizeof(embd_size));
-
-        if (this->embd.size < embd_size) {
-            throw std::runtime_error("embeddings buffer too small");
-        }
-
-        if (embd_size) {
-            io.read_to(this->embd.data, embd_size * sizeof(float));
-        }
-    }
-
-    // TODO: handle sampling buffers and samplers state ?
-    // https://github.com/ggml-org/llama.cpp/pull/17004
-
     if (memory) {
         LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);
         memory->state_read(io);
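The deleted read path inverted that table: each stored position was validated against the batch size and written back into output_ids, after which the raw logits and embeddings buffers were read into preallocated storage. A standalone sketch of the restore step, using the same hypothetical variables as the write-side sketch above:

#include <cstdint>
#include <stdexcept>
#include <vector>

int main() {
    const uint32_t n_batch = 5; // hypothetical batch size of the context

    // Compact table as produced by the write-side sketch: row -> batch slot.
    const std::vector<int32_t> output_pos = { 1, 4, 3 };

    // Rebuild output_ids (mirrors the deleted loop in state_read_data):
    // slot -> output row, with -1 for slots that produced no output.
    std::vector<int64_t> output_ids(n_batch, -1);
    for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
        const int32_t id = output_pos[i];
        if ((uint32_t) id >= n_batch) {
            throw std::runtime_error("invalid output id: does not fit in batch");
        }
        output_ids[id] = i;
    }

    // output_ids is now { -1, 0, -1, 2, 1 }, matching the pre-save state.
    return 0;
}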
