@@ -2491,64 +2491,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
         // TODO: add more model-specific info which should prevent loading the session file if not identical
     }
 
-    // write output ids
-    {
-        LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__);
-
-        const auto n_outputs    = this->n_outputs;
-        const auto & output_ids = this->output_ids;
-
-        std::vector<int32_t> w_output_pos;
-
-        w_output_pos.resize(n_outputs);
-
-        // build a more compact representation of the output ids
-        for (size_t i = 0; i < n_batch(); ++i) {
-            // map an output id to a position in the batch
-            int64_t pos = output_ids[i];
-            if (pos >= 0) {
-                GGML_ASSERT(pos < n_outputs);
-                w_output_pos[pos] = i;
-            }
-        }
-
-        io.write(&n_outputs, sizeof(n_outputs));
-
-        if (n_outputs) {
-            io.write(w_output_pos.data(), n_outputs * sizeof(int32_t));
-        }
-    }
-
-    // [TAG_CONTEXT_STATE_LOGITS]
-    // write logits
-    {
-        LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
-
-        const uint64_t logits_size = std::min((uint64_t) this->logits.size, (uint64_t) n_outputs * model.vocab.n_tokens());
-
-        io.write(&logits_size, sizeof(logits_size));
-
-        if (logits_size) {
-            io.write(logits.data, logits_size * sizeof(float));
-        }
-    }
-
-    // write embeddings
-    {
-        LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__);
-
-        const uint64_t embd_size = std::min((uint64_t) this->embd.size, (uint64_t) n_outputs * model.hparams.n_embd);
-
-        io.write(&embd_size, sizeof(embd_size));
-
-        if (embd_size) {
-            io.write(embd.data, embd_size * sizeof(float));
-        }
-    }
-
-    // TODO: handle sampling buffers and samplers state ?
-    // https://github.com/ggml-org/llama.cpp/pull/17004
-
     if (memory != nullptr) {
         LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
         memory->state_write(io);
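Side note (not part of the commit): the removed write-side block inverts output_ids, which maps a batch position to its output row (or -1), into a dense, output-row-indexed array before serializing it. A minimal stand-alone sketch of that pack/unpack round trip, with made-up helper names, could look roughly like this:

// Hypothetical stand-alone helpers mirroring the removed logic; not llama.cpp API.
#include <cassert>
#include <cstdint>
#include <vector>

// Write side: output_ids[i] is the output row produced by batch position i,
// or -1 if that position produced no output. Invert it into a dense array
// indexed by output row, which is the form that gets serialized.
std::vector<int32_t> pack_output_pos(const std::vector<int32_t> & output_ids, int32_t n_outputs) {
    std::vector<int32_t> w_output_pos(n_outputs);
    for (size_t i = 0; i < output_ids.size(); ++i) {
        const int32_t pos = output_ids[i];
        if (pos >= 0) {
            assert(pos < n_outputs);
            w_output_pos[pos] = (int32_t) i; // output row `pos` came from batch position `i`
        }
    }
    return w_output_pos;
}

// Read side: scatter the dense array back into the batch-sized map.
void unpack_output_pos(const std::vector<int32_t> & output_pos, std::vector<int32_t> & output_ids) {
    for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
        output_ids[output_pos[i]] = i; // batch position output_pos[i] maps to output row i
    }
}

Only the dense n_outputs-sized array is stored in the state blob; the sparse batch-sized output_ids map is rebuilt from it on load.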
@@ -2574,70 +2516,6 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
         // TODO: add more info which needs to be identical but which is not verified otherwise
     }
 
-    // read output ids
-    {
-        LLAMA_LOG_DEBUG("%s: - reading output ids\n", __func__);
-
-        auto n_outputs = this->n_outputs;
-        io.read_to(&n_outputs, sizeof(n_outputs));
-
-        if (n_outputs > output_reserve(n_outputs)) {
-            throw std::runtime_error("could not reserve outputs");
-        }
-
-        std::vector<int32_t> output_pos;
-
-        if (n_outputs) {
-            output_pos.resize(n_outputs);
-            io.read_to(output_pos.data(), n_outputs * sizeof(int32_t));
-
-            for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
-                int32_t id = output_pos[i];
-                if ((uint32_t) id >= n_batch()) {
-                    throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch()));
-                }
-                this->output_ids[id] = i;
-            }
-
-            this->n_outputs = n_outputs;
-        }
-    }
-
-    // read logits
-    {
-        LLAMA_LOG_DEBUG("%s: - reading logits\n", __func__);
-
-        uint64_t logits_size;
-        io.read_to(&logits_size, sizeof(logits_size));
-
-        if (this->logits.size < logits_size) {
-            throw std::runtime_error("logits buffer too small");
-        }
-
-        if (logits_size) {
-            io.read_to(this->logits.data, logits_size * sizeof(float));
-        }
-    }
-
-    // read embeddings
-    {
-        LLAMA_LOG_DEBUG("%s: - reading embeddings\n", __func__);
-
-        uint64_t embd_size;
-        io.read_to(&embd_size, sizeof(embd_size));
-
-        if (this->embd.size < embd_size) {
-            throw std::runtime_error("embeddings buffer too small");
-        }
-
-        if (embd_size) {
-            io.read_to(this->embd.data, embd_size * sizeof(float));
-        }
-    }
-
-    // TODO: handle sampling buffers and samplers state ?
-    // https://github.com/ggml-org/llama.cpp/pull/17004
-
     if (memory) {
         LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);
 
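Side note (not part of the commit): the removed logits and embeddings blocks both serialize a size-prefixed float blob. The write side clamps the element count to what the buffer holds (min of buffer size and n_outputs * row width), and the read side validates the stored count against the local buffer before copying. A hedged sketch of that pattern, with stand-in names (float_buf, Writer/Reader playing the role of llama_io_write_i / llama_io_read_i), might look like this:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <stdexcept>

struct float_buf { float * data; size_t size; }; // stand-in for this->logits / this->embd

template <typename Writer>
void write_float_blob(Writer & io, const float_buf & buf, uint64_t n_needed) {
    // clamp to what the buffer actually holds, then store count + payload
    const uint64_t n = std::min((uint64_t) buf.size, n_needed);
    io.write(&n, sizeof(n));
    if (n) {
        io.write(buf.data, n * sizeof(float));
    }
}

template <typename Reader>
void read_float_blob(Reader & io, float_buf & buf) {
    // read the stored count and refuse to overflow the destination buffer
    uint64_t n = 0;
    io.read_to(&n, sizeof(n));
    if (buf.size < n) {
        throw std::runtime_error("buffer too small");
    }
    if (n) {
        io.read_to(buf.data, n * sizeof(float));
    }
}

Because the count is stored alongside the payload, a state file written with fewer outputs can still be loaded as long as the destination buffers are at least that large.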