From e42839382e33c07f7e254ff03bbaa918f4cd9ad3 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 22 Dec 2024 23:32:43 +0200
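Subject: [PATCH] examples : fix model/context ownership via llama-cpp.h smart pointers

common_init_result now holds the model, context and LoRA adapters in
llama_model_ptr / llama_context_ptr / llama_lora_adapter_ptr, so the
examples borrow raw pointers with .get() and drop their manual
llama_free() / llama_free_model() calls.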

ggml-ci
---
 common/common.cpp                             | 10 ++---
 common/common.h                               | 17 ++------
 .../cvector-generator/cvector-generator.cpp   |  7 ++-
 examples/embedding/embedding.cpp              |  7 ++-
 examples/eval-callback/eval-callback.cpp      |  8 ++--
 examples/imatrix/imatrix.cpp                  | 11 +++--
 examples/infill/infill.cpp                    |  7 +--
 examples/lookahead/lookahead.cpp              |  7 +--
 examples/lookup/lookup-create.cpp             | 13 ++----
 examples/lookup/lookup-stats.cpp              | 10 ++---
 examples/lookup/lookup.cpp                    |  7 +--
 examples/main/main.cpp                        | 11 ++---
 examples/parallel/parallel.cpp                |  7 +--
 examples/perplexity/perplexity.cpp            |  8 ++--
 examples/retrieval/retrieval.cpp              |  6 +--
 examples/save-load-state/save-load-state.cpp  | 29 ++-----------
 examples/server/server.cpp                    | 43 ++++++-------------
 .../speculative-simple/speculative-simple.cpp | 16 +++----
 examples/speculative/speculative.cpp          | 16 +++----
 examples/tts/tts.cpp                          | 16 +++----
 include/llama-cpp.h                           |  5 +++
 21 files changed, 87 insertions(+), 174 deletions(-)
diff --git a/common/common.cpp b/common/common.cpp
index 20be92911..c10dcf89d 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -922,14 +922,14 @@ struct common_init_result common_init_from_params(common_params & params) {
         common_lora_adapter_container loaded_la;
         loaded_la.path = la.path;
         loaded_la.scale = la.scale;
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        loaded_la.adapter.reset(llama_lora_adapter_init(model, la.path.c_str()));
         if (loaded_la.adapter == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
         }
-        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+        iparams.lora_adapters.emplace_back(std::move(loaded_la)); // move into the list of loaded adapters
     }
     if (!params.lora_init_without_apply) {
         common_lora_adapters_apply(lctx, iparams.lora_adapters);
@@ -993,8 +993,8 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_perf_context_reset(lctx);
     }
 
-    iparams.model   = model;
-    iparams.context = lctx;
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);
 
     return iparams;
 }
@@ -1003,7 +1003,7 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
     llama_lora_adapter_clear(ctx);
     for (auto & la : lora_adapters) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+            llama_lora_adapter_set(ctx, la.adapter.get(), la.scale);
         }
     }
 }
diff --git a/common/common.h b/common/common.h
index fb5c56d88..fee4c264e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -2,7 +2,7 @@
 
 #pragma once
 
-#include "llama.h"
+#include "llama-cpp.h"
 
 #include <string>
 #include <vector>
@@ -30,7 +30,7 @@ struct common_lora_adapter_info {
 };
 
 struct common_lora_adapter_container : common_lora_adapter_info {
-    struct llama_lora_adapter * adapter;
+    llama_lora_adapter_ptr adapter;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -479,19 +479,10 @@ std::string fs_get_cache_file(const std::string & filename);
 //
 
 struct common_init_result {
-    struct llama_model   * model   = nullptr;
-    struct llama_context * context = nullptr;
+    llama_model_ptr   model;
+    llama_context_ptr context;
 
     std::vector<common_lora_adapter_container> lora_adapters;
-
-    ~common_init_result() {
-        llama_free(context);
-        llama_free_model(model);
-
-        for (auto & lora_adapter : lora_adapters) {
-            llama_lora_adapter_free(lora_adapter.adapter);
-        }
-    }
 };
 
 struct common_init_result     common_init_from_params(common_params & params);
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
index d1731bba6..7c9f50228 100644
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -415,12 +415,13 @@ int main(int argc, char ** argv) {
     // load the model to get hparams
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
 
     // int n_ctx = llama_n_ctx(ctx);
     int n_layers = llama_n_layer(model);
     int n_embd = llama_n_embd(model);
+
     // get model hint param (a.k.a model arch name)
     char model_hint[128];
     llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
@@ -474,8 +475,6 @@ int main(int argc, char ** argv) {
 
     // done with the model, we can now free it to make gain some memory
     printf("Done evaluate prompts, unload model...\n");
-    llama_free(ctx);
-    llama_free_model(model);
 
     bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
 
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 3f18fc6a7..27f75cb77 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -97,8 +97,9 @@ int main(int argc, char ** argv) {
     // load the model
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
@@ -316,8 +317,6 @@ int main(int argc, char ** argv) {
 
     // clean up
     llama_batch_free(batch);
-    llama_free(ctx);
-    llama_free_model(model);
     llama_backend_free();
 
     return 0;
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index c08e3e5f6..2111c3cda 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -162,8 +162,9 @@ int main(int argc, char ** argv) {
     // init
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == nullptr || ctx == nullptr) {
         LOG_ERR("%s : failed to init\n", __func__);
         return 1;
@@ -184,9 +185,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     llama_perf_context_print(ctx);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     return 0;
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 45206f4a7..588114ecd 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -430,9 +430,10 @@ static void process_logits(
 
 static bool compute_imatrix(llama_context * ctx, const common_params & params) {
     const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
     const int n_ctx = llama_n_ctx(ctx);
 
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
+
     auto tim1 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenizing the input ..\n", __func__);
 
@@ -618,8 +619,9 @@ int main(int argc, char ** argv) {
     // init
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == nullptr || ctx == nullptr) {
         LOG_ERR("%s : failed to init\n", __func__);
         return 1;
@@ -655,9 +657,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     llama_perf_context_print(ctx);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     return 0;
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index ef7008957..d460be314 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -131,8 +131,8 @@ int main(int argc, char ** argv) {
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
     common_init_result llama_init = common_init_from_params(params);
 
-    model = llama_init.model;
-    ctx = llama_init.context;
+    model = llama_init.model.get();
+    ctx = llama_init.context.get();
 
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
@@ -581,9 +581,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     common_perf_print(ctx, smpl);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     common_sampler_free(smpl);
     llama_backend_free();
 
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index 8d0ef8b3d..e016618e3 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -58,8 +58,8 @@ int main(int argc, char ** argv) {
     // load the target model
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
 
     // Tokenize the prompt
     std::vector<llama_token> inp;
@@ -474,9 +474,6 @@ int main(int argc, char ** argv) {
 
     llama_batch_free(batch);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     LOG("\n\n");
diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp
index 7ced0aa97..3da45ed9e 100644
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -1,14 +1,9 @@
 #include "arg.h"
 #include "common.h"
 #include "ngram-cache.h"
-#include "ggml.h"
 #include "llama.h"
 
-#include <cstdint>
-#include <fstream>
-#include <iostream>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 int main(int argc, char ** argv){
@@ -25,16 +20,16 @@ int main(int argc, char ** argv){
     // load the model
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model_ptr & model = llama_init.model;
+    llama_context_ptr & ctx = llama_init.context;
+
     GGML_ASSERT(model != nullptr);
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = common_tokenize(ctx, params.prompt, true, true);
+    inp = common_tokenize(ctx.get(), params.prompt, true, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);
 
-
     common_ngram_cache ngram_cache;
     common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
     fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
index dff07c075..fcb289abe 100644
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -30,12 +30,11 @@ int main(int argc, char ** argv){
     // load the model
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_context_ptr & ctx = llama_init.context;
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = common_tokenize(ctx, params.prompt, true, true);
+    inp = common_tokenize(ctx.get(), params.prompt, true, true);
 
     common_ngram_cache ngram_cache_context;
     common_ngram_cache ngram_cache_dynamic;
@@ -66,7 +65,7 @@ int main(int argc, char ** argv){
     }
 
     const int n_input = inp.size();
-    const int n_ctx = llama_n_ctx(ctx);
+    const int n_ctx = llama_n_ctx(ctx.get());
 
     int n_drafted = 0;
     int n_accept  = 0;
@@ -150,9 +149,6 @@ int main(int argc, char ** argv){
     LOG_INF("n_accept     = %d\n", n_accept);
     LOG_INF("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     LOG("\n\n");
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 4d92bb238..0d68b80b9 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -33,8 +33,8 @@ int main(int argc, char ** argv){
     // load the model
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
 
     // tokenize the prompt
     std::vector<llama_token> inp;
@@ -243,9 +243,6 @@ int main(int argc, char ** argv){
 
     llama_batch_free(batch_tgt);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     LOG("\n\n");
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index d0c28f317..b5e477f5b 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -145,18 +145,18 @@ int main(int argc, char ** argv) {
     llama_context * ctx = nullptr;
     common_sampler * smpl = nullptr;
 
-    std::vector<common_chat_msg> chat_msgs;
-
     g_model = &model;
     g_ctx = &ctx;
     g_smpl = &smpl;
 
+    std::vector<common_chat_msg> chat_msgs;
+
     // load the model and apply lora adapter, if any
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
     common_init_result llama_init = common_init_from_params(params);
 
-    model = llama_init.model;
-    ctx = llama_init.context;
+    model = llama_init.model.get();
+    ctx = llama_init.context.get();
 
     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n", __func__);
@@ -889,9 +889,6 @@ int main(int argc, char ** argv) {
 
     common_sampler_free(smpl);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     ggml_threadpool_free_fn(threadpool);
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index fd2b1c011..d48f51975 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -132,8 +132,8 @@ int main(int argc, char ** argv) {
     // load the target model
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
 
     // load the prompts from an external file if there are any
     if (params.prompt.empty()) {
@@ -416,9 +416,6 @@ int main(int argc, char ** argv) {
 
     llama_batch_free(batch);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     LOG("\n\n");
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 64a84607c..6bdc57f8e 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1987,8 +1987,9 @@ int main(int argc, char ** argv) {
     // load the model and apply lora adapter, if any
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
@@ -2023,9 +2024,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     llama_perf_context_print(ctx);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     return 0;
diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp
index a5c6fe7e5..f534b5eff 100644
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -151,8 +151,8 @@ int main(int argc, char ** argv) {
     // load the model
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
 
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
@@ -298,7 +298,5 @@ int main(int argc, char ** argv) {
 
     // clean up
     llama_batch_free(query_batch);
-    llama_free(ctx);
-    llama_free_model(model);
     llama_backend_free();
 }
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 2f0cf9baa..cd03661cf 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -30,8 +30,8 @@ int main(int argc, char ** argv) {
     // init
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
 
     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);
@@ -89,8 +89,6 @@ int main(int argc, char ** argv) {
         if (llama_decode(ctx, batch)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_batch_free(batch);
-            llama_free(ctx);
-            llama_free_model(model);
             return 1;
         }
         n_past += 1;
@@ -98,11 +96,8 @@ int main(int argc, char ** argv) {
 
     printf("\n\n");
 
-    // free old context
-    llama_free(ctx);
-
     // make new context
-    auto * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params));
+    llama_context * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params));
 
     llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
 
@@ -123,8 +118,6 @@ int main(int argc, char ** argv) {
 
         if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
-            llama_free(ctx2);
-            llama_free_model(model);
             return 1;
         }
 
@@ -148,8 +141,6 @@ int main(int argc, char ** argv) {
         if (llama_decode(ctx2, batch)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_batch_free(batch);
-            llama_free(ctx2);
-            llama_free_model(model);
             return 1;
         }
         n_past += 1;
@@ -157,15 +148,13 @@ int main(int argc, char ** argv) {
 
     printf("\n\n");
 
-    llama_free(ctx2);
-
     if (result0 != result1) {
         fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
         return 1;
     }
 
     // make new context
-    auto * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params));
+    llama_context * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params));
 
     llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
 
@@ -186,8 +175,6 @@ int main(int argc, char ** argv) {
 
         if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
-            llama_free(ctx3);
-            llama_free_model(model);
             return 1;
         }
 
@@ -204,8 +191,6 @@ int main(int argc, char ** argv) {
         const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0);
         if (ncopy != seq_store.size()) {
             fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
-            llama_free(ctx3);
-            llama_free_model(model);
             return 1;
         }
         fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
@@ -218,8 +203,6 @@ int main(int argc, char ** argv) {
         const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1);
         if (nset != seq_store.size()) {
             fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
-            llama_free(ctx3);
-            llama_free_model(model);
             return 1;
         }
         fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
@@ -239,8 +222,6 @@ int main(int argc, char ** argv) {
         if (llama_decode(ctx3, batch)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_batch_free(batch);
-            llama_free(ctx3);
-            llama_free_model(model);
             return 1;
         }
         n_past += 1;
@@ -253,8 +234,6 @@ int main(int argc, char ** argv) {
     llama_sampler_free(smpl3);
 
     llama_batch_free(batch);
-    llama_free(ctx3);
-    llama_free_model(model);
 
     if (result0 != result2) {
         fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index fa3682a92..67c802eca 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1491,11 +1491,16 @@ struct server_response {
 struct server_context {
     common_params params_base;
 
+    common_init_result llama_init;
+    common_init_result llama_init_dft;
+
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
+
     std::vector<common_lora_adapter_container> loras;
 
     llama_model * model_dft = nullptr;
+
     llama_context_params cparams_dft;
 
     llama_batch batch = {};
@@ -1519,21 +1524,6 @@ struct server_context {
     float slot_prompt_similarity = 0.0f;
 
     ~server_context() {
-        if (ctx) {
-            llama_free(ctx);
-            ctx = nullptr;
-        }
-
-        if (model) {
-            llama_free_model(model);
-            model = nullptr;
-        }
-
-        if (model_dft) {
-            llama_free_model(model_dft);
-            model_dft = nullptr;
-        }
-
         // Clear any sampling context
         for (server_slot & slot : slots) {
             common_sampler_free(slot.smpl);
@@ -1556,11 +1546,12 @@ struct server_context {
 
         params_base = params;
 
-        common_init_result llama_init = common_init_from_params(params_base);
+        llama_init = common_init_from_params(params_base);
 
-        model = llama_init.model;
-        ctx   = llama_init.context;
-        loras = llama_init.lora_adapters;
+        model = llama_init.model.get();
+        ctx   = llama_init.context.get();
+
+        loras = std::move(llama_init.lora_adapters);
 
         if (model == nullptr) {
             SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
@@ -1583,25 +1574,22 @@ struct server_context {
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel   = 1;
 
-            common_init_result llama_init_dft = common_init_from_params(params_dft);
+            llama_init_dft = common_init_from_params(params_dft);
 
-            model_dft = llama_init_dft.model;
+            model_dft = llama_init_dft.model.get();
 
             if (model_dft == nullptr) {
                 SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
                 return false;
             }
 
-            if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
                 SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
 
-                llama_free      (llama_init_dft.context);
-                llama_free_model(llama_init_dft.model);
-
                 return false;
             }
 
-            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());
 
             cparams_dft = common_context_params_to_llama(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
@@ -1609,9 +1597,6 @@ struct server_context {
             // force F16 KV cache for the draft model for extra performance
             cparams_dft.type_k = GGML_TYPE_F16;
             cparams_dft.type_v = GGML_TYPE_F16;
-
-            // the context is not needed - we will create one for each slot
-            llama_free(llama_init_dft.context);
         }
 
         return true;
diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp
index 8ca84f7af..9070c3512 100644
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -34,7 +34,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     llama_model * model_tgt = NULL;
-    llama_model * model_dft = NULL;
+    //llama_model * model_dft = NULL;
 
     llama_context * ctx_tgt = NULL;
     llama_context * ctx_dft = NULL;
@@ -42,8 +42,8 @@ int main(int argc, char ** argv) {
     // load the target model
     common_init_result llama_init_tgt = common_init_from_params(params);
 
-    model_tgt = llama_init_tgt.model;
-    ctx_tgt   = llama_init_tgt.context;
+    model_tgt = llama_init_tgt.model.get();
+    ctx_tgt   = llama_init_tgt.context.get();
 
     // load the draft model
     params.devices      = params.speculative.devices;
@@ -59,8 +59,8 @@ int main(int argc, char ** argv) {
     params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
     common_init_result llama_init_dft = common_init_from_params(params);
 
-    model_dft = llama_init_dft.model;
-    ctx_dft   = llama_init_dft.context;
+    //model_dft = llama_init_dft.model.get();
+    ctx_dft   = llama_init_dft.context.get();
 
     if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
         return 1;
@@ -251,12 +251,6 @@ int main(int argc, char ** argv) {
     common_sampler_free(smpl);
     common_speculative_free(spec);
 
-    llama_free(ctx_tgt);
-    llama_free_model(model_tgt);
-
-    llama_free(ctx_dft);
-    llama_free_model(model_dft);
-
     llama_backend_free();
 
     LOG("\n\n");
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index d4ad9751e..bc0b6813b 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -72,8 +72,9 @@ int main(int argc, char ** argv) {
 
     // load the target model
     common_init_result llama_init_tgt = common_init_from_params(params);
-    model_tgt = llama_init_tgt.model;
-    ctx_tgt = llama_init_tgt.context;
+
+    model_tgt = llama_init_tgt.model.get();
+    ctx_tgt   = llama_init_tgt.context.get();
 
     // load the draft model
     params.devices = params.speculative.devices;
@@ -85,8 +86,9 @@ int main(int argc, char ** argv) {
 
     params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
     common_init_result llama_init_dft = common_init_from_params(params);
-    model_dft = llama_init_dft.model;
-    ctx_dft = llama_init_dft.context;
+
+    model_dft = llama_init_dft.model.get();
+    ctx_dft   = llama_init_dft.context.get();
 
     const bool vocab_type_tgt = llama_vocab_type(model_tgt);
     LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);
@@ -631,12 +633,6 @@ int main(int argc, char ** argv) {
 
     llama_batch_free(batch_dft);
 
-    llama_free(ctx_tgt);
-    llama_free_model(model_tgt);
-
-    llama_free(ctx_dft);
-    llama_free_model(model_dft);
-
     llama_backend_free();
 
     LOG("\n\n");
diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index 7f36b80f0..522f5e881 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -458,8 +458,9 @@ int main(int argc, char ** argv) {
     llama_context * ctx_cts = NULL;
 
     common_init_result llama_init_ttc = common_init_from_params(params);
-    model_ttc = llama_init_ttc.model;
-    ctx_ttc = llama_init_ttc.context;
+
+    model_ttc = llama_init_ttc.model.get();
+    ctx_ttc   = llama_init_ttc.context.get();
 
     // TODO: refactor in a common struct
     params.model     = params.vocoder.model;
@@ -470,8 +471,9 @@ int main(int argc, char ** argv) {
     params.embedding = true;
 
     common_init_result llama_init_cts = common_init_from_params(params);
-    model_cts = llama_init_cts.model;
-    ctx_cts = llama_init_cts.context;
+
+    model_cts = llama_init_cts.model.get();
+    ctx_cts   = llama_init_cts.context.get();
 
     std::vector<common_sampler *> smpl(n_parallel);
     for (int i = 0; i < n_parallel; ++i) {
@@ -920,12 +922,6 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
 
     LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str());
 
-    llama_free(ctx_ttc);
-    llama_free_model(model_ttc);
-
-    llama_free(ctx_cts);
-    llama_free_model(model_cts);
-
     llama_backend_free();
 
     return 0;
diff --git a/include/llama-cpp.h b/include/llama-cpp.h
index daa04d4d8..1500cb2fc 100644
--- a/include/llama-cpp.h
+++ b/include/llama-cpp.h
@@ -20,6 +20,11 @@ struct llama_sampler_deleter {
     void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
 };
 
+struct llama_lora_adapter_deleter {
+    void operator()(llama_lora_adapter * lora_adapter) { llama_lora_adapter_free(lora_adapter); }
+};
+
 typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
 typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
 typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
+typedef std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter> llama_lora_adapter_ptr;