llava : use logger in llava-cli (#6797)

This change removes printf() logging so that llava-cli is shell-scriptable.
Author: Justine Tunney
Date:   2024-04-21 08:19:04 -04:00 (committed by GitHub)
parent b97bc3966e
commit 89b0bf0d5d
3 changed files with 111 additions and 96 deletions
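
For context, LOG_TEE (defined in the repo's common/log.h) writes a message to the current log target and tees it to stderr, so stdout carries only model output and llava-cli composes cleanly in shell pipelines. Below is a minimal sketch of that tee idea, assuming nothing beyond the C standard library; TEE_LOG and g_logfile are hypothetical stand-ins for illustration, not the actual common/log.h API:

// Sketch of tee-style logging: diagnostics go to stderr and a log file,
// never to stdout, so stdout stays parseable by scripts.
// TEE_LOG and g_logfile are hypothetical names, not the real common/log.h API.
#include <cstdio>

static FILE * g_logfile = nullptr;

#define TEE_LOG(...)                          \
    do {                                      \
        fprintf(stderr, __VA_ARGS__);         \
        if (g_logfile) {                      \
            fprintf(g_logfile, __VA_ARGS__);  \
            fflush(g_logfile);                \
        }                                     \
    } while (0)

int main() {
    g_logfile = fopen("llava.log", "a");
    TEE_LOG("diagnostic: model loading...\n"); // goes to stderr + log file
    printf("model output only\n");             // stdout stays clean for pipes
    if (g_logfile) { fclose(g_logfile); }
    return 0;
}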

examples/llava/clip.cpp

@@ -3,6 +3,7 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
+#include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -23,7 +24,6 @@
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
-#include <iostream>
 #include <map>
 #include <regex>
 #include <stdexcept>
@@ -145,7 +145,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
 static int get_key_idx(const gguf_context * ctx, const char * key) {
     int i = gguf_find_key(ctx, key);
     if (i == -1) {
-        fprintf(stderr, "key %s not found in file\n", key);
+        LOG_TEE("key %s not found in file\n", key);
         throw std::runtime_error(format("Missing required key: %s", key));
     }
@@ -247,7 +247,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
 static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
     size_t tensor_size = ggml_nbytes(tensor);
-    printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+    LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
             prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
             tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
 }
@@ -265,7 +265,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
 static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
     std::ofstream file(filename, std::ios::binary);
     if (!file.is_open()) {
-        std::cerr << "Failed to open file for writing: " << filename << std::endl;
+        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
         return;
     }
@@ -284,7 +284,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
 static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
     std::ofstream file(filename, std::ios::binary);
     if (!file.is_open()) {
-        std::cerr << "Failed to open file for writing: " << filename << std::endl;
+        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
         return;
     }
@@ -515,7 +515,7 @@ struct clip_ctx {
 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
     if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
         return nullptr;
     }
@@ -879,21 +879,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         const int idx_name = gguf_find_key(ctx, KEY_NAME);
         if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
             const std::string name = gguf_get_val_str(ctx, idx_name);
-            printf("%s: model name: %s\n", __func__, name.c_str());
+            LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
         }
-        printf("%s: description: %s\n", __func__, description.c_str());
-        printf("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
-        printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
-        printf("%s: n_tensors: %d\n", __func__, n_tensors);
-        printf("%s: n_kv: %d\n", __func__, n_kv);
-        printf("%s: ftype: %s\n", __func__, ftype_str.c_str());
-        printf("\n");
+        LOG_TEE("%s: description: %s\n", __func__, description.c_str());
+        LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+        LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+        LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
+        LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
+        LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
+        LOG_TEE("\n");
     }
     const int n_tensors = gguf_get_n_tensors(ctx);
     // kv
     const int n_kv = gguf_get_n_kv(ctx);
-    printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+    LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
         __func__, n_kv, n_tensors, fname);
     {
         std::map<enum ggml_type, uint32_t> n_type;
@@ -904,7 +904,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             n_type[type]++;
         }
-        printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
         for (int i = 0; i < n_kv; i++) {
             const char * name = gguf_get_key(ctx, i);
             const enum gguf_type type = gguf_get_kv_type(ctx, i);
@@ -920,7 +920,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             }
             replace_all(value, "\n", "\\n");
-            printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+            LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
         }
         // print type counts
@@ -929,7 +929,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                 continue;
             }
-            printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+            LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
         }
     }
@@ -944,7 +944,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             size_t tensor_size = ggml_nbytes(cur);
             model_size += tensor_size;
             if (verbosity >= 3) {
-                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
                     __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
             }
         }
@@ -971,18 +971,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 #ifdef GGML_USE_CUDA
     new_clip->backend = ggml_backend_cuda_init(0);
-    printf("%s: CLIP using CUDA backend\n", __func__);
+    LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
 #endif
 #ifdef GGML_USE_METAL
     new_clip->backend = ggml_backend_metal_init();
-    printf("%s: CLIP using Metal backend\n", __func__);
+    LOG_TEE("%s: CLIP using Metal backend\n", __func__);
 #endif
     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
-        printf("%s: CLIP using CPU backend\n", __func__);
+        LOG_TEE("%s: CLIP using CPU backend\n", __func__);
     }
     // model size and capabilities
@@ -1006,15 +1006,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
         if (verbosity >= 1) {
-            printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
-            printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
-            printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
-            printf("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
-            printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+            LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
+            LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
+            LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
+            LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+            LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
         }
     }
-    printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+    LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
     // load tensors
     {
@@ -1027,7 +1027,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->ctx_data = ggml_init(params);
         if (!new_clip->ctx_data) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+            LOG_TEE("%s: ggml_init() failed\n", __func__);
             clip_free(new_clip);
             gguf_free(ctx);
             return nullptr;
@@ -1035,7 +1035,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         auto fin = std::ifstream(fname, std::ios::binary);
         if (!fin) {
-            printf("cannot open model file for loading tensors\n");
+            LOG_TEE("cannot open model file for loading tensors\n");
             clip_free(new_clip);
             gguf_free(ctx);
             return nullptr;
@@ -1057,7 +1057,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
             fin.seekg(offset, std::ios::beg);
             if (!fin) {
-                printf("%s: failed to seek for tensor %s\n", __func__, name);
+                LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
                 clip_free(new_clip);
                 gguf_free(ctx);
                 return nullptr;
@@ -1128,23 +1128,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
     if (verbosity >= 2) {
-        printf("\n%s: vision model hparams\n", __func__);
-        printf("image_size %d\n", hparams.image_size);
-        printf("patch_size %d\n", hparams.patch_size);
-        printf("v_hidden_size %d\n", hparams.hidden_size);
-        printf("v_n_intermediate %d\n", hparams.n_intermediate);
-        printf("v_projection_dim %d\n", hparams.projection_dim);
-        printf("v_n_head %d\n", hparams.n_head);
-        printf("v_n_layer %d\n", hparams.n_layer);
-        printf("v_eps %f\n", hparams.eps);
-        printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
-        printf("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
-        printf("v_image_grid_pinpoints: ");
+        LOG_TEE("\n%s: vision model hparams\n", __func__);
+        LOG_TEE("image_size %d\n", hparams.image_size);
+        LOG_TEE("patch_size %d\n", hparams.patch_size);
+        LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
+        LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
+        LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
+        LOG_TEE("v_n_head %d\n", hparams.n_head);
+        LOG_TEE("v_n_layer %d\n", hparams.n_layer);
+        LOG_TEE("v_eps %f\n", hparams.eps);
+        LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+        LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+        LOG_TEE("v_image_grid_pinpoints: ");
         for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
-            printf("%d ", hparams.image_grid_pinpoints[i]);
+            LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
         }
-        printf("\n");
-        printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+        LOG_TEE("\n");
+        LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
     }
@@ -1155,7 +1155,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
         vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
     } catch(const std::exception& e) {
-        fprintf(stderr, "%s: failed to load vision model tensors\n", __func__);
+        LOG_TEE("%s: failed to load vision model tensors\n", __func__);
     }
     // LLaVA projection
@@ -1184,7 +1184,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         } catch (std::runtime_error & e) { }
         try {
             vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
-            // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__);
+            // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
         } catch (std::runtime_error & e) { }
     } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
         // MobileVLM projection
@@ -1264,7 +1264,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
         ggml_gallocr_reserve(new_clip->compute_alloc, gf);
         size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+        LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
     }
     return new_clip;
@@ -1304,7 +1304,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
     int nx, ny, nc;
     auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
    if (!data) {
-        fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
+        LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
         return false;
     }
     build_clip_img_from_data(data, nx, ny, img);
@@ -1316,7 +1316,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
     int nx, ny, nc;
     auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
     if (!data) {
-        fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
+        LOG_TEE("%s: failed to decode image bytes\n", __func__);
         return false;
     }
     build_clip_img_from_data(data, nx, ny, img);
@@ -1506,7 +1506,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
         int downscaled_height = static_cast<int>(original_height * scale);
         int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
         int wasted_resolution = (width * height) - effective_resolution;
-        // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
         if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
             max_effective_resolution = effective_resolution;
             min_wasted_resolution = wasted_resolution;
@@ -1545,7 +1545,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
         return false;
     }
     auto & params = ctx->vision_model.hparams;
@@ -1622,7 +1622,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
     }
     for (size_t i = 0; i < patches.size(); i++) {
-        // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+        // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
         clip_image_u8_free(patches[i]);
     }
@@ -1765,7 +1765,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {
 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
         return false;
     }
@@ -1777,7 +1777,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
     if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
         return false;
     }
@@ -1939,7 +1939,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
         new_type = type;
         if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
             new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-            // fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
+            // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
         }
         const size_t n_elms = ggml_nelements(cur);
         float * f32_data;
@@ -1958,7 +1958,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                 f32_data = (float *)conv_buf.data();
                 break;
             default:
-                printf("Please use an input file in f32 or f16\n");
+                LOG_TEE("Please use an input file in f32 or f16\n");
                 gguf_free(ctx_out);
                 return false;
         }
@@ -1985,7 +1985,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
             fout.put(0);
         }
-        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+        LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
             orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
     }
@@ -2001,8 +2001,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
     gguf_free(ctx_out);
     {
-        printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+        LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
     }
     return true;

examples/llava/llava-cli.cpp

@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "log.h"
 #include "common.h"
 #include "clip.h"
 #include "llava.h"
@@ -18,7 +19,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
             n_eval = n_batch;
         }
         if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }
         *n_past += n_eval;
@@ -73,7 +74,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
     size_t img_base64_str_start, img_base64_str_end;
     find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
     if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+        LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
         return NULL;
     }
@@ -87,7 +88,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
     auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
     if (!embed) {
-        fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
+        LOG_TEE("%s: could not load image from base64 string.\n", __func__);
         return NULL;
     }
@@ -112,8 +113,8 @@ struct llava_context {
 };
 static void show_additional_info(int /*argc*/, char ** argv) {
-    fprintf(stderr, "\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    fprintf(stderr, " note: a lower temperature value like 0.1 is recommended for better quality.\n");
+    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
@@ -123,18 +124,18 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
     auto prompt = params->prompt;
     if (prompt_contains_image(prompt)) {
         if (!params->image.empty()) {
-            fprintf(stderr, "using base64 encoded image instead of command line image path\n");
+            LOG_TEE("using base64 encoded image instead of command line image path\n");
         }
         embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
         if (!embed) {
-            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
+            LOG_TEE("%s: can't load image from prompt\n", __func__);
             return NULL;
         }
         params->prompt = remove_image_from_prompt(prompt);
     } else {
         embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
         if (!embed) {
-            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
+            LOG_TEE("%s: is %s really an image file?\n", __func__, params->image.c_str());
            return NULL;
         }
     }
@@ -153,18 +154,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
         system_prompt = prompt.substr(0, image_pos);
         user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        printf("system_prompt: %s\n", system_prompt.c_str());
+        LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
-        printf("user_prompt: %s\n", user_prompt.c_str());
+        LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     } else {
@@ -174,7 +175,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     }
@@ -185,7 +186,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     // generate the response
-    fprintf(stderr, "\n");
+    LOG_TEE("\n");
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
     std::string response = "";
@@ -224,7 +225,7 @@ static struct llava_context * llava_init(gpt_params * params) {
     llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        LOG_TEE("%s: error: unable to load model\n" , __func__);
         return NULL;
     }
@@ -234,7 +235,7 @@ static struct llava_context * llava_init(gpt_params * params) {
     llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
     if (ctx_llama == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
         return NULL;
     }
@@ -257,6 +258,12 @@ static void llava_free(struct llava_context * ctx_llava) {
     llama_backend_free();
 }
+
+static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    LOG_TEE("%s", text);
+}
 int main(int argc, char ** argv) {
     ggml_time_init();
@@ -266,6 +273,14 @@ int main(int argc, char ** argv) {
         show_additional_info(argc, argv);
         return 1;
     }
+
+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("llava", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
+#endif // LOG_DISABLE_LOGS
+
     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
         gpt_print_usage(argc, argv, params);
         show_additional_info(argc, argv);
@@ -274,7 +289,7 @@ int main(int argc, char ** argv) {
     auto ctx_llava = llava_init(&params);
     if (ctx_llava == NULL) {
-        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
+        LOG_TEE("%s: error: failed to init llava\n", __func__);
         return 1;
     }

examples/llava/llava.cpp

@@ -54,7 +54,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
         int downscaled_height = static_cast<int>(original_height * scale);
         int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
         int wasted_resolution = (width * height) - effective_resolution;
-        // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
         if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
             max_effective_resolution = effective_resolution;
             min_wasted_resolution = wasted_resolution;
@@ -154,13 +154,13 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
     if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
         if (newline_tmp->buffer == NULL) {
-            printf("newline_tmp tensor buffer is NULL\n");
+            LOG_TEE("newline_tmp tensor buffer is NULL\n");
         }
         ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
     } else {
         model.newline->data = newline_tmp->data;
         if (model.newline->data == NULL) {
-            printf("newline_tmp tensor data is NULL\n");
+            LOG_TEE("newline_tmp tensor data is NULL\n");
         }
     }
@@ -224,7 +224,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     img_res_v.size = 0;
     img_res_v.data = nullptr;
     if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
-        fprintf(stderr, "%s: unable to preprocess image\n", __func__);
+        LOG_TEE("%s: unable to preprocess image\n", __func__);
         delete[] img_res_v.data;
         return false;
     }
@@ -239,7 +239,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
         delete[] img_res_v.data;
         if (!encoded) {
-            fprintf(stderr, "Unable to encode image\n");
+            LOG_TEE("Unable to encode image\n");
             return false;
         }
@@ -252,12 +252,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
             const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
             if (!encoded) {
-                fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                 return false;
             }
         }
         const int64_t t_img_enc_batch_us = ggml_time_us();
-        printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
         const int32_t * image_grid = clip_image_grid(ctx_clip);
@@ -290,12 +290,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
     }
-    printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
     const int64_t t_img_enc_end_us = ggml_time_us();
     float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
-    printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
     return true;
 }
@@ -305,7 +305,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
     int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
     auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
     if (n_image_embd != n_llama_embd) {
-        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+        LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
         return false;
     }
     return true;
@@ -314,13 +314,13 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
     float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
     if (!image_embd) {
-        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
+        LOG_TEE("Unable to allocate memory for image embeddings\n");
         return false;
     }
     int n_img_pos;
     if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
-        fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
+        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
         free(image_embd);
         return false;
     }
@@ -340,7 +340,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         }
         llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
         if (llama_decode(ctx_llama, batch)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
+            LOG_TEE("%s : failed to eval\n", __func__);
             return false;
         }
         *n_past += n_eval;
@@ -352,7 +352,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     clip_image_u8 * img = clip_image_u8_init();
     if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
         clip_image_u8_free(img);
-        fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
+        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
         return NULL;
     }
@@ -361,7 +361,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
     bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
     if (!image_embed_result) {
         clip_image_u8_free(img);
-        fprintf(stderr, "%s: coulnd't embed the image\n", __func__);
+        LOG_TEE("%s: coulnd't embed the image\n", __func__);
         return NULL;
     }
@@ -375,7 +375,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
 static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
     auto file = fopen(path, "rb");
     if (file == NULL) {
-        fprintf(stderr, "%s: can't read file %s\n", __func__, path);
+        LOG_TEE("%s: can't read file %s\n", __func__, path);
         return false;
     }
@@ -385,7 +385,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
     auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
     if (buffer == NULL) {
-        fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
         perror("Memory allocation error");
         fclose(file);
         return false;
@@ -410,7 +410,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
     long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
     if (!loaded) {
-        fprintf(stderr, "%s: failed to load %s\n", __func__, image_path);
+        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
         return NULL;
     }