Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-11-13 14:29:52 +00:00)
Add cvector-generator example (#7514)
* add control-vector-generator
* calc diff
* add comments
* proof-of-concept stdlib implementation: implements PCA and file writing using mostly standard libraries; the output is recognized as a functional control vector, but outputs gibberish
* param parsing, refactor, comments: added basic command-line parameters for the output file and one positive/negative prompt each; refactored some messy code in the PCA computation and GGUF exporting; left a number of comments regarding further work needed
* example template completions: implements an example template set built from the positive/negative prompts, like the control vector Python implementation
* add multi prompts, multi-thread for PCA
* fix mem error
* add debugs
* fix matrix transpose multiplication (you have got to be kidding me)
* preliminary template/multiprompt support: the model is running out of context and that ought to be fixed (it segfaults), but other than that it looks goodish
* fix zero output & param parsing, functional templating: fixed a bug where the output file had no tensor data / was all zero; fixed a bug where single-hyphen flags were not being correctly parsed; implements creation of templated prompts from input (still need to adapt based on the model)
* fix square_diff matmul index range and CRLF->LF line endings: fixed a logic error where square_diff would not multiply all rows; fixed a formatting error where the provided completions.txt had CRLF line endings
* add command-line args for num threads, num completions file lines, always reload model: refactored a few things and did what the commit message says on the tin
* code aestheticization
* fix compiler warnings
* in-series multithreading for prompt embedding?: added commented-out code attempting to start implementing multithreading for embedding in main
* remove unnecessary multithreading
* interim fix for memory leak
* translated everything but PCA (I think)
* tentatively translate the rest
* fix ggml errors and make new ones: at least it compiles and runs
* fix cb_eval
* temporary commit while I move dev environments: it finally outputs a functioning control vector; "functioning" in the sense that it can be loaded and clearly has the right idea, but it makes the model incoherent
* update debug statements
* pre-tokenize so we can allocate correct memory to ctx_diffs_wrapped
* update comments
* (wip) refactor
* clean up PCA ggml implementation
* fix shape of v_diff_original
* add n_batch for pca
* working version
* remember to copy back the last_eigenvector
* fix n_completions
* bring back n_completions
* default n_pca_batch to 20
* fix macos build
* add to makefile all targets
* use ggml_format_name
* add readme
* fix .editorconfig
* use ggml_backend_tensor_copy
* attempt to fix compile problem on mac
* fix compile warning
* reuse allocr
* move param parser to common
* better error handling
* clean up a bit
* add print_usage
* shorten help msg
* beautify help msg
* escape prompt by default
* change compile target to llama-cvector-generator
* typo
* disable GPU for PCA
* code style

---------

Co-authored-by: Christian Zhou-Zheng <christianzhouzheng@gmail.com>
parent 7b2f4a7d19
commit 0c7b3595b9
.editorconfig
@@ -26,3 +26,6 @@ indent_size = 2
 
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
+
+[examples/cvector-generator/*.txt]
+insert_final_newline = unset
Makefile (+5)
@@ -38,6 +38,7 @@ BUILD_TARGETS = \
 	llama-tokenize \
 	llama-train-text-from-scratch \
 	llama-vdot \
+	llama-cvector-generator \
 	tests/test-c.o

 # Binaries only useful for tests
@@ -922,6 +923,10 @@ llama-eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
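With the rule above in place, the example binary can be built straight from the repository root using the existing Makefile workflow:

```sh
make llama-cvector-generator
```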
common/common.cpp
@@ -1576,6 +1576,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             return true;
         }
         params.out_file = argv[i];
+        params.cvector_outfile = argv[i];
         return true;
     }
     if (arg == "-ofreq" || arg == "--output-frequency") {
@@ -1610,6 +1611,55 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.i_chunk = std::stoi(argv[i]);
         return true;
     }
+    // cvector params
+    if (arg == "--completions-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.cvector_completions_file = argv[i];
+        return true;
+    }
+    if (arg == "--positive-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.cvector_positive_file = argv[i];
+        return true;
+    }
+    if (arg == "--negative-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.cvector_negative_file = argv[i];
+        return true;
+    }
+    if (arg == "--completions") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_completions = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--pca-batch") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_pca_batch = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--pca-iter") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_pca_iterations = std::stoi(argv[i]);
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1931,6 +1981,16 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "logging", "       --log-append",           "Don't truncate the old log file." });
 #endif // LOG_DISABLE_LOGS

+    options.push_back({ "cvector" });
+    options.push_back({ "cvector", "-o, --output FNAME",            "output file (default: '%s')", params.cvector_outfile.c_str() });
+    options.push_back({ "cvector", "       --positive-file FNAME",  "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() });
+    options.push_back({ "cvector", "       --negative-file FNAME",  "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() });
+    options.push_back({ "cvector", "       --completions-file FNAME",
+                                                                    "completions file (default: '%s')", params.cvector_completions_file.c_str() });
+    options.push_back({ "cvector", "       --completions N",        "number of lines of completions file to use (default: %d)", params.n_completions });
+    options.push_back({ "cvector", "       --batch-pca N",          "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
+    options.push_back({ "cvector", "       --iter-pca N",           "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+
     printf("usage: %s [options]\n", argv[0]);

     for (const auto & o : options) {
common/common.h
@@ -232,6 +232,15 @@ struct gpt_params {

     bool process_output = false; // collect data for the output tensor
     bool compute_ppl    = true;  // whether to compute perplexity
+
+    // cvector-generator params
+    int n_completions = 64;
+    int n_pca_batch = 20;
+    int n_pca_iterations = 1000;
+    std::string cvector_outfile          = "control_vector.gguf";
+    std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
+    std::string cvector_positive_file    = "examples/cvector-generator/positive.txt";
+    std::string cvector_negative_file    = "examples/cvector-generator/negative.txt";
 };

 void gpt_params_handle_model_default(gpt_params & params);
examples/CMakeLists.txt
@@ -12,6 +12,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

 if (EMSCRIPTEN)
 else()
+    add_subdirectory(cvector-generator)
     add_subdirectory(baby-llama)
     add_subdirectory(batched-bench)
     add_subdirectory(batched)
examples/cvector-generator/CMakeLists.txt (new file, +5)
@@ -0,0 +1,5 @@
set(TARGET llama-cvector-generator)
add_executable(${TARGET} cvector-generator.cpp pca.hpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/cvector-generator/README.md (new file, +34)
@@ -0,0 +1,34 @@
# cvector-generator

This example demonstrates how to generate a control vector using gguf models.

Related PRs:
- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)

## Examples

```sh
# CPU only
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf

# With GPU
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99

# With advanced options
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100

# To see help message
./cvector-generator -h
# Then, have a look at "cvector" section
```

## Tips and tricks

If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example:

```
<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
<|im_start|>system\nYou are in a very good mood today<|im_end|>
```
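Once `control_vector.gguf` has been written, it can be applied at inference time via the control-vector flags introduced in the first PR listed in the README; a sketch (the binary name and the scale value 0.8 are illustrative, not part of this commit):

```sh
./llama-cli -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf \
    --control-vector-scaled ./control_vector.gguf 0.8 \
    -p "How are you?"
```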
examples/cvector-generator/completions.txt (new file, +582)
@@ -0,0 +1,582 @@

That game
I can see
Hmm, this
I can relate to
Who is
I understand the
Ugh,
What the hell was
Hey, did anyone
Although
Thank you for choosing
What are you
Oh w
How dare you open
It was my pleasure
I'm hon
I appreciate that you
Are you k
Whoever left this
It's always
Ew,
Hey, I l
Hello? Is someone
I understand that
That poem
Aww, poor
Hey, it
Alright, who
I didn't
Well, life
The document
Oh no, this
I'm concerned
Hello, this is
This art
Hmm, this drink
Hi there!
It seems
Is
Good
I can't
Ex
Who are
I can see that
Wow,
Today is a
Hey friend
Sometimes friends
Oh, this old
The weather outside
This place is sur
I appreciate your input
Thank you for the
Look at
I'm disappoint
To my
How dare you
That's an
This piece of art
Eww
This park is
This is incredible
Oh no, someone
Exc
Well, it'
I warned
Hey, I understand
Hey, I saw
How dare you go
What the he
Hey
It's
Hello? Hello?
It
Oh no!
This is the perfect
Good morning,
Oh no, there
It's so
Yeah
Uh,
Hello everyone
Who turned off
The weather
Who'
Hey, this
Wait,
Eww, gross
Excuse
It seems like you
Thank you so
What happened?
Oh my g
I am deeply sad
I war
Okay, let'
Hey, that
That was a beautiful
Oh no! That
What happened
Hey there
The artist'
What?!
Hey, it'
I am disappoint
It seems like
Oh no! The
This park is a
If you
Yes! I did
It sounds
What
Who is it
Hmm, that
That's strange
Yeah, that was
That's interesting
This park
What the hell
Who is that
I feel like my
Oh well
What the hell is
Hello? Hello
To my dearest
Bless you!\"
Thank you for
Oh, looks like
Can you please
This place is
Eww, what
Bless you
Is everything
Hey, I just
Whoever left these
Well, that'
I feel
Hey, do you
It's sad
Oh no, it
Hey, that'
Oh my god,
Thank you,
Hello little one,
I apolog
Hey team, I
How dare you read
Who is this and
Whoever left
Hi there! W
A
If you have
I was
U
Bless
Well, this
Oh, I'
It's a
Eww,
Is everything okay?
Oh, I
Hello, can you
Al
That was a great
What are
I understand that not
Oh no, not
Who is it?\"
Hey, can we
Whoever is taking
I would love to
Hey, I noticed
Hey, could
I understand that there
Hello?
D
Oh man, I
Thank you so much
Oh no, my
Dear [Name
Uh
I remember
Hey, who
Well, it
Are you
I understand that it
Hey, is
I would
Who is this
Excuse me
Alright
I am thrilled
Sometimes friends have
Who the
It's interesting
I would love
E
Hello? Is anyone
Well, this is
This place
Well,
I warned you
Hey, watch where
Oh my
That'
Sometimes friends have different
I understand that everyone
What?
What do these notes
I can relate
I'm not
I understand
To my dear
Guys
Well
Hey, I appreciate
Wow, what
Dear
That melody
Who the hell
Today is
Hello little
Wow, look
That's great
Love is never wrong
I'm having
Whoa, did
Ugh
Can you please provide
I miss you,
I feel uncom
I know
Ugh, this
Hey, watch
Oh great, a
I didn
Okay
That game of char
Oh
I appreciate
Who's there
I am so
Oh great, someone
Hey, could you
I remember wondering
Wait, what?
What do
Hello? Can
Hey there,
That game of
This is incred
Oh my gosh
Oh great, f
I appreciate your
It sounds like
What the heck
Okay, I understand
Ew
I understand that this
Uh, hi
Hi everyone!
What the hell?
Thank you for your
Oh no, the
Wow, I
Who turned
Dear [
Whoever
This is a
Whoa, he
What in the world
Although the physical
Hello, who is
That's amaz
Hey, I know
Okay, that
Hi everyone
Hey, is everything
I understand your fr
Oh no, poor
Oh, look
Good morning
Ew, gross
Oh no, did
Look at the family
Hey team
Yes!
Hey, can I
Okay, that'
It's great
Love is
Hey, what
Good morning, world
Who is it?
That poem really reson
I
That's
I understand the task
Gu
Hello? Who'
This postcard is
Whoa,
Oh, that
I understand that I
Whoever is
Hello? Who is
I'm really
Wow, this
Can
This artwork really
This is a shame
I miss you too
Who are you?
Today is a difficult
Hey, just
Are you okay
I am
Hi,
Wow, that
Hey there! Can
Okay, stay
Oh great, just
Yeah,
Hello? Can you
Oh, looks
Thank you for sharing
I'm glad
Hey, is that
Hmm
It was my
It sounds like you
Wow, your
I was promised certain
That was such a
Thank
Excuse you
That was
Hey team,
I feel un
It was
What'
Hey friend, I
How
Saying goodbye
That
It's heart
How dare
Oh,
Hello, may
What's this
Thank you for recogn
Aww, that
Oh, I remember
Hmm, that'
I miss
I know this
Wait
Is everything okay
Who is that person
Wow, you
Oh great
I'm sad
Wow, the
I am very disappoint
Who turned off the
I understand that things
I'm very
Hi
That's very
Okay, I
Oh no,
Wow, there
What's wrong
I apologize for
Hey, I
Can I help you
Oh, I didn
Alright,
Oh wow,
Oh my goodness
I know this event
What in the
Saying
Yeah, that
Guys, I
Hey, this v
This post
Are
Hey, can
Hello? Is
I can only imagine
Oh, that sounds
Hey, is anyone
I am disappointed
Hello,
Hey everyone, I
That was such
It's okay
The artist
Whoa
I understand that mistakes
Can I help
Who
Hi everyone! I
Hey, can you
Wow, how
Today
Oh no, I
Oh well, I
Well, that
This is the
Yes! I finally
Hey there little
Hello everyone!
Love is never
Look at the
This postcard
Oh great,
Can I
Hmm, this is
I understand your
Oh, look at
B
I'm so
Whoa, this
W
Oh, this
Sometimes
This piece of
What the
That was a
Hey, do
Oh no
Whoa, what
I feel like I
The documentary
Hello
Hello little one
I understand that my
Eww, that
Wow, an
Yes! Finally,
Although the physical location
Whoever is watching
That movie
I remember wondering about
Hey there, little
Who's
Hello, who
Hello everyone! Thank
Hello, can
That's too
Hey, just wanted
Hey there, I
Saying good
Hey there!
Who is there?
Oh my good
I am very
Oh no, what
Wow, thank
I was promised
Hi, is
Hey, I'
Guys, the
Oh no, that
Who is there
Hello, this
That movie really touched
If you have something
The documentary was
I'm starting
Are you kidd
That movie really
Hey everyone,
Thank you for considering
I didn'
Yes! I
Can you
Oh my god
Hey, whoever
That melody really
Thank you, little
Hello, may I
Look
Wow, we
It looks
What do these
Oh wow
I apologize
What are you all
It's such
It's clear
Hey, I was
Hey friend,
I can only
The weather outside is
Eww, this
I miss you
Wow
Aww,
Hi, is there
This artwork
Okay,
Oh well,
This
I'
Say
Hey there little gu
Hmm,
Whoa, who
I am thr
Oh man
Okay, stay calm
I'm happy
Oh, this cur
Oh man,
I'm sorry
Hello? Who
What?! That
This piece
Hey everyone
That's so
Are you okay?
What happened? Where
Hi there
The
Who the hell entered
I can
Guys,
What's
What in
It's important
I'm
I'm coming
It'
Yes! Finally
Wait, what
Wow, reading
I'm surprised
Hey, did
Hey,
Okay, let
I understand that you
Who the hell threw
Eww, who
Thank you for thinking
Who is this?\"
I am deeply
Thank you for including
Oh no, an
It looks like you
Aww
I'm confused
Wow, it
That poem really
Yes
Hey there, is
Hey, what'
Thank you for remember
To
This is
Thank you for making
I can'
That mel
Wow, they
I feel like
Although the
Who are you
Love
If
What the hell are
I am so sad
Oh, I found
Thank you
It looks like
Well, life is
I appreciate that
The artist's
Whoa, that
It's never
examples/cvector-generator/cvector-generator.cpp (new file, +499)
@@ -0,0 +1,499 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include "pca.hpp"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include <cstdio>
#include <cstring>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <climits>


//////////////////////////////////////////////////
// utils

template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
    std::string ret;
    for (; begin != end; ++begin) {
        ret += llama_token_to_piece(ctx, *begin);
    }

    return ret;
}

static void print_usage(int argc, char ** argv, const gpt_params & params) {
    gpt_params_print_usage(argc, argv, params);

    printf("\nexample usage:\n");
    printf("\n    CPU only:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
    printf("\n    with GPU:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
    printf("\n    advanced:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
    printf("\n");
}

//////////////////////////////////////////////////


// cb_eval is reused for each pair of positive - negative prompts
struct callback_data {
    ggml_context * ctx_ggml = nullptr;   // holds v_pos, v_neg, v_diff_filtered

    int n_layers = 0;
    int n_tokens = 0;
    bool is_eval_pos = true;

    // each element of the vector corresponds to one layer
    std::vector<struct ggml_tensor *> v_pos;           // vector of matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_neg;           // vector of matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows may be different for each layer

    // save a tensor into either v_pos or v_neg (decided by is_eval_pos)
    void save_tensor_for_layer(struct ggml_tensor * t) {
        GGML_ASSERT(t->type == GGML_TYPE_F32);

        if (ctx_ggml == nullptr) {
            // alloc a new ctx_ggml if needed
            struct ggml_init_params params_ggml = {
                /*.mem_size   =*/ ggml_tensor_overhead() * n_layers * 3u,
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ctx_ggml = ggml_init(params_ggml);
        }

        // copy tensor data
        auto n_bytes = ggml_nbytes(t);
        struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
        t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
        ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
        ggml_set_name(t_layer, ggml_get_name(t));
        //print_debug_tensor(t_layer);

        if (is_eval_pos) {
            v_pos.push_back(t_layer);
        } else {
            v_neg.push_back(t_layer);
        }
    }

    // calculate diff (v_pos - v_neg) and place the result back to v_pos
    // all zero rows in the diff tensor will also be removed
    // NOTE: the final layer is ignored. we only have (n_layers - 1) to process
    std::vector<struct ggml_tensor *> calc_diff() {
        for (size_t il = 0; il < v_pos.size(); il++) {
            float * a = (float *) v_pos[il]->data;
            float * b = (float *) v_neg[il]->data;
            size_t n_elem = ggml_nelements(v_pos[il]);
            for (size_t j = 0; j < n_elem; j++) {
                a[j] -= b[j];
            }
            //print_debug_tensor(v_pos[il]);
            auto diff_filtered = filter_nonzero_rows(v_pos[il]);
            v_diff_filtered.push_back(diff_filtered);
        }
        return v_diff_filtered; // for convenience, we return the resulting std::vector
    }

    // delete zero rows from a given 2D tensor
    struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
        //printf("filter_nonzero_rows\n");
        auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
            // check if the given row contains only zero elements
            int n_cols = t->ne[0]; // hint: should be equal to n_embd
            for (int col = 0; col < n_cols; ++col) {
                if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
                    return false;
                }
            }
            return true;
        };
        std::vector<int> rows_to_copy; // the idx of the non-zero rows (to be copied to rows of diff_filtered)
        for (int i_row = 0; i_row < a->ne[1]; i_row++) {
            if (!is_row_all_zeros(a, i_row, 1e-6)) {
                rows_to_copy.push_back(i_row);
            }
        }

        // get "n_nonzero_rows" for the output "diff_filtered"
        int n_nonzero_rows = rows_to_copy.size();
        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
        int n_embd = a->ne[0];
        GGML_ASSERT(n_nonzero_rows > 0);

        // diff_filtered: [n_embd, n_nonzero_rows]
        struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
            ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
        ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
        diff_filtered->data = malloc(ggml_nbytes(diff_filtered));

        // copy non-zero rows
        for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
            int src_row = rows_to_copy[dest_row];
            for (int i = 0; i < n_embd; i++) {
                float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
                ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
            }
        }

        //print_debug_tensor(diff_filtered);

        return diff_filtered;
    }

    // we don't implement a destructor because we want to reuse callback_data; we just want to free the tensors
    void reset() {
        for (auto ptr : v_pos) free(ptr->data);
        for (auto ptr : v_neg) free(ptr->data);
        for (auto ptr : v_diff_filtered) free(ptr->data);
        v_pos.clear();
        v_neg.clear();
        v_diff_filtered.clear();
        if (ctx_ggml) {
            ggml_free(ctx_ggml);
        }
        ctx_ggml = nullptr;
    }
};

/**
 * process_ctx is used to store the ggml context for pre/post processing the diff vectors
 * in short, input => v_diff and output => v_final
 */
struct train_context {
    ggml_context * ctx_ggml;
    int n_embd;
    int n_layers;

    /* pair of prompts to be used for generating final vector */
    std::vector<std::string> positive_entries;
    std::vector<std::string> negative_entries;

    // each element of the vector corresponds to one layer
    // NOTE: the last layer is discarded. therefore, we will have (n_layers - 1) elements here
    // NOTE (2): v_diff is transposed from v_diff_tmp
    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
    std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file

    // to easily re-alloc when concatenating v_diff, we temporarily store v_diff in a vector instead of a tensor
    // v_diff_tmp will get converted into v_diff later on
    std::vector<std::vector<uint8_t>> v_diff_tmp;

    train_context(int n_embd_, int n_layers_) {
        n_embd = n_embd_;
        n_layers = n_layers_;
        struct ggml_init_params params_ggml = {
            /*.mem_size   =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ctx_ggml = ggml_init(params_ggml);
        for (int il = 0; il < n_layers - 1; il++) {
            std::vector<uint8_t> empty;
            v_diff_tmp.push_back(empty);
            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of this malloc if possible
            v_final.push_back(t);
        }
    }

    // add new rows into the existing tensor in v_diff_tmp
    void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
        for (int il = 0; il < n_layers - 1; il++) {
            auto t = diff_filtered[il];
            auto & diff_tmp = v_diff_tmp[il];
            size_t curr_size = diff_tmp.size();
            diff_tmp.resize(curr_size + ggml_nbytes(t));
            memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
        }
    }

    // build the v_diff tensors from v_diff_tmp (v_diff needs to be transposed)
    // TODO @ngxson : maybe add an option NOT to transpose v_diff; will be useful for the "mean" method
    void build_v_diff() {
        printf("build_v_diff\n");
        for (int il = 0; il < n_layers - 1; il++) {
            auto & diff_tmp = v_diff_tmp[il];
            int n_elem = diff_tmp.size() / sizeof(float);
            GGML_ASSERT(n_elem % n_embd == 0);
            int n_rows = n_elem / n_embd;
            struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
            ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
            // copy data & transpose
            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
            float * arr = (float *) diff_tmp.data();
            for (int ir = 0; ir < n_rows; ++ir) {
                for (int ic = 0; ic < n_embd; ++ic) {
                    float f = arr[ir*n_embd + ic];
                    ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
                }
            }
            v_diff.push_back(diff);
            print_debug_tensor(diff);
            // free memory of diff_tmp
            diff_tmp.resize(0);
        }
    }

    ~train_context() {
        for (auto ptr : v_final) free(ptr->data);
        for (auto ptr : v_diff) free(ptr->data);
        // no need to free v_diff_tmp, since we didn't use malloc
        ggml_free(ctx_ggml);
    }
};

struct tokenized_prompt {
    std::vector<llama_token> tokens_pos;
    std::vector<llama_token> tokens_neg;
    size_t max_seq_len;

    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
        const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
        tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
        tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
        padding_seq(ctx, tokens_pos, max_seq_len);
        padding_seq(ctx, tokens_neg, max_seq_len);
    }

    void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
        // TODO: customize padding token
        std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
        llama_token pad_tok = pad_tokens.back();
        while (tokens.size() < len) {
            tokens.push_back(pad_tok);
        }
    }
};

//////////////////////////////////////////////////

template <typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;
    ss << val;
    return ss.str();
}

static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
    std::vector<std::string> output;
    std::ifstream file(path);
    if (!file.is_open()) {
        fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
        exit(1);
    }
    std::string line;
    while (std::getline(file, line)) {
        bool is_skip = skip_empty_lines && line.empty();
        if (!is_skip) {
            string_process_escapes(line);
            output.push_back(line);
        }
    }
    file.close();
    return output;
}

//////////////////////////////////////////////////

static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (callback_data *) user_data;
    static const char * l_out_name = "l_out";
    const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;

    if (ask) {
        return is_l_out;
    }

    if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
        return true;
    }

    // save the tensor to the current context
    cb_data->save_tensor_for_layer(t);
    return true;
}

static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_kv_cache_clear(ctx);
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }
    return true;
}

static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
    struct gguf_context * ctx = gguf_init_empty();

    const std::string arch = "controlvector";
    gguf_set_val_str(ctx, "general.architecture", arch.c_str());
    gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());

    for (size_t i = 0; i < v_ctrl.size(); ++i) {
        gguf_add_tensor(ctx, v_ctrl[i]);
        print_debug_tensor(v_ctrl[i]);
        printf("Added tensor: %s\n", v_ctrl[i]->name);
    }

    printf("%s: writing file...\n", __func__);
    gguf_write_to_file(ctx, fname.c_str(), false);
    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
    gguf_free(ctx);
}

/**
 * Load prompt files and completion file.
 * Then format each pair of prompt + completion to make an entry.
 */
static int prepare_entries(gpt_params & params, train_context & ctx_train) {
    // load prompts
    std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
    std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
    if (positive_prompts.size() != negative_prompts.size()) {
        fprintf(stderr, "number of positive and negative prompts must be equal\n");
        return 1;
    }
    if (positive_prompts.empty()) {
        fprintf(stderr, "must provide at least one prompt pair\n");
        return 1;
    }

    // create templated prompts
    std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
    auto format_template = [](std::string persona, std::string suffix) {
        // entry in positive/negative.txt must already be formatted, i.e. "[INST] Act as if you're extremely happy. [/INST]"
        return persona + " " + suffix;
    };
    for (size_t i = 0; i < positive_prompts.size(); ++i) {
        for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
            // TODO replicate the truncations done by the python implementation
            ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
            ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
        }
    }
    return 0;
}

int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        print_usage(argc, argv, params);
        return 1;
    }

    if (params.n_pca_iterations % params.n_pca_batch != 0) {
        fprintf(stderr, "PCA iterations must be a multiple of the PCA batch size\n");
        return 1;
    }


    callback_data cb_data;

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = cb_eval;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

    print_build_info();
    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model to get hparams
    llama_model * model;
    llama_context * ctx;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    // int n_ctx = llama_n_ctx(ctx);
    int n_layers = llama_n_layer(model);
    int n_embd = llama_n_embd(model);
    // get model hint param (a.k.a. model arch name)
    char model_hint[128];
    llama_model_meta_val_str(model, "general.architecture", model_hint, 128);

    // init train_context
    train_context ctx_train(n_embd, n_layers);

    // load and prepare entries for training
    prepare_entries(params, ctx_train);

    // we have to pretokenize everything because otherwise we don't know how much overhead to allocate for ctx_diffs_wrapped
    std::vector<tokenized_prompt> tokenized_prompts;
    size_t n_total_tokens = 0;
    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
        n_total_tokens += 2 * t.max_seq_len;
        tokenized_prompts.push_back(std::move(t));
    }

    std::cout << "n_total_tokens: " << n_total_tokens << std::endl;

    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        bool success = false;
        tokenized_prompt t = tokenized_prompts[i];
        cb_data.n_layers = n_layers;
        cb_data.n_tokens = t.max_seq_len;

        printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
            (int) i+1, (int) ctx_train.positive_entries.size(),
            tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
            tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
            (int) t.max_seq_len);

        cb_data.is_eval_pos = true;
        success = get_hidden_layers(ctx, t.tokens_pos);
        if (!success) break;

        cb_data.is_eval_pos = false;
        success = get_hidden_layers(ctx, t.tokens_neg);
        if (!success) break;

        // calculate diff and remove all zero rows
        auto v_diff_filtered = cb_data.calc_diff();

        // save & concat the filtered v_diff to ctx_train
        ctx_train.concat_diff_tmp(v_diff_filtered);

        // reset for the next iteration
        cb_data.reset();
    }

    // done with the model, we can now free it to regain some memory
    printf("Done evaluating prompts, unloading model...\n");
    llama_free(ctx);
    llama_free_model(model);

    // prepare ctx_train for PCA
    ctx_train.build_v_diff();

    // run PCA
    PCA::pca_params pca_params;
    pca_params.n_threads    = params.n_threads;
    pca_params.n_batch      = params.n_pca_batch;
    pca_params.n_iterations = params.n_pca_iterations;
    PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);

    // write output vectors to gguf
    export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);

    llama_backend_free();

    return 0;
}
examples/cvector-generator/negative.txt (new file, +1)
@@ -0,0 +1 @@
[INST] Act like a person who is extremely sad. [/INST]
examples/cvector-generator/pca.hpp (new file, +322)
@@ -0,0 +1,322 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include <cstdio>
#include <cmath>
#include <ctime>
#include <random>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>

#define DEBUG_POS 5

static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
    printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
    if (!with_data) return;
    printf("%s: %s[0] = [", __func__, t->name);
    for (size_t i = 0; i <= DEBUG_POS; i++) {
        printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
    }
    printf(" ... ]\n");
}

namespace PCA {

// input params for PCA computations
struct pca_params {
    int n_threads = 1;
    int n_batch = 20; // number of iterations to do in one batch; the larger the batch, the more memory is used
    int n_iterations = 1000;
    float tolerance = 1e-7;

    // for debugging
    int i_layer = 0;
    int n_layers = 0;
};

// result from each iteration
struct pca_result {
    struct ggml_tensor * calculated_square = NULL;
    std::vector<struct ggml_tensor *> eigenvectors;
    std::vector<float> distances;
};

struct pca_model {
    ggml_backend_t backend = NULL;
    ggml_backend_buffer_t buffer;
    struct ggml_context * ctx;      // context to compute graph on target device
    struct ggml_context * ctx_host; // host context to store results

    // tensors on target device
    struct ggml_tensor * dev_input;
    struct ggml_tensor * dev_square;
    struct ggml_tensor * dev_eigenvector;

    pca_model(struct ggml_tensor * t_input) {
        // TODO: enable GPU support when support for GGML_OP_SQRT is added
        // #ifdef GGML_USE_CUDA
        //     fprintf(stderr, "%s: using CUDA backend\n", __func__);
        //     backend = ggml_backend_cuda_init(0); // init device 0
        //     if (!backend) {
        //         fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
        //     }
        // #endif

        // #ifdef GGML_USE_METAL
        //     fprintf(stderr, "%s: using Metal backend\n", __func__);
        //     backend = ggml_backend_metal_init();
        //     if (!backend) {
        //         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
        //     }
        // #endif

        // if there is no GPU backend, fall back to the CPU backend
        if (!backend) {
            backend = ggml_backend_cpu_init();
        }

        const int num_tensors = 4;
        struct ggml_init_params params {
            /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ctx = ggml_init(params);

        auto n_samples = t_input->ne[0];
        auto n_embd    = t_input->ne[1];

        dev_input       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
        dev_square      = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        ggml_set_name(dev_input,       "dev_input");
        ggml_set_name(dev_square,      "dev_square");
        ggml_set_name(dev_eigenvector, "dev_eigenvector");
        buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
        ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));

        // initialize the eigenvector to a random normalized vector
        {
            std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
            std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
            std::uniform_real_distribution<float> distribution(0.0, 1.0);
            float sum_sqr = 0.0; // for normalizing random_vec
            for (size_t i = 0; i < random_vec.size(); ++i) {
                float f = distribution(generator);
                sum_sqr += f * f;
                random_vec[i] = f;
            }
            // normalize it
            float random_vec_norm = std::sqrt(sum_sqr);
            for (size_t i = 0; i < random_vec.size(); ++i) {
                random_vec[i] /= random_vec_norm;
            }
            ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
        }
    }

    ~pca_model() {
        ggml_free(ctx);
        ggml_backend_buffer_free(buffer);
        ggml_backend_free(backend);
    }
};

static struct ggml_cgraph * build_graph_piter(
        const struct pca_params & params,
        const pca_model & model,
        bool calc_square = false) {
    GGML_ASSERT(params.n_batch > 0);
    // TODO: buf_size must be able to scale with params.n_batch
    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
    static std::vector<uint8_t> buf(buf_size);

    struct ggml_init_params params0 = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
    };
    // create a temporary context to build the graph
    struct ggml_context * ctx0 = ggml_init(params0);
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    // turn v_diff_original into a square matrix if needed
    struct ggml_tensor * tmp_square;
    if (calc_square) {
        tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
        ggml_set_name(tmp_square, "tmp_square");
    }

    struct ggml_tensor * b_tensor;
    struct ggml_tensor * distance;
    struct ggml_tensor * old_eigen    = model.dev_eigenvector;
    struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;

    for (int i = 0; i < params.n_batch; ++i) {
        // b_tensor = square * eigenvector^T
        b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
        ggml_set_name(b_tensor, "b_tensor");

        // normalize
        b_tensor = ggml_div_inplace(ctx0,
            b_tensor,
            ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
        );
        ggml_format_name(b_tensor, "b_tensor_norm_%d", i);

        // calculate distance(new eigenvector - old eigenvector)
        // we don't use ggml_sub because it may not be implemented on the GPU backend
        struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
        distance = ggml_sqrt_inplace(ctx0,
            ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
        ggml_format_name(distance, "distance_%d", i);

        old_eigen = b_tensor;

        // build operation nodes
        ggml_build_forward_expand(gf, distance);
    }

    // delete the temporary context used to build the graph
    ggml_free(ctx0);
    return gf;
}

static ggml_status compute_piter(
        const struct pca_params & params,
        const pca_model & model,
        struct ggml_cgraph * gf,
        ggml_gallocr_t allocr,
        struct pca_result & result) {
    // allocate tensors
    ggml_gallocr_alloc_graph(allocr, gf);

    if (ggml_backend_is_cpu(model.backend)) {
        ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
    }

    // TODO: enable GPU support when support for GGML_OP_SQRT is added
    //#ifdef GGML_USE_METAL
    //    if (ggml_backend_is_metal(model.backend)) {
    //        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
    //    }
    //#endif

    ggml_status res = ggml_backend_graph_compute(model.backend, gf);
    if (res == GGML_STATUS_SUCCESS) {
        auto extract_i = [](std::string prefix, std::string str) -> int {
            int i = -1;
            if (str.rfind(prefix, 0) == 0) {
                sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
            }
            return i;
        };
        result.calculated_square = NULL;
        result.eigenvectors.clear();
        result.distances.clear();
        result.eigenvectors.resize(params.n_batch);
        result.distances.resize(params.n_batch);
        // get output nodes
        for (int i = 0; i < gf->n_nodes; ++i) {
            auto node = gf->nodes[i];
            int iter = -1;
            // find b_tensor (without copying data from the device)
            if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
                result.eigenvectors[iter] = node;
            }
            // find distances, then copy data from the device
            if ((iter = extract_i("distance_", node->name)) > -1) {
                float d;
                ggml_backend_tensor_get(node, &d, 0, sizeof(float));
                result.distances[iter] = d;
                // std::cout << node->name << " = " << d << "\n";
            }
            // find tmp_square if it exists (without copying data from the device)
            if (std::string(node->name) == "tmp_square") {
                result.calculated_square = node;
            }
        }
    }
    return res;
}

static void power_iteration(
        const struct pca_params & params,
        struct ggml_tensor * input,  // shape of input: [n_samples, n_embd]
        struct ggml_tensor * output) {
    //printf("in power iteration\n");
    struct pca_model model(input);

    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
    struct pca_result result;
    struct ggml_tensor * last_eigenvector = NULL;

    int n_iters = params.n_iterations / params.n_batch; // larger batch, fewer iterations
    for (int iter = 0; iter < n_iters; ++iter) {
        bool calc_square = (iter == 0); // we only need to calculate the square for the first iteration
        struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
        // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
        compute_piter(params, model, gf, allocr, result);

        for (size_t k = 0; k < result.distances.size(); ++k) {
            last_eigenvector = result.eigenvectors[k];
            if (result.distances[k] < params.tolerance) {
                break; // done
            }
        }

        if (calc_square) {
            // copy and store the square matrix if needed
            GGML_ASSERT(result.calculated_square != NULL);
            ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
        }

        {
            // copy the last eigenvector and store it as input for the next iteration
            GGML_ASSERT(last_eigenvector != NULL);
            ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
        }

        printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
            __func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
    }

    // get output tensor
    GGML_ASSERT(last_eigenvector);
    ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
    //print_debug_tensor(output);
    ggml_gallocr_free(allocr);
}

static void run_pca(
        struct pca_params & params,
        const std::vector<struct ggml_tensor *> & v_input,  // shape of v_input[0]: [n_samples, n_embd]
        const std::vector<struct ggml_tensor *> & v_output) {
    printf("%s: Running PCA...\n", __func__);
    for (size_t il = 0; il < v_input.size(); ++il) {

        // prepare output vector
        struct ggml_tensor * ctrl_out = v_output[il];
        ggml_format_name(ctrl_out, "direction.%ld", il+1);

        // run power_iteration
        params.i_layer  = il;
        params.n_layers = v_input.size();
        power_iteration(params, v_input[il], ctrl_out);
        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
    }
}

}
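For reference, `build_graph_piter` above is a standard power iteration: with D the per-layer diff matrix uploaded to `dev_input` (one row per sample) and S its square computed once as `tmp_square`, each graph node performs one normalized step, stopping once the distance between successive eigenvector estimates drops below `tolerance`. A sketch of the math (not part of the source):

```latex
% one power-iteration step, as built n_batch times per graph:
%   S = D^{\top} D  -- the n_embd x n_embd square matrix (tmp_square / dev_square)
b_{i+1} = \frac{S\,b_i}{\lVert S\,b_i \rVert}
% converged when \lVert b_{i+1} - b_i \rVert < \text{tolerance};
% the final b is the dominant eigenvector, written out as direction.<layer>
```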
examples/cvector-generator/positive.txt (new file, +1)
@@ -0,0 +1 @@
[INST] Act like a person who is extremely happy. [/INST]