mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-04-15 19:16:09 +00:00

* wip llava2 * migrated gemma3 to llava2 * add timings * correct pre/postfix * fix missing include * fix compilation unused var warn * update llava2_tokenize * change name llava2 --> mtmd * improve api * refine helpers * Update examples/llava/mtmd.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
147 lines
4.6 KiB
C++
147 lines
4.6 KiB
C++
#ifndef MTMD_H
|
|
#define MTMD_H
|
|
|
|
#include "ggml.h"
|
|
#include "llama.h"
|
|
#include "clip.h"
|
|
|
|
#include <vector>
|
|
#include <cinttypes>
|
|
#include <memory>
|
|
|
|
#ifdef LLAMA_SHARED
|
|
# if defined(_WIN32) && !defined(__MINGW32__)
|
|
# ifdef LLAMA_BUILD
|
|
# define MTMD_API __declspec(dllexport)
|
|
# else
|
|
# define MTMD_API __declspec(dllimport)
|
|
# endif
|
|
# else
|
|
# define MTMD_API __attribute__ ((visibility ("default")))
|
|
# endif
|
|
#else
|
|
# define MTMD_API
|
|
#endif
|
|
|
|
#ifdef __cplusplus
|
|
|
|
enum mtmd_input_chunk_type {
|
|
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
|
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
|
};
|
|
|
|
struct mtmd_context;
|
|
struct mtmd_image_tokens;
|
|
|
|
// represents raw image data, layout is RGBRGBRGB...
|
|
// length of data must be nx * ny * 3
|
|
struct mtmd_bitmap {
|
|
uint32_t nx;
|
|
uint32_t ny;
|
|
std::vector<unsigned char> data;
|
|
};
|
|
|
|
struct mtmd_input_chunk {
|
|
mtmd_input_chunk_type type;
|
|
std::vector<llama_token> tokens_text;
|
|
mtmd_image_tokens * tokens_image = nullptr;
|
|
};
|
|
|
|
using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
|
|
|
|
struct mtmd_context_params {
|
|
bool use_gpu = true;
|
|
bool print_timings = true;
|
|
int n_threads = 4;
|
|
enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
|
|
const char * image_marker = "<__image__>";
|
|
};
|
|
|
|
struct mtmd_input_text {
|
|
std::string text;
|
|
bool add_special;
|
|
bool parse_special;
|
|
};
|
|
|
|
// initialize the mtmd context
|
|
// return nullptr on failure
|
|
MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
|
|
const llama_model * text_model,
|
|
const mtmd_context_params ctx_params);
|
|
|
|
MTMD_API void mtmd_free(mtmd_context * ctx);
|
|
|
|
// tokenize an input text prompt and an image
|
|
// the prompt must have the input image marker (default: "<__image__>") in it
|
|
// the marker will be replaced with the image tokens
|
|
// for example:
|
|
// "here is an image: <__image__>\ndescribe it in detail."
|
|
// this will gives 3 chunks:
|
|
// 1. "here is an image: <start_of_image>"
|
|
// 2. (image tokens)
|
|
// 3. "<end_of_image>\ndescribe it in detail."
|
|
// number of bitmaps must be equal to the number of image markers in the prompt
|
|
// this function is thread-safe (shared ctx)
|
|
MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
|
|
const mtmd_input_text & text,
|
|
const std::vector<mtmd_bitmap> & bitmaps);
|
|
|
|
// free image chunk data
|
|
MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
|
|
|
|
// returns 0 on success
|
|
MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
|
|
const mtmd_image_tokens * image_tokens);
|
|
|
|
// get output embeddings from the last encode pass
|
|
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
|
|
|
|
//
|
|
// helper functions (can be implemented based on other functions)
|
|
//
|
|
|
|
// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
|
|
MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks);
|
|
|
|
// helper function that automatically:
|
|
// 1. run llama_decode() on text chunks
|
|
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
|
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
|
|
// otherwise, returns 0 on success
|
|
MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
|
|
llama_context * lctx,
|
|
mtmd_input_chunks * chunks,
|
|
llama_pos pos0,
|
|
llama_seq_id seq_id,
|
|
int32_t n_batch);
|
|
|
|
// helper function to construct a mtmd_bitmap from a file
|
|
// returns 0 on success
|
|
// this function is thread-safe
|
|
MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
|
|
|
|
// helper function to construct a mtmd_bitmap from a buffer
|
|
// the buffer must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.)
|
|
// returns 0 on success
|
|
// this function is thread-safe
|
|
MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output);
|
|
|
|
// convenient unique_ptr wrappers
|
|
struct mtmd_context_deleter {
|
|
void operator()(mtmd_context * val) { mtmd_free(val); }
|
|
};
|
|
using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
|
|
|
|
struct mtmd_input_chunks_deleter {
|
|
void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
|
|
};
|
|
using mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
|
|
|
|
#else
|
|
|
|
static_assert(false && "C header is not yet supported by this library");
|
|
|
|
#endif
|
|
|
|
#endif
|