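# This script converts the vision tower of a Gemma 3 checkpoint (HF safetensors format)
# into a standalone GGUF file ("mmproj") containing the SigLIP encoder, the multimodal
# projector, and the "clip.*" metadata required to load it.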
import gguf
import argparse
import logging
import sys
import torch
import json
import os
import numpy as np
from typing import cast, ContextManager, Any, Iterator
from pathlib import Path
from torch import Tensor

logger = logging.getLogger("gemma3-mmproj")


# (copied from convert_hf_to_gguf.py)
# tree of lazy tensors
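# A lazy tensor carries only a meta tensor (dtype and shape); the real data is read and
# converted only when the GGUF writer finally needs it, keeping peak memory usage low.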
class LazyTorchTensor(gguf.LazyBase):
    _tensor_type = torch.Tensor
    # to keep the type-checker happy
    dtype: torch.dtype
    shape: torch.Size

    # only used when converting a torch.Tensor to a np.ndarray
    _dtype_map: dict[torch.dtype, type] = {
        torch.float16: np.float16,
        torch.float32: np.float32,
    }

    # used for safetensors slices
    # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046
    # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734
    _dtype_str_map: dict[str, torch.dtype] = {
        "F64": torch.float64,
        "F32": torch.float32,
        "BF16": torch.bfloat16,
        "F16": torch.float16,
        # "U64": torch.uint64,
        "I64": torch.int64,
        # "U32": torch.uint32,
        "I32": torch.int32,
        # "U16": torch.uint16,
        "I16": torch.int16,
        "U8": torch.uint8,
        "I8": torch.int8,
        "BOOL": torch.bool,
        "F8_E4M3": torch.float8_e4m3fn,
        "F8_E5M2": torch.float8_e5m2,
    }

    def numpy(self) -> gguf.LazyNumpyTensor:
        dtype = self._dtype_map[self.dtype]
        return gguf.LazyNumpyTensor(
            meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
            args=(self,),
            func=(lambda s: s.numpy())
        )

    @classmethod
    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor:
        return torch.empty(size=shape, dtype=dtype, device="meta")

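    # Wrap a safetensors slice in a lazy tensor; the underlying data is only read from
    # disk when the slice is materialized via `s[:]`.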
    @classmethod
    def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
        dtype = cls._dtype_str_map[st_slice.get_dtype()]
        shape: tuple[int, ...] = tuple(st_slice.get_shape())
        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
        return cast(torch.Tensor, lazy)

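    # Route torch operations on lazy tensors through _wrap_fn() so they are also applied
    # lazily; torch.Tensor.numpy is special-cased to return a lazy numpy tensor.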
    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        del types  # unused

        if kwargs is None:
            kwargs = {}

        if func is torch.Tensor.numpy:
            return args[0].numpy()

        return cls._wrap_fn(func)(*args, **kwargs)


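# Collects the SigLIP vision encoder and the multimodal projector tensors from a Gemma 3
# checkpoint and writes them, together with the "clip.*" metadata, to a GGUF file.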
class Gemma3VisionTower:
    hparams: dict
    gguf_writer: gguf.GGUFWriter
    fname_out: Path
    ftype: gguf.LlamaFileType

    @staticmethod
    def load_hparams(dir_model: Path):
        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
        part_names: list[str] = []
        for filename in os.listdir(dir_model):
            if filename.startswith(prefix) and filename.endswith(suffix):
                part_names.append(filename)
        part_names.sort()
        return part_names

    def __init__(self,
                 dir_model: Path,
                 fname_out: Path,
                 ftype: gguf.LlamaFileType,
                 is_big_endian: bool,):
        hparams = Gemma3VisionTower.load_hparams(dir_model)
        self.hparams = hparams
        self.fname_out = fname_out
        self.ftype = ftype
        endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess)

        text_config = hparams["text_config"]
        vision_config = hparams["vision_config"]

        assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration"
        assert text_config is not None
        assert vision_config is not None

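        # metadata consumed by the GGUF clip/mmproj loader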
        self.gguf_writer.add_string ("clip.projector_type", "gemma3")
        self.gguf_writer.add_bool   ("clip.has_text_encoder", False)
        self.gguf_writer.add_bool   ("clip.has_vision_encoder", True)
        self.gguf_writer.add_bool   ("clip.has_llava_projector", False) # legacy
        self.gguf_writer.add_uint32 ("clip.vision.image_size", vision_config["image_size"])
        self.gguf_writer.add_uint32 ("clip.vision.patch_size", vision_config["patch_size"])
        self.gguf_writer.add_uint32 ("clip.vision.embedding_length", vision_config["hidden_size"])
        self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length", vision_config["intermediate_size"])
        self.gguf_writer.add_uint32 ("clip.vision.projection_dim", text_config["hidden_size"])
        self.gguf_writer.add_uint32 ("clip.vision.block_count", vision_config["num_hidden_layers"])
        self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"])
        self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6))
        # default values taken from HF transformers code
        self.gguf_writer.add_array  ("clip.vision.image_mean", [0.5, 0.5, 0.5])
        self.gguf_writer.add_array  ("clip.vision.image_std",  [0.5, 0.5, 0.5])
        self.gguf_writer.add_bool   ("clip.use_gelu", True)

        # load tensors
        for name, data_torch in self.get_tensors(dir_model):
            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)
            self.add_tensor(name, data_torch)

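    # iterate over all tensors in the *.safetensors shards, yielding lazy tensors so that
    # nothing is materialized in memory until it is actually written out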
    def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]:
        part_names = Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors")
        tensor_names_from_parts: set[str] = set()
        for part_name in part_names:
            logger.info(f"gguf: loading model part '{part_name}'")
            from safetensors import safe_open
            ctx = cast(ContextManager[Any], safe_open(dir_model / part_name, framework="pt", device="cpu"))
            with ctx as model_part:
                tensor_names_from_parts.update(model_part.keys())

                for name in model_part.keys():
                    data = model_part.get_slice(name)
                    data = LazyTorchTensor.from_safetensors_slice(data)
                    yield name, data

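    # rename a HF tensor to its GGUF name, pick the output type, and hand it to the writer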
    def add_tensor(self, name: str, data_torch: Tensor):
        is_1d = len(data_torch.shape) == 1
        is_embd = ".embeddings." in name
        old_dtype = data_torch.dtype
        can_quantize = not is_1d and not is_embd
        data_qtype = gguf.GGMLQuantizationType.F32

        # this is to support old checkpoints
        # TODO: remove this when we have the final model
        name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.")
        name = name.replace("multimodal_projector.", "multi_modal_projector.")

        # filter only vision tensors
        if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."):
            return
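        # the renames below map HF names onto the GGUF clip naming scheme, e.g.
        #   vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight -> v.blk.0.attn_q.weight
        #   multi_modal_projector.mm_input_projection_weight                   -> mm.input_projection.weight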
        # prefix
        name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.")
        name = name.replace("vision_tower.vision_model.", "v.")
        # projector and input embd
        name = name.replace(".embeddings.patch_embedding.", ".patch_embd.")
        name = name.replace(".embeddings.position_embedding.", ".position_embd.")
        name = name.replace(
            "multi_modal_projector.mm_input_projection_weight",
            "mm.input_projection.weight"
        )
        name = name.replace(
            "multi_modal_projector.mm_soft_emb_norm.weight",
            "mm.soft_emb_norm.weight"
        )
        name = name.replace("post_layernorm.", "post_ln.")
        # each block
        name = name.replace(".self_attn.k_proj.", ".attn_k.")
        name = name.replace(".self_attn.v_proj.", ".attn_v.")
        name = name.replace(".self_attn.q_proj.", ".attn_q.")
        name = name.replace(".self_attn.out_proj.", ".attn_out.")
        name = name.replace(".layer_norm1.", ".ln1.")
        name = name.replace(".layer_norm2.", ".ln2.")
        name = name.replace(".mlp.fc1.", ".ffn_down.")
        name = name.replace(".mlp.fc2.", ".ffn_up.")

        if can_quantize:
            if self.ftype == gguf.LlamaFileType.ALL_F32:
                data_qtype = gguf.GGMLQuantizationType.F32
            elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
                data_qtype = gguf.GGMLQuantizationType.F16
            elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                data_qtype = gguf.GGMLQuantizationType.BF16
            elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                data_qtype = gguf.GGMLQuantizationType.Q8_0
            else:
                raise ValueError(f"Unsupported file type: {self.ftype}")

        # correct the norm value; only "soft_emb_norm" needs this correction, as it is part of the Gemma projector
        # the other norm weights belong to the SigLIP model and are already correct
        # ref code: Gemma3RMSNorm
        if "soft_emb_norm.weight" in name:
            logger.info(f"Correcting norm value for '{name}'")
            data_torch = data_torch + 1

        data = data_torch.numpy()

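        # quantize to the requested type; if it fails (for example when the row size is not a
        # multiple of the quantization block size), fall back to F16 for this tensor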
        try:
            data = gguf.quants.quantize(data, data_qtype)
        except Exception as e:
            logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16")
            data_qtype = gguf.GGMLQuantizationType.F16
            data = gguf.quants.quantize(data, data_qtype)

        # reverse shape to make it similar to the internal ggml dimension order
        shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}"
        logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

        self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)

    def write(self):
        self.gguf_writer.write_header_to_file(path=self.fname_out)
        self.gguf_writer.write_kv_data_to_file()
        self.gguf_writer.write_tensors_to_file(progress=True)
        self.gguf_writer.close()


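# Example invocation (the script file name depends on where it lives in the repo):
#   python gemma3_convert_encoder_to_gguf.py --outtype f16 --outfile mmproj.gguf /path/to/gemma-3-model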
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert Gemma 3 vision tower safetensors to GGUF format",)
    parser.add_argument(
        "--outfile", type=Path, default="mmproj.gguf",
        help="path to write to",
    )
    parser.add_argument(
        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16",
        help="output format",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
        help="model is executed on big endian machine",
    )
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file",
        nargs="?",
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="increase output verbosity",
    )

    args = parser.parse_args()
    if args.model is None:
        parser.error("the following arguments are required: model")
    return args


def main() -> None:
    args = parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    dir_model = args.model

    if not dir_model.is_dir():
        logger.error(f'Error: {args.model} is not a directory')
        sys.exit(1)

    ftype_map: dict[str, gguf.LlamaFileType] = {
        "f32": gguf.LlamaFileType.ALL_F32,
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
    }

    logger.info(f"Loading model: {dir_model.name}")

    with torch.inference_mode():
        gemma3_vision_tower = Gemma3VisionTower(
            dir_model=dir_model,
            fname_out=args.outfile,
            ftype=ftype_map[args.outtype],
            is_big_endian=args.bigendian,
        )
        gemma3_vision_tower.write()


if __name__ == '__main__':
    main()