From 68ff663a04ed92044a9937bcae353e9d9733f9cd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 15 Feb 2025 16:40:57 +0200 Subject: [PATCH] repo : update links to new url (#11886) * repo : update links to new url ggml-ci * cont : more urls ggml-ci --- .devops/llama-cpp-cuda.srpm.spec | 4 +- .devops/llama-cpp.srpm.spec | 4 +- .devops/nix/package.nix | 6 +-- .devops/rocm.Dockerfile | 2 +- .github/ISSUE_TEMPLATE/020-enhancement.yml | 6 +-- .github/ISSUE_TEMPLATE/030-research.yml | 2 +- .github/ISSUE_TEMPLATE/040-refactor.yml | 4 +- .github/ISSUE_TEMPLATE/config.yml | 6 +-- .github/pull_request_template.md | 2 +- .github/workflows/bench.yml.disabled | 12 +---- .github/workflows/build.yml | 2 +- .github/workflows/labeler.yml | 2 +- CONTRIBUTING.md | 8 +-- Makefile | 8 +-- README.md | 52 +++++++++---------- SECURITY.md | 2 +- ci/README.md | 4 +- common/arg.cpp | 2 +- convert_hf_to_gguf.py | 6 +-- convert_hf_to_gguf_update.py | 4 +- convert_lora_to_gguf.py | 4 +- docs/android.md | 2 +- docs/backend/OPENCL.md | 4 +- docs/backend/SYCL.md | 6 +-- docs/build.md | 2 +- docs/cuda-fedora.md | 2 +- docs/development/HOWTO-add-model.md | 20 +++---- docs/docker.md | 32 ++++++------ docs/install.md | 2 +- examples/cvector-generator/README.md | 6 +-- examples/imatrix/README.md | 2 +- examples/imatrix/imatrix.cpp | 2 +- .../llama/src/main/cpp/CMakeLists.txt | 2 +- examples/llama.swiftui/README.md | 4 +- examples/llama.vim | 2 +- examples/llava/README-minicpmo2.6.md | 2 +- examples/llava/README-minicpmv2.5.md | 2 +- examples/lookahead/README.md | 2 +- examples/lookup/README.md | 4 +- examples/main/README.md | 4 +- examples/passkey/README.md | 4 +- .../pydantic_models_to_grammar_examples.py | 2 +- examples/quantize/README.md | 30 +++++------ examples/retrieval/README.md | 2 +- examples/server/CMakeLists.txt | 2 +- examples/server/README.md | 18 +++---- examples/server/server.cpp | 2 +- examples/server/utils.hpp | 4 +- examples/simple-cmake-pkg/README.md | 4 +- examples/speculative/README.md | 6 +-- flake.nix | 8 +-- ggml/include/ggml-cpu.h | 2 +- ggml/include/ggml-metal.h | 2 +- ggml/src/ggml-cpu/ggml-cpu.c | 4 +- ggml/src/ggml-metal/ggml-metal.m | 2 +- ggml/src/ggml-metal/ggml-metal.metal | 4 +- gguf-py/README.md | 16 +++--- gguf-py/gguf/scripts/gguf_dump.py | 2 +- gguf-py/gguf/utility.py | 2 +- gguf-py/gguf/vocab.py | 2 +- gguf-py/pyproject.toml | 2 +- grammars/README.md | 8 +-- include/llama.h | 16 +++--- pyproject.toml | 2 +- scripts/check-requirements.sh | 2 +- src/unicode.cpp | 2 +- 66 files changed, 192 insertions(+), 202 deletions(-) diff --git a/.devops/llama-cpp-cuda.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec index 7425d3a9d..3bbf4a4de 100644 --- a/.devops/llama-cpp-cuda.srpm.spec +++ b/.devops/llama-cpp-cuda.srpm.spec @@ -17,10 +17,10 @@ Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) License: MIT -Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz +Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz BuildRequires: coreutils make gcc-c++ git cuda-toolkit Requires: cuda-toolkit -URL: https://github.com/ggerganov/llama.cpp +URL: https://github.com/ggml-org/llama.cpp %define debug_package %{nil} %define source_date_epoch_from_changelog 0 diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec index 4d5560089..45902dcf8 100644 --- a/.devops/llama-cpp.srpm.spec +++ b/.devops/llama-cpp.srpm.spec @@ -18,10 +18,10 @@ Version: %( date "+%%Y%%m%%d" ) Release: 
1%{?dist} Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) License: MIT -Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz +Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz BuildRequires: coreutils make gcc-c++ git libstdc++-devel Requires: libstdc++ -URL: https://github.com/ggerganov/llama.cpp +URL: https://github.com/ggml-org/llama.cpp %define debug_package %{nil} %define source_date_epoch_from_changelog 0 diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 043c4364b..6e8050a49 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -133,12 +133,12 @@ effectiveStdenv.mkDerivation (finalAttrs: { --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";" ''; - # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015, + # With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015, # `default.metallib` may be compiled with Metal compiler from XCode # and we need to escape sandbox on MacOS to access Metal compiler. # `xcrun` is used find the path of the Metal compiler, which is varible # and not on $PATH - # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion + # see https://github.com/ggml-org/llama.cpp/pull/6118 for discussion __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders; nativeBuildInputs = @@ -220,7 +220,7 @@ effectiveStdenv.mkDerivation (finalAttrs: { broken = (useMetalKit && !effectiveStdenv.isDarwin); description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; - homepage = "https://github.com/ggerganov/llama.cpp/"; + homepage = "https://github.com/ggml-org/llama.cpp/"; license = lib.licenses.mit; # Accommodates `nix run` and `lib.getExe` diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index a8088ea00..48e7e6aaa 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -11,7 +11,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. # gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported # gfx906 is deprecated diff --git a/.github/ISSUE_TEMPLATE/020-enhancement.yml b/.github/ISSUE_TEMPLATE/020-enhancement.yml index 02dd4f575..cee1446f5 100644 --- a/.github/ISSUE_TEMPLATE/020-enhancement.yml +++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml @@ -6,7 +6,7 @@ body: - type: markdown attributes: value: | - [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas) + [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas) - type: checkboxes id: prerequisites @@ -16,11 +16,11 @@ body: options: - label: I am running the latest code. Mention the version if possible as well. 
required: true - - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md). + - label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md). required: true - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed). required: true - - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share. + - label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share. required: true - type: textarea diff --git a/.github/ISSUE_TEMPLATE/030-research.yml b/.github/ISSUE_TEMPLATE/030-research.yml index 18975dbbf..e774550d5 100644 --- a/.github/ISSUE_TEMPLATE/030-research.yml +++ b/.github/ISSUE_TEMPLATE/030-research.yml @@ -6,7 +6,7 @@ body: - type: markdown attributes: value: | - Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22) + Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22) - type: checkboxes id: research-stage diff --git a/.github/ISSUE_TEMPLATE/040-refactor.yml b/.github/ISSUE_TEMPLATE/040-refactor.yml index b6e6ab36d..2fe94e26c 100644 --- a/.github/ISSUE_TEMPLATE/040-refactor.yml +++ b/.github/ISSUE_TEMPLATE/040-refactor.yml @@ -6,8 +6,8 @@ body: - type: markdown attributes: value: | - Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. - Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. + Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. + Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. - type: textarea id: background-description diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index eb8c4b472..0d246533c 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,11 +1,11 @@ blank_issues_enabled: true contact_links: - name: Got an idea? - url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas + url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas about: Pop it there. It may then become an enhancement ticket. - name: Got a question? - url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a + url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a about: Ask a question there! - name: Want to contribute? 
- url: https://github.com/ggerganov/llama.cpp/wiki/contribute + url: https://github.com/ggml-org/llama.cpp/wiki/contribute about: Head to the contribution guide page of the wiki for areas you can help with diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index d9f5bdc23..d0bdd73c4 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1 +1 @@ -*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR* +*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR* diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled index 1c8787ef7..0370c8943 100644 --- a/.github/workflows/bench.yml.disabled +++ b/.github/workflows/bench.yml.disabled @@ -1,5 +1,5 @@ # TODO: there have been some issues with the workflow, so disabling for now -# https://github.com/ggerganov/llama.cpp/issues/7893 +# https://github.com/ggml-org/llama.cpp/issues/7893 # # Benchmark name: Benchmark @@ -57,17 +57,7 @@ jobs: if: | inputs.gpu-series == 'Standard_NC4as_T4_v3' - || ( - github.event_name == 'schedule' - && github.ref_name == 'master' - && github.repository_owner == 'ggerganov' - ) || github.event_name == 'pull_request_target' - || ( - github.event_name == 'push' - && github.event.ref == 'refs/heads/master' - && github.repository_owner == 'ggerganov' - ) steps: - name: Clone id: checkout diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e632ddd82..e6893ddd3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -129,7 +129,7 @@ jobs: run: | sysctl -a # Metal is disabled due to intermittent failures with Github runners not having a GPU: - # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 + # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 cmake -B build \ -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 368dbdbe5..0b0f300aa 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -11,7 +11,7 @@ jobs: steps: - uses: actions/checkout@v4 with: - repository: "ggerganov/llama.cpp" + repository: "ggml-org/llama.cpp" - uses: actions/labeler@v5 with: configuration-path: '.github/labeler.yml' diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8d411982b..9d4e5a56f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,7 +12,7 @@ - Squash-merge PRs - Use the following format for the squashed commit title: ` : (#)`. For example: `utils : fix typo in utils.py (#1234)` -- Optionally pick a `` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules +- Optionally pick a `` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules - Consider adding yourself to [CODEOWNERS](CODEOWNERS) # Coding guidelines @@ -40,14 +40,14 @@ - Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code - For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines) - Tensors store data in row-major order. 
We refer to dimension 0 as columns, 1 as rows, 2 as matrices -- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ +- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ ![matmul](media/matmul.png) # Naming guidelines - Use `snake_case` for function, variable and type names -- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963) +- Naming usually optimizes for longest common prefix (see https://github.com/ggml-org/ggml/pull/302#discussion_r1243240963) ```cpp // not OK @@ -122,4 +122,4 @@ The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects: -https://github.com/ggerganov/llama.cpp/projects +https://github.com/ggml-org/llama.cpp/projects diff --git a/Makefile b/Makefile index dc3de3cb1..662194086 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ ifndef LLAMA_MAKEFILE -$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) +$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) endif # Define the default target now so that it is always the first target @@ -463,7 +463,7 @@ endif ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 - # https://github.com/ggerganov/llama.cpp/issues/2922 + # https://github.com/ggml-org/llama.cpp/issues/2922 MK_CFLAGS += -Xassembler -muse-unaligned-vector-move MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move @@ -1078,8 +1078,8 @@ endif ifdef REMOVE_WARNING $(info !!! REMOVAL WARNING !!!) 
$(info The following LLAMA_ options have been removed and are no longer supported) -$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418)) -$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418)) +$(info - LLAMA_DISABLE_LOGS (https://github.com/ggml-org/llama.cpp/pull/9418)) +$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggml-org/llama.cpp/pull/9418)) $(info ) endif diff --git a/README.md b/README.md index 7629647d7..1764cad81 100644 --- a/README.md +++ b/README.md @@ -3,26 +3,26 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) -[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) +[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml) -[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) +[Roadmap](https://github.com/users/ggml-org/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ ## Recent API changes -- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289) -- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291) +- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289) +- [Changelog for `llama-server` REST API](https://github.com/ggml-org/llama.cpp/issues/9291) ## Hot topics -- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggerganov/llama.cpp/pull/11427 +- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427 - **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode -- Universal tool call support in `llama-server`: https://github.com/ggerganov/llama.cpp/pull/9639 +- Universal tool call support in `llama-server`: https://github.com/ggml-org/llama.cpp/pull/9639 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim -- Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123 -- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669 -- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) +- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123 +- Hugging Face Inference Endpoints now support GGUF out of the box! 
https://github.com/ggml-org/llama.cpp/discussions/9669 +- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) ---- @@ -39,7 +39,7 @@ range of hardware - locally and in the cloud. - Vulkan and SYCL backend support - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity -The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library. +The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggml-org/ggml) library.
Models @@ -59,23 +59,23 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) -- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423) +- [X] [BERT](https://github.com/ggml-org/llama.cpp/pull/5423) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) -- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) +- [X] [Starcoder models](https://github.com/ggml-org/llama.cpp/pull/3187) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) -- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) -- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) +- [X] [MPT](https://github.com/ggml-org/llama.cpp/pull/3417) +- [X] [Bloom](https://github.com/ggml-org/llama.cpp/pull/3553) - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi) - [X] [StableLM models](https://huggingface.co/stabilityai) - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek) - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) -- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557) +- [x] [PLaMo-13B](https://github.com/ggml-org/llama.cpp/pull/3557) - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi) -- [x] [PhiMoE](https://github.com/ggerganov/llama.cpp/pull/11003) +- [x] [PhiMoE](https://github.com/ggml-org/llama.cpp/pull/11003) - [x] [GPT-2](https://huggingface.co/gpt2) -- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118) +- [x] [Orion 14B](https://github.com/ggml-org/llama.cpp/pull/5118) - [x] [InternLM2](https://huggingface.co/models?search=internlm2) - [x] [CodeShell](https://github.com/WisdomShell/codeshell) - [x] [Gemma](https://ai.google.dev/gemma) @@ -146,7 +146,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) - Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama) -- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) +- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggml-org/llama.cpp/pull/6326) - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) @@ -245,7 +245,7 @@ The project also includes many example programs and tools using the `llama` libr - Clone this repository and build locally, see [how to build](docs/build.md) - On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md) - Use a Docker image, see [documentation for Docker](docs/docker.md) -- Download pre-built 
binaries from [releases](https://github.com/ggerganov/llama.cpp/releases) +- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases) ## Obtaining and quantizing models @@ -258,14 +258,14 @@ You can either manually download the GGUF file or directly use any `llama.cpp`-c After downloading a model, use the CLI tools to run it locally - see below. -`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo. +`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo. The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`: - Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes -- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123) -- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268) -- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669) +- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123) +- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268) +- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669) To learn more about model quantization, [read this documentation](examples/quantize/README.md) @@ -488,9 +488,9 @@ To learn more about model quantization, [read this documentation](examples/quant - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch - Collaborators will be invited based on contributions - Any help with managing issues, PRs and projects is very appreciated! 
-- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions +- See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information -- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) +- Make sure to read this: [Inference at the edge](https://github.com/ggml-org/llama.cpp/discussions/205) - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532) ## Other documentation @@ -505,7 +505,7 @@ To learn more about model quantization, [read this documentation](examples/quant - [Running on Docker](docs/docker.md) - [Build on Android](docs/android.md) - [Performance troubleshooting](docs/development/token_generation_performance_tips.md) -- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) +- [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks) #### Seminal papers and background on the models diff --git a/SECURITY.md b/SECURITY.md index f4322c6ee..6a1bb6c32 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -62,6 +62,6 @@ Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp- However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released. -Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new). +Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new). A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure. diff --git a/ci/README.md b/ci/README.md index 406470519..8245c9df6 100644 --- a/ci/README.md +++ b/ci/README.md @@ -1,11 +1,11 @@ # CI -In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework: +In addition to [Github Actions](https://github.com/ggml-org/llama.cpp/actions) `llama.cpp` uses a custom CI framework: https://github.com/ggml-org/ci It monitors the `master` branch for new commits and runs the -[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us +[ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled to cover various hardware architectures, including GPU and Apple Silicon instances. 
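The same `ci/run.sh` entry point described above can also be exercised locally before opening a PR. A minimal sketch of such a run, assuming the script takes a results directory and a mount/cache directory as positional arguments and that `GG_BUILD_CUDA` is the opt-in toggle for a CUDA build (both assumptions taken from the current ci/README.md, not from this patch):

```bash
# Local dry run of the CI script; both paths are placeholders.
mkdir -p tmp/results tmp/mnt

# CPU-only run
bash ./ci/run.sh ./tmp/results ./tmp/mnt

# Optional: CUDA-enabled run, if the toggle is available on this checkout
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```
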
diff --git a/common/arg.cpp b/common/arg.cpp index a4d65ad00..b016cce08 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1569,7 +1569,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "- isolate: only spawn threads on CPUs on the node that execution started on\n" "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" - "see https://github.com/ggerganov/llama.cpp/issues/1437", + "see https://github.com/ggml-org/llama.cpp/issues/1437", [](common_params & params, const std::string & value) { /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 018a2a588..8b7c75d85 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -558,7 +558,7 @@ class Model: # NOTE: this function is generated by convert_hf_to_gguf_update.py # do not modify it manually! - # ref: https://github.com/ggerganov/llama.cpp/pull/6920 + # ref: https://github.com/ggml-org/llama.cpp/pull/6920 # Marker: Start get_vocab_base_pre def get_vocab_base_pre(self, tokenizer) -> str: # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that @@ -708,7 +708,7 @@ class Model: logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {chkhsh}") logger.warning("**************************************************************************************") @@ -2835,7 +2835,7 @@ class InternLM2Model(Model): if chat_eos_token_id is not None: # For the chat model, we replace the eos with '<|im_end|>'. # TODO: this is a hack, should be fixed - # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 + # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048 special_vocab.special_token_ids["eos"] = chat_eos_token_id logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" " in chat mode so that the conversation can end normally.") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index cea34413f..fa4989a80 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -8,7 +8,7 @@ # provide the necessary information to llama.cpp via the GGUF header in order to implement # the same pre-tokenizer. 
# -# ref: https://github.com/ggerganov/llama.cpp/pull/6920 +# ref: https://github.com/ggml-org/llama.cpp/pull/6920 # # Instructions: # @@ -246,7 +246,7 @@ src_func = f""" logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {{chkhsh}}") logger.warning("**************************************************************************************") diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 6dea14a23..bdc991533 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -395,7 +395,7 @@ if __name__ == '__main__': logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor") if ".embed_tokens.weight" in name or ".lm_head.weight" in name: logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning") - logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948") + logger.error("Please refer to https://github.com/ggml-org/llama.cpp/pull/9948") sys.exit(1) if base_name in tensor_map: @@ -419,7 +419,7 @@ if __name__ == '__main__': # some archs may have the same tensor for lm_head and output (tie word embeddings) # in this case, adapters targeting lm_head will fail when using llama-export-lora # therefore, we ignore them for now - # see: https://github.com/ggerganov/llama.cpp/issues/9065 + # see: https://github.com/ggml-org/llama.cpp/issues/9065 if name == "lm_head.weight" and len(dest) == 0: raise ValueError("lm_head is present in adapter, but is ignored in base model") for dest_name, dest_data in dest: diff --git a/docs/android.md b/docs/android.md index 47530c6c1..d2a835653 100644 --- a/docs/android.md +++ b/docs/android.md @@ -12,7 +12,7 @@ $ apt update && apt upgrade -y $ apt install git cmake ``` -Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake. +Then, follow the [build instructions](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md), specifically for CMake. Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance: diff --git a/docs/backend/OPENCL.md b/docs/backend/OPENCL.md index a604058cb..2a946dc8d 100644 --- a/docs/backend/OPENCL.md +++ b/docs/backend/OPENCL.md @@ -122,7 +122,7 @@ cp libOpenCL.so ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x ```sh cd ~/dev/llm -git clone https://github.com/ggerganov/llama.cpp && \ +git clone https://github.com/ggml-org/llama.cpp && \ cd llama.cpp && \ mkdir build-android && cd build-android @@ -182,7 +182,7 @@ cmake --build . --target install mkdir -p ~/dev/llm cd ~/dev/llm -git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp +git clone https://github.com/ggml-org/llama.cpp && cd llama.cpp mkdir build && cd build cmake .. 
-G Ninja ` diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 89ddbd669..0cb39e792 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -36,8 +36,8 @@ The following release is verified with good quality: |Commit ID|Tag|Release|Verified Platform| Update date| |-|-|-|-|-| -|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
-|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
+|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1|| ## News @@ -58,7 +58,7 @@ The following release is verified with good quality: - 2024.3 - Release binary files of Windows. - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd). - - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437). + - New base line is ready: [tag b2437](https://github.com/ggml-org/llama.cpp/tree/b2437). - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing. - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE. - Support detecting all GPUs with level-zero and same top **Max compute units**. diff --git a/docs/build.md b/docs/build.md index 8b812bc89..69480aa08 100644 --- a/docs/build.md +++ b/docs/build.md @@ -3,7 +3,7 @@ **To get the Code:** ```bash -git clone https://github.com/ggerganov/llama.cpp +git clone https://github.com/ggml-org/llama.cpp cd llama.cpp ``` diff --git a/docs/cuda-fedora.md b/docs/cuda-fedora.md index 9c88b7694..75cd2b499 100644 --- a/docs/cuda-fedora.md +++ b/docs/cuda-fedora.md @@ -248,7 +248,7 @@ You have successfully set up CUDA on Fedora within a toolbox environment using t - **Building `llama.cpp`:** - - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support. + - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support. - Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration. 
- **Using the Toolbox Environment:** diff --git a/docs/development/HOWTO-add-model.md b/docs/development/HOWTO-add-model.md index 8fcd70811..78c6f7607 100644 --- a/docs/development/HOWTO-add-model.md +++ b/docs/development/HOWTO-add-model.md @@ -104,16 +104,16 @@ Note: to debug the inference graph: you can use [llama-eval-callback](/examples/ ## GGUF specification -https://github.com/ggerganov/ggml/blob/master/docs/gguf.md +https://github.com/ggml-org/ggml/blob/master/docs/gguf.md ## Resources -- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268 -- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009 -- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283 -- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406 -- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423 -- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204 -- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491 -- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515 -- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948 +- YaRN RoPE scaling https://github.com/ggml-org/llama.cpp/pull/2268 +- support Baichuan serial models https://github.com/ggml-org/llama.cpp/pull/3009 +- support attention bias https://github.com/ggml-org/llama.cpp/pull/4283 +- Mixtral support https://github.com/ggml-org/llama.cpp/pull/4406 +- BERT embeddings https://github.com/ggml-org/llama.cpp/pull/5423 +- Grok-1 support https://github.com/ggml-org/llama.cpp/pull/6204 +- Command R Plus support https://github.com/ggml-org/llama.cpp/pull/6491 +- support arch DBRX https://github.com/ggml-org/llama.cpp/pull/6515 +- How to convert HuggingFace model to GGUF format https://github.com/ggml-org/llama.cpp/discussions/2948 diff --git a/docs/docker.md b/docs/docker.md index 58b5d381d..343146dbd 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -7,21 +7,21 @@ ## Images We have three Docker images available for this project: -1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) -2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) -3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`) +1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) +2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) +3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`) Additionally, there the following images, similar to the above: -- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. 
(platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now). @@ -32,25 +32,25 @@ The easiest way to download the models, convert them to ggml and optimize them i Replace `/path/to/models` below with the actual path where you downloaded the models. ```bash -docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B +docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-one "/models/" 7B ``` On completion, you are ready to play! 
```bash -docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 ``` or with a light image: ```bash -docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 ``` or with a server image: ```bash -docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 +docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 ``` ## Docker With CUDA diff --git a/docs/install.md b/docs/install.md index 10a568506..0e23a2c9e 100644 --- a/docs/install.md +++ b/docs/install.md @@ -7,7 +7,7 @@ On Mac and Linux, the homebrew package manager can be used via ```sh brew install llama.cpp ``` -The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668 +The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668 ## Nix diff --git a/examples/cvector-generator/README.md b/examples/cvector-generator/README.md index be4dd5250..6d5fd74ad 100644 --- a/examples/cvector-generator/README.md +++ b/examples/cvector-generator/README.md @@ -3,9 +3,9 @@ This example demonstrates how to generate a control vector using gguf models. Related PRs: -- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970) -- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880) -- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514) +- [Add support for control vectors](https://github.com/ggml-org/llama.cpp/pull/5970) +- (Issue) [Generate control vector using llama.cpp](https://github.com/ggml-org/llama.cpp/issues/6880) +- [Add cvector-generator example](https://github.com/ggml-org/llama.cpp/pull/7514) ## Examples diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 9c056986b..bdf248cd3 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -1,7 +1,7 @@ # llama.cpp/examples/imatrix Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models. -More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861 +More information is available here: https://github.com/ggml-org/llama.cpp/pull/4861 ## Usage diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 395e2aa47..4edc0bfac 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -100,7 +100,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const float * data = is_host ? 
(const float *) src1->data : m_src1_data.data(); // this has been adapted to the new format of storing merged experts in a single 3d tensor - // ref: https://github.com/ggerganov/llama.cpp/pull/6387 + // ref: https://github.com/ggml-org/llama.cpp/pull/6387 if (t->op == GGML_OP_MUL_MAT_ID) { // ids -> [n_experts_used, n_tokens] // src1 -> [cols, n_expert_used, n_tokens] diff --git a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt index 2de496574..6119fe09b 100644 --- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt @@ -14,7 +14,7 @@ project("llama-android") #include(FetchContent) #FetchContent_Declare( # llama -# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp +# GIT_REPOSITORY https://github.com/ggml-org/llama.cpp # GIT_TAG master #) diff --git a/examples/llama.swiftui/README.md b/examples/llama.swiftui/README.md index 96cf743d4..f717886d6 100644 --- a/examples/llama.swiftui/README.md +++ b/examples/llama.swiftui/README.md @@ -3,9 +3,9 @@ Local inference of llama.cpp on an iPhone. This is a sample app that can be used as a starting point for more advanced projects. -For usage instructions and performance stats, check the following discussion: https://github.com/ggerganov/llama.cpp/discussions/4508 +For usage instructions and performance stats, check the following discussion: https://github.com/ggml-org/llama.cpp/discussions/4508 -![image](https://github.com/ggerganov/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299) +![image](https://github.com/ggml-org/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299) Video demonstration: diff --git a/examples/llama.vim b/examples/llama.vim index 57eb2a977..af3fd3935 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -39,7 +39,7 @@ " " :call llama#init() " -" more info: https://github.com/ggerganov/llama.cpp/pull/9787 +" more info: https://github.com/ggml-org/llama.cpp/pull/9787 " " colors (adjust to your liking) diff --git a/examples/llava/README-minicpmo2.6.md b/examples/llava/README-minicpmo2.6.md index 8713a43d6..8f591506d 100644 --- a/examples/llava/README-minicpmo2.6.md +++ b/examples/llava/README-minicpmo2.6.md @@ -26,7 +26,7 @@ python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model ``` Build llama.cpp using `CMake`: -https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md +https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md ```bash cmake -B build diff --git a/examples/llava/README-minicpmv2.5.md b/examples/llava/README-minicpmv2.5.md index 1c8498ff9..b0e72a0fa 100644 --- a/examples/llava/README-minicpmv2.5.md +++ b/examples/llava/README-minicpmv2.5.md @@ -6,7 +6,7 @@ Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V- Clone llama.cpp: ```bash -git clone https://github.com/ggerganov/llama.cpp +git clone https://github.com/ggml-org/llama.cpp cd llama.cpp ``` diff --git a/examples/lookahead/README.md b/examples/lookahead/README.md index a69a471b4..aab3cd0ca 100644 --- a/examples/lookahead/README.md +++ b/examples/lookahead/README.md @@ -4,4 +4,4 @@ Demonstration of lookahead decoding technique: https://lmsys.org/blog/2023-11-21-lookahead-decoding/ -More info: https://github.com/ggerganov/llama.cpp/pull/4207 +More info: https://github.com/ggml-org/llama.cpp/pull/4207 diff --git a/examples/lookup/README.md b/examples/lookup/README.md index 71c345c03..07d73849b 100644 --- a/examples/lookup/README.md +++ 
b/examples/lookup/README.md @@ -8,5 +8,5 @@ The key parameters for lookup decoding are `ngram_min`, `ngram_max` and `n_draft More info: -https://github.com/ggerganov/llama.cpp/pull/4484 -https://github.com/ggerganov/llama.cpp/issues/4226 +https://github.com/ggml-org/llama.cpp/pull/4484 +https://github.com/ggml-org/llama.cpp/issues/4226 diff --git a/examples/main/README.md b/examples/main/README.md index ea71591bd..f7c249729 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -1,6 +1,6 @@ # llama.cpp/examples/main -This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. +This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggml-org/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. ## Table of Contents @@ -121,7 +121,7 @@ When --in-prefix or --in-suffix options are enabled the chat template ( --chat-t ### Chat templates - `--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name. Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled. + `--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name. Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled. Example usage: `--chat-template gemma` diff --git a/examples/passkey/README.md b/examples/passkey/README.md index 2b8e910f9..2f19597c4 100644 --- a/examples/passkey/README.md +++ b/examples/passkey/README.md @@ -5,8 +5,8 @@ models ability to recall information from long contexts. 
See the following PRs for more info: -- https://github.com/ggerganov/llama.cpp/pull/3856 -- https://github.com/ggerganov/llama.cpp/pull/4810 +- https://github.com/ggml-org/llama.cpp/pull/3856 +- https://github.com/ggml-org/llama.cpp/pull/4810 ### Usage diff --git a/examples/pydantic_models_to_grammar_examples.py b/examples/pydantic_models_to_grammar_examples.py index eb000d5cc..f94b82ca4 100755 --- a/examples/pydantic_models_to_grammar_examples.py +++ b/examples/pydantic_models_to_grammar_examples.py @@ -23,7 +23,7 @@ def create_completion(host, prompt, gbnf_grammar): """Calls the /completion API on llama-server. See - https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints + https://github.com/ggml-org/llama.cpp/tree/HEAD/examples/server#api-endpoints """ print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}") headers = {"Content-Type": "application/json"} diff --git a/examples/quantize/README.md b/examples/quantize/README.md index f9cce7b21..992d00e21 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -69,22 +69,22 @@ Several quantization methods are supported. They differ in the resulting model d | 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 | | 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 | -- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684) +- [k-quants](https://github.com/ggml-org/llama.cpp/pull/1684) - recent k-quants improvements and new i-quants - - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707) - - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807) - - [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773) - - [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856) - - [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861) - - [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872) - - [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897) - - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) - - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) - - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) - - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996) - - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) - - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) - - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) + - [#2707](https://github.com/ggml-org/llama.cpp/pull/2707) + - [#2807](https://github.com/ggml-org/llama.cpp/pull/2807) + - [#4773 - 2-bit i-quants (inference)](https://github.com/ggml-org/llama.cpp/pull/4773) + - [#4856 - 2-bit i-quants (inference)](https://github.com/ggml-org/llama.cpp/pull/4856) + - [#4861 - importance matrix](https://github.com/ggml-org/llama.cpp/pull/4861) + - [#4872 - MoE models](https://github.com/ggml-org/llama.cpp/pull/4872) + - [#4897 - 2-bit quantization](https://github.com/ggml-org/llama.cpp/pull/4897) + - [#4930 - imatrix for all k-quants](https://github.com/ggml-org/llama.cpp/pull/4930) + - [#4951 - imatrix on the GPU](https://github.com/ggml-org/llama.cpp/pull/4957) + - [#4969 - imatrix for legacy 
quants](https://github.com/ggml-org/llama.cpp/pull/4969) + - [#4996 - k-quants tuning](https://github.com/ggml-org/llama.cpp/pull/4996) + - [#5060 - Q3_K_XS](https://github.com/ggml-org/llama.cpp/pull/5060) + - [#5196 - 3-bit i-quants](https://github.com/ggml-org/llama.cpp/pull/5196) + - [quantization tuning](https://github.com/ggml-org/llama.cpp/pull/5320), [another one](https://github.com/ggml-org/llama.cpp/pull/5334), and [another one](https://github.com/ggml-org/llama.cpp/pull/5361) **Llama 2 7B** diff --git a/examples/retrieval/README.md b/examples/retrieval/README.md index bc5f22e2f..6938a1e96 100644 --- a/examples/retrieval/README.md +++ b/examples/retrieval/README.md @@ -3,7 +3,7 @@ Demonstration of simple retrieval technique based on cosine similarity More info: -https://github.com/ggerganov/llama.cpp/pull/6193 +https://github.com/ggml-org/llama.cpp/pull/6193 ### How to use diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 1b7cc8c13..aee90388e 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -5,7 +5,7 @@ option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) if (MINGW) - # fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006 + # fix: https://github.com/ggml-org/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006 add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) endif() diff --git a/examples/server/README.md b/examples/server/README.md index 751d4db9e..a2ae614d7 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -7,14 +7,14 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp. **Features:** * LLM inference of F16 and quantized models on GPU and CPU * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes - * Reranking endoint (WIP: https://github.com/ggerganov/llama.cpp/pull/9510) + * Reranking endpoint (WIP: https://github.com/ggml-org/llama.cpp/pull/9510) * Parallel decoding with multi-user support * Continuous batching * Multimodal (wip) * Monitoring endpoints * Schema-constrained JSON response format -The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216). +The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggml-org/llama.cpp/issues/4216). ## Usage @@ -65,7 +65,7 @@ The project is under active development, and we are [looking for feedback and co | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) | | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) | -| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) | +| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) | | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | | `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) | @@ -178,7 +178,7 @@ Example usage of docker compose with environment variables: ```yml services: llamacpp-server: - image: ghcr.io/ggerganov/llama.cpp:server + image: ghcr.io/ggml-org/llama.cpp:server ports: - 8080:8080 volumes: @@ -273,10 +273,10 @@ You can consume the endpoints with Postman or NodeJS with axios library. You can ### Docker ```bash -docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 +docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 # or, with CUDA: -docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99 +docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggml-org/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99 ``` ## Testing with CURL @@ -1066,7 +1066,7 @@ print(completion.choices[0].text) ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API -Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. +Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. *Options:* @@ -1120,7 +1120,7 @@ curl http://localhost:8080/v1/chat/completions \ *Tool call support* -[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggerganov/llama.cpp/pull/9639): +[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggml-org/llama.cpp/pull/9639): - Requires `--jinja` flag - Native tool call formats supported: @@ -1599,7 +1599,7 @@ Apart from error types supported by OAI, we also have custom types that are spec ### Legacy completion web UI -A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). If you want to use the old completion, start the server with `--path ./examples/server/public_legacy` +A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggml-org/llama.cpp/pull/10175).
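Since the hunks above document the OpenAI-compatible `/v1/chat/completions` route, a minimal client sketch may help; it assumes a llama-server instance is already listening on `localhost:8080` (as in the Docker commands above) and uses the `requests` package. The `model` value is only a placeholder here, since the server serves whichever model it was started with.

```python
# Minimal sketch of calling the OpenAI-compatible chat completions route.
# Assumes llama-server is already running on localhost:8080.
import requests

response = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "placeholder",  # llama-server serves the model it was launched with
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Write a haiku about quantization."},
        ],
        "temperature": 0.7,
    },
    timeout=600,
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```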
If you want to use the old completion, start the server with `--path ./examples/server/public_legacy` For example: diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 71151183b..9ffec0a64 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -42,7 +42,7 @@ enum stop_type { STOP_TYPE_LIMIT, }; -// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283 +// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283 enum slot_state { SLOT_STATE_IDLE, SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 86de0e6d7..b5aebebba 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -367,10 +367,10 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vec } } } else { - throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)"); + throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggml-org/llama.cpp/issues/8367)"); } } else { - throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)"); + throw std::runtime_error("Missing 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)"); } chat.push_back({role, content, /* tool_calls= */ {}}); diff --git a/examples/simple-cmake-pkg/README.md b/examples/simple-cmake-pkg/README.md index 8b30049e2..d7430cc9c 100644 --- a/examples/simple-cmake-pkg/README.md +++ b/examples/simple-cmake-pkg/README.md @@ -1,6 +1,6 @@ # llama.cpp/example/simple-cmake-pkg -This program builds [simple](../simple) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree. +This program builds [simple](../simple) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggml-org/llama.cpp) in projects which live outside of the source tree. ## Building @@ -13,7 +13,7 @@ When hardware acceleration libraries are used (e.g. CUDA, Metal, Vulkan, etc.), ### Build llama.cpp and install to llama.cpp/inst ```sh -git clone https://github.com/ggerganov/llama.cpp +git clone https://github.com/ggml-org/llama.cpp cd llama.cpp cmake -S . -B build cmake --build build diff --git a/examples/speculative/README.md b/examples/speculative/README.md index a6608c5fe..36ab37086 100644 --- a/examples/speculative/README.md +++ b/examples/speculative/README.md @@ -4,6 +4,6 @@ Demonstration of speculative decoding and tree-based speculative decoding techni More info: -- https://github.com/ggerganov/llama.cpp/pull/2926 -- https://github.com/ggerganov/llama.cpp/pull/3624 -- https://github.com/ggerganov/llama.cpp/pull/5625 +- https://github.com/ggml-org/llama.cpp/pull/2926 +- https://github.com/ggml-org/llama.cpp/pull/3624 +- https://github.com/ggml-org/llama.cpp/pull/5625 diff --git a/flake.nix b/flake.nix index 26a258816..0b5edf911 100644 --- a/flake.nix +++ b/flake.nix @@ -36,7 +36,7 @@ # ``` # nixConfig = { # extra-substituters = [ - # # Populated by the CI in ggerganov/llama.cpp + # # Populated by the CI in ggml-org/llama.cpp # "https://llama-cpp.cachix.org" # # # A development cache for nixpkgs imported with `config.cudaSupport = true`. 
@@ -56,11 +56,11 @@ # }; # ``` - # For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl: + # For inspection, use `nix flake show github:ggml-org/llama.cpp` or the nix repl: # # ```bash # ❯ nix repl - # nix-repl> :lf github:ggerganov/llama.cpp + # nix-repl> :lf github:ggml-org/llama.cpp # Added 13 variables. # nix-repl> outputs.apps.x86_64-linux.quantize # { program = "/nix/store/00000000000000000000000000000000-llama.cpp/bin/llama-quantize"; type = "app"; } @@ -176,7 +176,7 @@ # # We could test all outputs e.g. as `checks = confg.packages`. # - # TODO: Build more once https://github.com/ggerganov/llama.cpp/issues/6346 has been addressed + # TODO: Build more once https://github.com/ggml-org/llama.cpp/issues/6346 has been addressed checks = { inherit (config.packages) default vulkan; }; diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 3aa71badb..d23c6b262 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -8,7 +8,7 @@ extern "C" { #endif // the compute plan that needs to be prepared for ggml_graph_compute() - // since https://github.com/ggerganov/ggml/issues/287 + // since https://github.com/ggml-org/ggml/issues/287 struct ggml_cplan { size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index 669c1f84a..a61069442 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -45,7 +45,7 @@ GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend); GGML_DEPRECATED( GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size), - "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713"); + "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713"); GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 0cbf8318b..dbef5df21 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1816,7 +1816,7 @@ inline static float ggml_silu_f32(float x) { #if __FINITE_MATH_ONLY__ #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix" -#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461" +#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461" #endif #if defined(__ARM_NEON) && defined(__aarch64__) @@ -7574,7 +7574,7 @@ UseGgmlGemm2:; int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread. - // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915 + // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915 // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that. 
if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) { // distribute the thread work across the inner or outer loop based on which one is larger diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 944d90af3..0add6b51a 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -1983,7 +1983,7 @@ static void ggml_metal_encode_node( const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); // TODO: add ggml_metal_kargs struct - // TODO: optimize (see https://github.com/ggerganov/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6) + // TODO: optimize (see https://github.com/ggml-org/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6) [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; if (id_src1) { diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 44f04c909..da415184b 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1058,7 +1058,7 @@ kernel void kernel_soft_max( } // This barrier fixes a failing test - // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335 + // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335 threadgroup_barrier(mem_flags::mem_none); float sum = simd_sum(lsum); @@ -1163,7 +1163,7 @@ kernel void kernel_soft_max_4( const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3]; // This barrier fixes a failing test - // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335 + // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335 threadgroup_barrier(mem_flags::mem_none); float sum = simd_sum(lsum); diff --git a/gguf-py/README.md b/gguf-py/README.md index 2e513633d..dd4ab7bde 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -1,9 +1,9 @@ ## gguf -This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302) +This is a Python package for writing binary files in the [GGUF](https://github.com/ggml-org/ggml/pull/302) (GGML Universal File) format. -See [convert_hf_to_gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py) +See [convert_hf_to_gguf.py](https://github.com/ggml-org/llama.cpp/blob/master/convert_hf_to_gguf.py) as an example for its usage. ## Installation @@ -13,17 +13,17 @@ pip install gguf ## API Examples/Simple Tools -[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model. +[examples/writer.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model. -[examples/reader.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/reader.py) — Extracts and displays key-value pairs and tensor details from a GGUF file in a readable format. +[examples/reader.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/examples/reader.py) — Extracts and displays key-value pairs and tensor details from a GGUF file in a readable format. -[gguf/scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console. 
+[gguf/scripts/gguf_dump.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console. -[gguf/scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key. +[gguf/scripts/gguf_set_metadata.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key. -[gguf/scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files. +[gguf/scripts/gguf_convert_endian.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files. -[gguf/scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values. +[gguf/scripts/gguf_new_metadata.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values. ## Development Maintainers who participate in development of this package are advised to install it in editable mode: diff --git a/gguf-py/gguf/scripts/gguf_dump.py b/gguf-py/gguf/scripts/gguf_dump.py index f95b4fd48..20f23d729 100755 --- a/gguf-py/gguf/scripts/gguf_dump.py +++ b/gguf-py/gguf/scripts/gguf_dump.py @@ -181,7 +181,7 @@ def element_count_rounded_notation(count: int) -> str: def translate_tensor_name(name): words = name.split(".") - # Source: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#standardized-tensor-names + # Source: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#standardized-tensor-names abbreviation_dictionary = { 'token_embd': 'Token embedding', 'pos_embd': 'Position embedding', diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py index 40d59b75e..ae92d786a 100644 --- a/gguf-py/gguf/utility.py +++ b/gguf-py/gguf/utility.py @@ -47,7 +47,7 @@ def size_label(total_params: int, shared_params: int, expert_params: int, expert def naming_convention(model_name: str | None, base_name: str | None, finetune_string: str | None, version_string: str | None, size_label: str | None, output_type: str | None, model_type: Literal['vocab', 'LoRA'] | None = None) -> str: - # Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention + # Reference: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#gguf-naming-convention if base_name is not None: name = base_name.strip().replace(' ', '-').replace('/', '-') diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index f2645f921..2ef7d14ab 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -127,7 +127,7 @@ class SpecialVocab: self.merges = merges elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str): # New format since transformers 4.45 to support spaces in merges - # ref: https://github.com/ggerganov/llama.cpp/issues/9692 + # ref: https://github.com/ggml-org/llama.cpp/issues/9692 # TODO: internally store as the new format instead of converting to old if any(' ' in s for pair in merges for s in pair): logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}') diff --git a/gguf-py/pyproject.toml 
b/gguf-py/pyproject.toml index 78c6baa64..b4a47333d 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -9,7 +9,7 @@ packages = [ ] readme = "README.md" homepage = "https://ggml.ai" -repository = "https://github.com/ggerganov/llama.cpp" +repository = "https://github.com/ggml-org/llama.cpp" keywords = ["ggml", "gguf", "llama.cpp"] classifiers = [ "Programming Language :: Python :: 3", diff --git a/grammars/README.md b/grammars/README.md index 976954091..935213f5c 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -98,7 +98,7 @@ This guide provides a brief overview. Check out the GBNF files in this directory ## Troubleshooting -Grammars currently have performance gotchas (see https://github.com/ggerganov/llama.cpp/issues/4218). +Grammars currently have performance gotchas (see https://github.com/ggml-org/llama.cpp/issues/4218). ### Efficient optional repetitions @@ -126,7 +126,7 @@ You can use GBNF grammars: - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py) - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI) -Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555). +Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggml-org/llama.cpp/pull/5978, https://github.com/ggml-org/llama.cpp/pull/6659 & https://github.com/ggml-org/llama.cpp/pull/6555). ```bash llama-cli \ @@ -185,10 +185,10 @@ Here is also a list of known limitations (contributions welcome): - `additionalProperties` defaults to `false` (produces faster grammars + reduces hallucinations). - `"additionalProperties": true` may produce keys that contain unescaped newlines. - Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp). 
-- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703) +- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggml-org/llama.cpp/issues/7703) - [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works) - `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number` -- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073) +- Nested `$ref`s are broken (https://github.com/ggml-org/llama.cpp/issues/8073) - [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$` - Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs) - `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email` diff --git a/include/llama.h b/include/llama.h index 1f5f3a09b..b0726cbe6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -213,7 +213,7 @@ extern "C" { LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported }; - // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979) + // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979) typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -307,7 +307,7 @@ extern "C" { }; // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations - // https://github.com/ggerganov/llama.cpp/pull/7544 + // https://github.com/ggml-org/llama.cpp/pull/7544 struct llama_context_params { uint32_t n_ctx; // text context, 0 = from model uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode @@ -320,7 +320,7 @@ extern "C" { enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id enum llama_attention_type attention_type; // attention type to use for embeddings - // ref: https://github.com/ggerganov/llama.cpp/pull/2054 + // ref: https://github.com/ggml-org/llama.cpp/pull/2054 float rope_freq_base; // RoPE base frequency, 0 = from model float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model @@ -385,7 +385,7 @@ extern "C" { struct llama_adapter_lora; // Helpers for getting default parameters - // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172) + // TODO: update API to start accepting pointers to params structs (https://github.com/ggml-org/llama.cpp/discussions/9172) LLAMA_API struct llama_model_params llama_model_default_params(void); LLAMA_API struct llama_context_params llama_context_default_params(void); LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void); @@ -1040,7 +1040,7 @@ extern "C" { /// Apply chat template. Inspired by hf apply_chat_template() on python. /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" - /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. 
See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template + /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead. /// @param chat Pointer to a list of multiple llama_chat_message /// @param n_msg Number of llama_chat_message in this chat @@ -1149,7 +1149,7 @@ extern "C" { /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), - "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)"); + "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); @@ -1157,7 +1157,7 @@ extern "C" { /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep); - /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 + /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841 LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep); /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. @@ -1203,7 +1203,7 @@ extern "C" { const char * grammar_str, const char * grammar_root); - /// @details Lazy grammar sampler, introduced in https://github.com/ggerganov/llama.cpp/pull/9639 + /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639 /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future. /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy( diff --git a/pyproject.toml b/pyproject.toml index 84e71de6d..ed62264ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "Scripts that ship with llama.cpp" authors = ["GGML "] readme = "README.md" homepage = "https://ggml.ai" -repository = "https://github.com/ggerganov/llama.cpp" +repository = "https://github.com/ggml-org/llama.cpp" keywords = ["ggml", "gguf", "llama.cpp"] packages = [{ include = "*.py", from = "." 
}] classifiers = [ diff --git a/scripts/check-requirements.sh b/scripts/check-requirements.sh index d3bbded13..4c3b05f68 100755 --- a/scripts/check-requirements.sh +++ b/scripts/check-requirements.sh @@ -170,7 +170,7 @@ check_convert_script examples/convert_legacy_llama.py for py in convert_*.py; do # skip convert_hf_to_gguf_update.py # TODO: the check is failing for some reason: - # https://github.com/ggerganov/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920 + # https://github.com/ggml-org/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920 [[ $py == convert_hf_to_gguf_update.py ]] && continue check_convert_script "$py" diff --git a/src/unicode.cpp b/src/unicode.cpp index a32ae6d08..e63bb4ab0 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -708,7 +708,7 @@ std::vector unicode_regex_split(const std::string & text, const std const auto cpts = unicode_cpts_from_utf8(text); // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte - // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935 + // ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935 std::string text_collapsed; if (need_collapse) { // collapse all unicode categories