mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-04-13 01:56:09 +00:00
repo : update links to new url (#11886)
* repo : update links to new url ggml-ci * cont : more urls ggml-ci
This commit is contained in:
parent
f355229692
commit
68ff663a04
@ -17,10 +17,10 @@ Version: %( date "+%%Y%%m%%d" )
|
||||
Release: 1%{?dist}
|
||||
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
|
||||
License: MIT
|
||||
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
|
||||
Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
|
||||
BuildRequires: coreutils make gcc-c++ git cuda-toolkit
|
||||
Requires: cuda-toolkit
|
||||
URL: https://github.com/ggerganov/llama.cpp
|
||||
URL: https://github.com/ggml-org/llama.cpp
|
||||
|
||||
%define debug_package %{nil}
|
||||
%define source_date_epoch_from_changelog 0
|
||||
|
@ -18,10 +18,10 @@ Version: %( date "+%%Y%%m%%d" )
|
||||
Release: 1%{?dist}
|
||||
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
|
||||
License: MIT
|
||||
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
|
||||
Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
|
||||
BuildRequires: coreutils make gcc-c++ git libstdc++-devel
|
||||
Requires: libstdc++
|
||||
URL: https://github.com/ggerganov/llama.cpp
|
||||
URL: https://github.com/ggml-org/llama.cpp
|
||||
|
||||
%define debug_package %{nil}
|
||||
%define source_date_epoch_from_changelog 0
|
||||
|
@ -133,12 +133,12 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
||||
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
|
||||
'';
|
||||
|
||||
# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
|
||||
# With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
|
||||
# `default.metallib` may be compiled with Metal compiler from XCode
|
||||
# and we need to escape sandbox on MacOS to access Metal compiler.
|
||||
# `xcrun` is used find the path of the Metal compiler, which is varible
|
||||
# and not on $PATH
|
||||
# see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
|
||||
# see https://github.com/ggml-org/llama.cpp/pull/6118 for discussion
|
||||
__noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
|
||||
|
||||
nativeBuildInputs =
|
||||
@ -220,7 +220,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
||||
broken = (useMetalKit && !effectiveStdenv.isDarwin);
|
||||
|
||||
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
||||
homepage = "https://github.com/ggerganov/llama.cpp/";
|
||||
homepage = "https://github.com/ggml-org/llama.cpp/";
|
||||
license = lib.licenses.mit;
|
||||
|
||||
# Accommodates `nix run` and `lib.getExe`
|
||||
|
@ -11,7 +11,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
# This is mostly tied to rocBLAS supported archs.
|
||||
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
|
||||
# gfx906 is deprecated
|
||||
|
6
.github/ISSUE_TEMPLATE/020-enhancement.yml
vendored
6
.github/ISSUE_TEMPLATE/020-enhancement.yml
vendored
@ -6,7 +6,7 @@ body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
|
||||
[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)
|
||||
|
||||
- type: checkboxes
|
||||
id: prerequisites
|
||||
@ -16,11 +16,11 @@ body:
|
||||
options:
|
||||
- label: I am running the latest code. Mention the version if possible as well.
|
||||
required: true
|
||||
- label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
|
||||
- label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md).
|
||||
required: true
|
||||
- label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
|
||||
required: true
|
||||
- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
|
||||
- label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share.
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
|
2
.github/ISSUE_TEMPLATE/030-research.yml
vendored
2
.github/ISSUE_TEMPLATE/030-research.yml
vendored
@ -6,7 +6,7 @@ body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
|
||||
Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
|
||||
|
||||
- type: checkboxes
|
||||
id: research-stage
|
||||
|
4
.github/ISSUE_TEMPLATE/040-refactor.yml
vendored
4
.github/ISSUE_TEMPLATE/040-refactor.yml
vendored
@ -6,8 +6,8 @@ body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
|
||||
Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
|
||||
Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
|
||||
Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
|
||||
|
||||
- type: textarea
|
||||
id: background-description
|
||||
|
6
.github/ISSUE_TEMPLATE/config.yml
vendored
6
.github/ISSUE_TEMPLATE/config.yml
vendored
@ -1,11 +1,11 @@
|
||||
blank_issues_enabled: true
|
||||
contact_links:
|
||||
- name: Got an idea?
|
||||
url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
|
||||
url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas
|
||||
about: Pop it there. It may then become an enhancement ticket.
|
||||
- name: Got a question?
|
||||
url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
|
||||
url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a
|
||||
about: Ask a question there!
|
||||
- name: Want to contribute?
|
||||
url: https://github.com/ggerganov/llama.cpp/wiki/contribute
|
||||
url: https://github.com/ggml-org/llama.cpp/wiki/contribute
|
||||
about: Head to the contribution guide page of the wiki for areas you can help with
|
||||
|
2
.github/pull_request_template.md
vendored
2
.github/pull_request_template.md
vendored
@ -1 +1 @@
|
||||
*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
|
||||
*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
|
||||
|
12
.github/workflows/bench.yml.disabled
vendored
12
.github/workflows/bench.yml.disabled
vendored
@ -1,5 +1,5 @@
|
||||
# TODO: there have been some issues with the workflow, so disabling for now
|
||||
# https://github.com/ggerganov/llama.cpp/issues/7893
|
||||
# https://github.com/ggml-org/llama.cpp/issues/7893
|
||||
#
|
||||
# Benchmark
|
||||
name: Benchmark
|
||||
@ -57,17 +57,7 @@ jobs:
|
||||
|
||||
if: |
|
||||
inputs.gpu-series == 'Standard_NC4as_T4_v3'
|
||||
|| (
|
||||
github.event_name == 'schedule'
|
||||
&& github.ref_name == 'master'
|
||||
&& github.repository_owner == 'ggerganov'
|
||||
)
|
||||
|| github.event_name == 'pull_request_target'
|
||||
|| (
|
||||
github.event_name == 'push'
|
||||
&& github.event.ref == 'refs/heads/master'
|
||||
&& github.repository_owner == 'ggerganov'
|
||||
)
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
|
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@ -129,7 +129,7 @@ jobs:
|
||||
run: |
|
||||
sysctl -a
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
|
2
.github/workflows/labeler.yml
vendored
2
.github/workflows/labeler.yml
vendored
@ -11,7 +11,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: "ggerganov/llama.cpp"
|
||||
repository: "ggml-org/llama.cpp"
|
||||
- uses: actions/labeler@v5
|
||||
with:
|
||||
configuration-path: '.github/labeler.yml'
|
||||
|
@ -12,7 +12,7 @@
|
||||
|
||||
- Squash-merge PRs
|
||||
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
|
||||
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
|
||||
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
|
||||
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
|
||||
|
||||
# Coding guidelines
|
||||
@ -40,14 +40,14 @@
|
||||
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
|
||||
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
|
||||
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
|
||||
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
|
||||
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
|
||||
|
||||

|
||||
|
||||
# Naming guidelines
|
||||
|
||||
- Use `snake_case` for function, variable and type names
|
||||
- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
|
||||
- Naming usually optimizes for longest common prefix (see https://github.com/ggml-org/ggml/pull/302#discussion_r1243240963)
|
||||
|
||||
```cpp
|
||||
// not OK
|
||||
@ -122,4 +122,4 @@
|
||||
|
||||
The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
|
||||
|
||||
https://github.com/ggerganov/llama.cpp/projects
|
||||
https://github.com/ggml-org/llama.cpp/projects
|
||||
|
8
Makefile
8
Makefile
@ -1,5 +1,5 @@
|
||||
ifndef LLAMA_MAKEFILE
|
||||
$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||
$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
|
||||
endif
|
||||
|
||||
# Define the default target now so that it is always the first target
|
||||
@ -463,7 +463,7 @@ endif
|
||||
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
|
||||
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
|
||||
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
|
||||
# https://github.com/ggerganov/llama.cpp/issues/2922
|
||||
# https://github.com/ggml-org/llama.cpp/issues/2922
|
||||
MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||
MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||
|
||||
@ -1078,8 +1078,8 @@ endif
|
||||
ifdef REMOVE_WARNING
|
||||
$(info !!! REMOVAL WARNING !!!)
|
||||
$(info The following LLAMA_ options have been removed and are no longer supported)
|
||||
$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418))
|
||||
$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
|
||||
$(info - LLAMA_DISABLE_LOGS (https://github.com/ggml-org/llama.cpp/pull/9418))
|
||||
$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggml-org/llama.cpp/pull/9418))
|
||||
$(info )
|
||||
endif
|
||||
|
||||
|
52
README.md
52
README.md
@ -3,26 +3,26 @@
|
||||

|
||||
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
|
||||
[](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
|
||||
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
|
||||
[Roadmap](https://github.com/users/ggml-org/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
|
||||
|
||||
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
||||
|
||||
## Recent API changes
|
||||
|
||||
- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
|
||||
- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
|
||||
- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
|
||||
- [Changelog for `llama-server` REST API](https://github.com/ggml-org/llama.cpp/issues/9291)
|
||||
|
||||
## Hot topics
|
||||
|
||||
- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggerganov/llama.cpp/pull/11427
|
||||
- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
|
||||
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
|
||||
- Universal tool call support in `llama-server`: https://github.com/ggerganov/llama.cpp/pull/9639
|
||||
- Universal tool call support in `llama-server`: https://github.com/ggml-org/llama.cpp/pull/9639
|
||||
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
||||
- Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123
|
||||
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
|
||||
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
|
||||
----
|
||||
|
||||
@ -39,7 +39,7 @@ range of hardware - locally and in the cloud.
|
||||
- Vulkan and SYCL backend support
|
||||
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
|
||||
|
||||
The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
|
||||
The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggml-org/ggml) library.
|
||||
|
||||
<details>
|
||||
<summary>Models</summary>
|
||||
@ -59,23 +59,23 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
|
||||
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
|
||||
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
|
||||
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
|
||||
- [X] [BERT](https://github.com/ggml-org/llama.cpp/pull/5423)
|
||||
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
|
||||
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
|
||||
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
|
||||
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
|
||||
- [X] [Starcoder models](https://github.com/ggml-org/llama.cpp/pull/3187)
|
||||
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
|
||||
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
|
||||
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
|
||||
- [X] [MPT](https://github.com/ggml-org/llama.cpp/pull/3417)
|
||||
- [X] [Bloom](https://github.com/ggml-org/llama.cpp/pull/3553)
|
||||
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
|
||||
- [X] [StableLM models](https://huggingface.co/stabilityai)
|
||||
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
|
||||
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
|
||||
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
|
||||
- [x] [PLaMo-13B](https://github.com/ggml-org/llama.cpp/pull/3557)
|
||||
- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
|
||||
- [x] [PhiMoE](https://github.com/ggerganov/llama.cpp/pull/11003)
|
||||
- [x] [PhiMoE](https://github.com/ggml-org/llama.cpp/pull/11003)
|
||||
- [x] [GPT-2](https://huggingface.co/gpt2)
|
||||
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
|
||||
- [x] [Orion 14B](https://github.com/ggml-org/llama.cpp/pull/5118)
|
||||
- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
|
||||
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
|
||||
- [x] [Gemma](https://ai.google.dev/gemma)
|
||||
@ -146,7 +146,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
|
||||
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
|
||||
- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
|
||||
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
|
||||
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggml-org/llama.cpp/pull/6326)
|
||||
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
|
||||
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
|
||||
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
|
||||
@ -245,7 +245,7 @@ The project also includes many example programs and tools using the `llama` libr
|
||||
- Clone this repository and build locally, see [how to build](docs/build.md)
|
||||
- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
|
||||
- Use a Docker image, see [documentation for Docker](docs/docker.md)
|
||||
- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases)
|
||||
- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
|
||||
|
||||
## Obtaining and quantizing models
|
||||
|
||||
@ -258,14 +258,14 @@ You can either manually download the GGUF file or directly use any `llama.cpp`-c
|
||||
|
||||
After downloading a model, use the CLI tools to run it locally - see below.
|
||||
|
||||
`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
|
||||
`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
|
||||
|
||||
The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
|
||||
|
||||
- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
|
||||
- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123)
|
||||
- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268)
|
||||
- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669)
|
||||
- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123)
|
||||
- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
|
||||
- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)
|
||||
|
||||
To learn more about model quantization, [read this documentation](examples/quantize/README.md)
|
||||
|
||||
@ -488,9 +488,9 @@ To learn more about model quantization, [read this documentation](examples/quant
|
||||
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
|
||||
- Collaborators will be invited based on contributions
|
||||
- Any help with managing issues, PRs and projects is very appreciated!
|
||||
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
|
||||
- See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
|
||||
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
|
||||
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
|
||||
- Make sure to read this: [Inference at the edge](https://github.com/ggml-org/llama.cpp/discussions/205)
|
||||
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
|
||||
|
||||
## Other documentation
|
||||
@ -505,7 +505,7 @@ To learn more about model quantization, [read this documentation](examples/quant
|
||||
- [Running on Docker](docs/docker.md)
|
||||
- [Build on Android](docs/android.md)
|
||||
- [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
|
||||
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
|
||||
- [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)
|
||||
|
||||
#### Seminal papers and background on the models
|
||||
|
||||
|
@ -62,6 +62,6 @@ Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-
|
||||
<!-- normal version -->
|
||||
However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
|
||||
|
||||
Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
|
||||
Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
|
||||
|
||||
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
|
||||
|
@ -1,11 +1,11 @@
|
||||
# CI
|
||||
|
||||
In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
|
||||
In addition to [Github Actions](https://github.com/ggml-org/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
|
||||
|
||||
https://github.com/ggml-org/ci
|
||||
|
||||
It monitors the `master` branch for new commits and runs the
|
||||
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
|
||||
[ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
|
||||
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
|
||||
to cover various hardware architectures, including GPU and Apple Silicon instances.
|
||||
|
||||
|
@ -1569,7 +1569,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"- isolate: only spawn threads on CPUs on the node that execution started on\n"
|
||||
"- numactl: use the CPU map provided by numactl\n"
|
||||
"if run without this previously, it is recommended to drop the system page cache before using this\n"
|
||||
"see https://github.com/ggerganov/llama.cpp/issues/1437",
|
||||
"see https://github.com/ggml-org/llama.cpp/issues/1437",
|
||||
[](common_params & params, const std::string & value) {
|
||||
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
|
||||
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
||||
|
@ -558,7 +558,7 @@ class Model:
|
||||
|
||||
# NOTE: this function is generated by convert_hf_to_gguf_update.py
|
||||
# do not modify it manually!
|
||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
# ref: https://github.com/ggml-org/llama.cpp/pull/6920
|
||||
# Marker: Start get_vocab_base_pre
|
||||
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||
@ -708,7 +708,7 @@ class Model:
|
||||
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
||||
logger.warning("** - the pre-tokenization config has changed upstream")
|
||||
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
||||
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
||||
logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920")
|
||||
logger.warning("**")
|
||||
logger.warning(f"** chkhsh: {chkhsh}")
|
||||
logger.warning("**************************************************************************************")
|
||||
@ -2835,7 +2835,7 @@ class InternLM2Model(Model):
|
||||
if chat_eos_token_id is not None:
|
||||
# For the chat model, we replace the eos with '<|im_end|>'.
|
||||
# TODO: this is a hack, should be fixed
|
||||
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
|
||||
# https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
|
||||
special_vocab.special_token_ids["eos"] = chat_eos_token_id
|
||||
logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
|
||||
" in chat mode so that the conversation can end normally.")
|
||||
|
@ -8,7 +8,7 @@
|
||||
# provide the necessary information to llama.cpp via the GGUF header in order to implement
|
||||
# the same pre-tokenizer.
|
||||
#
|
||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
# ref: https://github.com/ggml-org/llama.cpp/pull/6920
|
||||
#
|
||||
# Instructions:
|
||||
#
|
||||
@ -246,7 +246,7 @@ src_func = f"""
|
||||
logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet")
|
||||
logger.warning("** - the pre-tokenization config has changed upstream")
|
||||
logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
|
||||
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
|
||||
logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920")
|
||||
logger.warning("**")
|
||||
logger.warning(f"** chkhsh: {{chkhsh}}")
|
||||
logger.warning("**************************************************************************************")
|
||||
|
@ -395,7 +395,7 @@ if __name__ == '__main__':
|
||||
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
|
||||
if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
|
||||
logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
|
||||
logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
|
||||
logger.error("Please refer to https://github.com/ggml-org/llama.cpp/pull/9948")
|
||||
sys.exit(1)
|
||||
|
||||
if base_name in tensor_map:
|
||||
@ -419,7 +419,7 @@ if __name__ == '__main__':
|
||||
# some archs may have the same tensor for lm_head and output (tie word embeddings)
|
||||
# in this case, adapters targeting lm_head will fail when using llama-export-lora
|
||||
# therefore, we ignore them for now
|
||||
# see: https://github.com/ggerganov/llama.cpp/issues/9065
|
||||
# see: https://github.com/ggml-org/llama.cpp/issues/9065
|
||||
if name == "lm_head.weight" and len(dest) == 0:
|
||||
raise ValueError("lm_head is present in adapter, but is ignored in base model")
|
||||
for dest_name, dest_data in dest:
|
||||
|
@ -12,7 +12,7 @@ $ apt update && apt upgrade -y
|
||||
$ apt install git cmake
|
||||
```
|
||||
|
||||
Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake.
|
||||
Then, follow the [build instructions](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md), specifically for CMake.
|
||||
|
||||
Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:
|
||||
|
||||
|
@ -122,7 +122,7 @@ cp libOpenCL.so ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x
|
||||
```sh
|
||||
cd ~/dev/llm
|
||||
|
||||
git clone https://github.com/ggerganov/llama.cpp && \
|
||||
git clone https://github.com/ggml-org/llama.cpp && \
|
||||
cd llama.cpp && \
|
||||
mkdir build-android && cd build-android
|
||||
|
||||
@ -182,7 +182,7 @@ cmake --build . --target install
|
||||
mkdir -p ~/dev/llm
|
||||
cd ~/dev/llm
|
||||
|
||||
git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp
|
||||
git clone https://github.com/ggml-org/llama.cpp && cd llama.cpp
|
||||
mkdir build && cd build
|
||||
|
||||
cmake .. -G Ninja `
|
||||
|
@ -36,8 +36,8 @@ The following release is verified with good quality:
|
||||
|
||||
|Commit ID|Tag|Release|Verified Platform| Update date|
|
||||
|-|-|-|-|-|
|
||||
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|
||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
|
||||
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|
||||
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
|
||||
|
||||
|
||||
## News
|
||||
@ -58,7 +58,7 @@ The following release is verified with good quality:
|
||||
- 2024.3
|
||||
- Release binary files of Windows.
|
||||
- A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
|
||||
- New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
|
||||
- New base line is ready: [tag b2437](https://github.com/ggml-org/llama.cpp/tree/b2437).
|
||||
- Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
|
||||
- Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
|
||||
- Support detecting all GPUs with level-zero and same top **Max compute units**.
|
||||
|
@ -3,7 +3,7 @@
|
||||
**To get the Code:**
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
git clone https://github.com/ggml-org/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
|
@ -248,7 +248,7 @@ You have successfully set up CUDA on Fedora within a toolbox environment using t
|
||||
|
||||
- **Building `llama.cpp`:**
|
||||
|
||||
- With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support.
|
||||
- With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support.
|
||||
- Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration.
|
||||
|
||||
- **Using the Toolbox Environment:**
|
||||
|
@ -104,16 +104,16 @@ Note: to debug the inference graph: you can use [llama-eval-callback](/examples/
|
||||
|
||||
## GGUF specification
|
||||
|
||||
https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
|
||||
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md
|
||||
|
||||
## Resources
|
||||
|
||||
- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
|
||||
- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
|
||||
- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
|
||||
- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
|
||||
- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
|
||||
- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
|
||||
- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
|
||||
- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
|
||||
- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
|
||||
- YaRN RoPE scaling https://github.com/ggml-org/llama.cpp/pull/2268
|
||||
- support Baichuan serial models https://github.com/ggml-org/llama.cpp/pull/3009
|
||||
- support attention bias https://github.com/ggml-org/llama.cpp/pull/4283
|
||||
- Mixtral support https://github.com/ggml-org/llama.cpp/pull/4406
|
||||
- BERT embeddings https://github.com/ggml-org/llama.cpp/pull/5423
|
||||
- Grok-1 support https://github.com/ggml-org/llama.cpp/pull/6204
|
||||
- Command R Plus support https://github.com/ggml-org/llama.cpp/pull/6491
|
||||
- support arch DBRX https://github.com/ggml-org/llama.cpp/pull/6515
|
||||
- How to convert HuggingFace model to GGUF format https://github.com/ggml-org/llama.cpp/discussions/2948
|
||||
|
@ -7,21 +7,21 @@
|
||||
## Images
|
||||
We have three Docker images available for this project:
|
||||
|
||||
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
|
||||
Additionally, there the following images, similar to the above:
|
||||
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||
|
||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).
|
||||
|
||||
@ -32,25 +32,25 @@ The easiest way to download the models, convert them to ggml and optimize them i
|
||||
Replace `/path/to/models` below with the actual path where you downloaded the models.
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
|
||||
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-one "/models/" 7B
|
||||
```
|
||||
|
||||
On completion, you are ready to play!
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
or with a light image:
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
or with a server image:
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
|
||||
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
|
||||
```
|
||||
|
||||
## Docker With CUDA
|
||||
|
@ -7,7 +7,7 @@ On Mac and Linux, the homebrew package manager can be used via
|
||||
```sh
|
||||
brew install llama.cpp
|
||||
```
|
||||
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
|
||||
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
|
||||
|
||||
## Nix
|
||||
|
||||
|
@ -3,9 +3,9 @@
|
||||
This example demonstrates how to generate a control vector using gguf models.
|
||||
|
||||
Related PRs:
|
||||
- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
|
||||
- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
|
||||
- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
|
||||
- [Add support for control vectors](https://github.com/ggml-org/llama.cpp/pull/5970)
|
||||
- (Issue) [Generate control vector using llama.cpp](https://github.com/ggml-org/llama.cpp/issues/6880)
|
||||
- [Add cvector-generator example](https://github.com/ggml-org/llama.cpp/pull/7514)
|
||||
|
||||
## Examples
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
# llama.cpp/examples/imatrix
|
||||
|
||||
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models.
|
||||
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
|
||||
More information is available here: https://github.com/ggml-org/llama.cpp/pull/4861
|
||||
|
||||
## Usage
|
||||
|
||||
|
@ -100,7 +100,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
|
||||
|
||||
// this has been adapted to the new format of storing merged experts in a single 3d tensor
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/6387
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/6387
|
||||
if (t->op == GGML_OP_MUL_MAT_ID) {
|
||||
// ids -> [n_experts_used, n_tokens]
|
||||
// src1 -> [cols, n_expert_used, n_tokens]
|
||||
|
@ -14,7 +14,7 @@ project("llama-android")
|
||||
#include(FetchContent)
|
||||
#FetchContent_Declare(
|
||||
# llama
|
||||
# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
|
||||
# GIT_REPOSITORY https://github.com/ggml-org/llama.cpp
|
||||
# GIT_TAG master
|
||||
#)
|
||||
|
||||
|
@ -3,9 +3,9 @@
|
||||
Local inference of llama.cpp on an iPhone. This is a sample app that can be used as a starting
|
||||
point for more advanced projects.
|
||||
|
||||
For usage instructions and performance stats, check the following discussion: https://github.com/ggerganov/llama.cpp/discussions/4508
|
||||
For usage instructions and performance stats, check the following discussion: https://github.com/ggml-org/llama.cpp/discussions/4508
|
||||
|
||||

|
||||

|
||||
|
||||
Video demonstration:
|
||||
|
||||
|
@ -39,7 +39,7 @@
|
||||
"
|
||||
" :call llama#init()
|
||||
"
|
||||
" more info: https://github.com/ggerganov/llama.cpp/pull/9787
|
||||
" more info: https://github.com/ggml-org/llama.cpp/pull/9787
|
||||
"
|
||||
|
||||
" colors (adjust to your liking)
|
||||
|
@ -26,7 +26,7 @@ python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
|
||||
```
|
||||
|
||||
Build llama.cpp using `CMake`:
|
||||
https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md
|
||||
https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md
|
||||
|
||||
```bash
|
||||
cmake -B build
|
||||
|
@ -6,7 +6,7 @@ Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
git clone https://github.com/ggml-org/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
|
@ -4,4 +4,4 @@ Demonstration of lookahead decoding technique:
|
||||
|
||||
https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
||||
|
||||
More info: https://github.com/ggerganov/llama.cpp/pull/4207
|
||||
More info: https://github.com/ggml-org/llama.cpp/pull/4207
|
||||
|
@ -8,5 +8,5 @@ The key parameters for lookup decoding are `ngram_min`, `ngram_max` and `n_draft
|
||||
|
||||
More info:
|
||||
|
||||
https://github.com/ggerganov/llama.cpp/pull/4484
|
||||
https://github.com/ggerganov/llama.cpp/issues/4226
|
||||
https://github.com/ggml-org/llama.cpp/pull/4484
|
||||
https://github.com/ggml-org/llama.cpp/issues/4226
|
||||
|
@ -1,6 +1,6 @@
|
||||
# llama.cpp/examples/main
|
||||
|
||||
This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
|
||||
This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggml-org/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
@ -121,7 +121,7 @@ When --in-prefix or --in-suffix options are enabled the chat template ( --chat-t
|
||||
|
||||
### Chat templates
|
||||
|
||||
`--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name. Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled.
|
||||
`--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name. Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled.
|
||||
|
||||
Example usage: `--chat-template gemma`
|
||||
|
||||
|
@ -5,8 +5,8 @@ models ability to recall information from long contexts.
|
||||
|
||||
See the following PRs for more info:
|
||||
|
||||
- https://github.com/ggerganov/llama.cpp/pull/3856
|
||||
- https://github.com/ggerganov/llama.cpp/pull/4810
|
||||
- https://github.com/ggml-org/llama.cpp/pull/3856
|
||||
- https://github.com/ggml-org/llama.cpp/pull/4810
|
||||
|
||||
### Usage
|
||||
|
||||
|
@ -23,7 +23,7 @@ def create_completion(host, prompt, gbnf_grammar):
|
||||
"""Calls the /completion API on llama-server.
|
||||
|
||||
See
|
||||
https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
|
||||
https://github.com/ggml-org/llama.cpp/tree/HEAD/examples/server#api-endpoints
|
||||
"""
|
||||
print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}")
|
||||
headers = {"Content-Type": "application/json"}
|
||||
|
@ -69,22 +69,22 @@ Several quantization methods are supported. They differ in the resulting model d
|
||||
| 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 |
|
||||
| 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 |
|
||||
|
||||
- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
|
||||
- [k-quants](https://github.com/ggml-org/llama.cpp/pull/1684)
|
||||
- recent k-quants improvements and new i-quants
|
||||
- [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
|
||||
- [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
|
||||
- [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773)
|
||||
- [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856)
|
||||
- [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861)
|
||||
- [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872)
|
||||
- [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897)
|
||||
- [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
|
||||
- [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
|
||||
- [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
|
||||
- [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
|
||||
- [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
|
||||
- [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
|
||||
- [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
|
||||
- [#2707](https://github.com/ggml-org/llama.cpp/pull/2707)
|
||||
- [#2807](https://github.com/ggml-org/llama.cpp/pull/2807)
|
||||
- [#4773 - 2-bit i-quants (inference)](https://github.com/ggml-org/llama.cpp/pull/4773)
|
||||
- [#4856 - 2-bit i-quants (inference)](https://github.com/ggml-org/llama.cpp/pull/4856)
|
||||
- [#4861 - importance matrix](https://github.com/ggml-org/llama.cpp/pull/4861)
|
||||
- [#4872 - MoE models](https://github.com/ggml-org/llama.cpp/pull/4872)
|
||||
- [#4897 - 2-bit quantization](https://github.com/ggml-org/llama.cpp/pull/4897)
|
||||
- [#4930 - imatrix for all k-quants](https://github.com/ggml-org/llama.cpp/pull/4930)
|
||||
- [#4951 - imatrix on the GPU](https://github.com/ggml-org/llama.cpp/pull/4957)
|
||||
- [#4969 - imatrix for legacy quants](https://github.com/ggml-org/llama.cpp/pull/4969)
|
||||
- [#4996 - k-quants tuning](https://github.com/ggml-org/llama.cpp/pull/4996)
|
||||
- [#5060 - Q3_K_XS](https://github.com/ggml-org/llama.cpp/pull/5060)
|
||||
- [#5196 - 3-bit i-quants](https://github.com/ggml-org/llama.cpp/pull/5196)
|
||||
- [quantization tuning](https://github.com/ggml-org/llama.cpp/pull/5320), [another one](https://github.com/ggml-org/llama.cpp/pull/5334), and [another one](https://github.com/ggml-org/llama.cpp/pull/5361)
|
||||
|
||||
**Llama 2 7B**
|
||||
|
||||
|
@ -3,7 +3,7 @@
|
||||
Demonstration of simple retrieval technique based on cosine similarity
|
||||
|
||||
More info:
|
||||
https://github.com/ggerganov/llama.cpp/pull/6193
|
||||
https://github.com/ggml-org/llama.cpp/pull/6193
|
||||
|
||||
### How to use
|
||||
|
||||
|
@ -5,7 +5,7 @@ option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
if (MINGW)
|
||||
# fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
|
||||
# fix: https://github.com/ggml-org/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
|
||||
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
|
||||
endif()
|
||||
|
||||
|
@ -7,14 +7,14 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
||||
**Features:**
|
||||
* LLM inference of F16 and quantized models on GPU and CPU
|
||||
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
|
||||
* Reranking endoint (WIP: https://github.com/ggerganov/llama.cpp/pull/9510)
|
||||
* Reranking endoint (WIP: https://github.com/ggml-org/llama.cpp/pull/9510)
|
||||
* Parallel decoding with multi-user support
|
||||
* Continuous batching
|
||||
* Multimodal (wip)
|
||||
* Monitoring endpoints
|
||||
* Schema-constrained JSON response format
|
||||
|
||||
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
|
||||
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggml-org/llama.cpp/issues/4216).
|
||||
|
||||
## Usage
|
||||
|
||||
@ -65,7 +65,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
||||
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
|
||||
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
|
||||
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
|
||||
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
|
||||
| `--list-devices` | print list of available devices and exit |
|
||||
| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
||||
@ -178,7 +178,7 @@ Example usage of docker compose with environment variables:
|
||||
```yml
|
||||
services:
|
||||
llamacpp-server:
|
||||
image: ghcr.io/ggerganov/llama.cpp:server
|
||||
image: ghcr.io/ggml-org/llama.cpp:server
|
||||
ports:
|
||||
- 8080:8080
|
||||
volumes:
|
||||
@ -273,10 +273,10 @@ You can consume the endpoints with Postman or NodeJS with axios library. You can
|
||||
### Docker
|
||||
|
||||
```bash
|
||||
docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
|
||||
docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
|
||||
|
||||
# or, with CUDA:
|
||||
docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
|
||||
docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggml-org/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
|
||||
```
|
||||
|
||||
## Testing with CURL
|
||||
@ -1066,7 +1066,7 @@ print(completion.choices[0].text)
|
||||
|
||||
### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
|
||||
|
||||
Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
|
||||
Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
|
||||
|
||||
*Options:*
|
||||
|
||||
@ -1120,7 +1120,7 @@ curl http://localhost:8080/v1/chat/completions \
|
||||
|
||||
*Tool call support*
|
||||
|
||||
[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggerganov/llama.cpp/pull/9639):
|
||||
[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggml-org/llama.cpp/pull/9639):
|
||||
|
||||
- Requires `--jinja` flag
|
||||
- Native tool call formats supported:
|
||||
@ -1599,7 +1599,7 @@ Apart from error types supported by OAI, we also have custom types that are spec
|
||||
|
||||
### Legacy completion web UI
|
||||
|
||||
A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). If you want to use the old completion, start the server with `--path ./examples/server/public_legacy`
|
||||
A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggml-org/llama.cpp/pull/10175). If you want to use the old completion, start the server with `--path ./examples/server/public_legacy`
|
||||
|
||||
For example:
|
||||
|
||||
|
@ -42,7 +42,7 @@ enum stop_type {
|
||||
STOP_TYPE_LIMIT,
|
||||
};
|
||||
|
||||
// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
|
||||
// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
|
||||
enum slot_state {
|
||||
SLOT_STATE_IDLE,
|
||||
SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
|
||||
|
@ -367,10 +367,10 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vec
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
|
||||
throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
|
||||
throw std::runtime_error("Missing 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
|
||||
}
|
||||
|
||||
chat.push_back({role, content, /* tool_calls= */ {}});
|
||||
|
@ -1,6 +1,6 @@
|
||||
# llama.cpp/example/simple-cmake-pkg
|
||||
|
||||
This program builds [simple](../simple) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
|
||||
This program builds [simple](../simple) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggml-org/llama.cpp) in projects which live outside of the source tree.
|
||||
|
||||
## Building
|
||||
|
||||
@ -13,7 +13,7 @@ When hardware acceleration libraries are used (e.g. CUDA, Metal, Vulkan, etc.),
|
||||
### Build llama.cpp and install to llama.cpp/inst
|
||||
|
||||
```sh
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
git clone https://github.com/ggml-org/llama.cpp
|
||||
cd llama.cpp
|
||||
cmake -S . -B build
|
||||
cmake --build build
|
||||
|
@ -4,6 +4,6 @@ Demonstration of speculative decoding and tree-based speculative decoding techni
|
||||
|
||||
More info:
|
||||
|
||||
- https://github.com/ggerganov/llama.cpp/pull/2926
|
||||
- https://github.com/ggerganov/llama.cpp/pull/3624
|
||||
- https://github.com/ggerganov/llama.cpp/pull/5625
|
||||
- https://github.com/ggml-org/llama.cpp/pull/2926
|
||||
- https://github.com/ggml-org/llama.cpp/pull/3624
|
||||
- https://github.com/ggml-org/llama.cpp/pull/5625
|
||||
|
@ -36,7 +36,7 @@
|
||||
# ```
|
||||
# nixConfig = {
|
||||
# extra-substituters = [
|
||||
# # Populated by the CI in ggerganov/llama.cpp
|
||||
# # Populated by the CI in ggml-org/llama.cpp
|
||||
# "https://llama-cpp.cachix.org"
|
||||
#
|
||||
# # A development cache for nixpkgs imported with `config.cudaSupport = true`.
|
||||
@ -56,11 +56,11 @@
|
||||
# };
|
||||
# ```
|
||||
|
||||
# For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
|
||||
# For inspection, use `nix flake show github:ggml-org/llama.cpp` or the nix repl:
|
||||
#
|
||||
# ```bash
|
||||
# ❯ nix repl
|
||||
# nix-repl> :lf github:ggerganov/llama.cpp
|
||||
# nix-repl> :lf github:ggml-org/llama.cpp
|
||||
# Added 13 variables.
|
||||
# nix-repl> outputs.apps.x86_64-linux.quantize
|
||||
# { program = "/nix/store/00000000000000000000000000000000-llama.cpp/bin/llama-quantize"; type = "app"; }
|
||||
@ -176,7 +176,7 @@
|
||||
#
|
||||
# We could test all outputs e.g. as `checks = confg.packages`.
|
||||
#
|
||||
# TODO: Build more once https://github.com/ggerganov/llama.cpp/issues/6346 has been addressed
|
||||
# TODO: Build more once https://github.com/ggml-org/llama.cpp/issues/6346 has been addressed
|
||||
checks = {
|
||||
inherit (config.packages) default vulkan;
|
||||
};
|
||||
|
@ -8,7 +8,7 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
// since https://github.com/ggml-org/ggml/issues/287
|
||||
struct ggml_cplan {
|
||||
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
||||
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||
|
@ -45,7 +45,7 @@ GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||
|
||||
GGML_DEPRECATED(
|
||||
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
|
||||
"obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
|
||||
"obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
|
||||
|
||||
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
|
||||
|
||||
|
@ -1816,7 +1816,7 @@ inline static float ggml_silu_f32(float x) {
|
||||
|
||||
#if __FINITE_MATH_ONLY__
|
||||
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
|
||||
#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
|
||||
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON) && defined(__aarch64__)
|
||||
@ -7574,7 +7574,7 @@ UseGgmlGemm2:;
|
||||
int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
|
||||
|
||||
// If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
|
||||
// Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
|
||||
// Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
|
||||
// In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
|
||||
if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
|
||||
// distribute the thread work across the inner or outer loop based on which one is larger
|
||||
|
@ -1983,7 +1983,7 @@ static void ggml_metal_encode_node(
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
// TODO: add ggml_metal_kargs struct
|
||||
// TODO: optimize (see https://github.com/ggerganov/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
|
||||
// TODO: optimize (see https://github.com/ggml-org/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
if (id_src1) {
|
||||
|
@ -1058,7 +1058,7 @@ kernel void kernel_soft_max(
|
||||
}
|
||||
|
||||
// This barrier fixes a failing test
|
||||
// ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
|
||||
// ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
|
||||
threadgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
float sum = simd_sum(lsum);
|
||||
@ -1163,7 +1163,7 @@ kernel void kernel_soft_max_4(
|
||||
const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
|
||||
|
||||
// This barrier fixes a failing test
|
||||
// ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
|
||||
// ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
|
||||
threadgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
float sum = simd_sum(lsum);
|
||||
|
@ -1,9 +1,9 @@
|
||||
## gguf
|
||||
|
||||
This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
|
||||
This is a Python package for writing binary files in the [GGUF](https://github.com/ggml-org/ggml/pull/302)
|
||||
(GGML Universal File) format.
|
||||
|
||||
See [convert_hf_to_gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py)
|
||||
See [convert_hf_to_gguf.py](https://github.com/ggml-org/llama.cpp/blob/master/convert_hf_to_gguf.py)
|
||||
as an example for its usage.
|
||||
|
||||
## Installation
|
||||
@ -13,17 +13,17 @@ pip install gguf
|
||||
|
||||
## API Examples/Simple Tools
|
||||
|
||||
[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.
|
||||
[examples/writer.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.
|
||||
|
||||
[examples/reader.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/reader.py) — Extracts and displays key-value pairs and tensor details from a GGUF file in a readable format.
|
||||
[examples/reader.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/examples/reader.py) — Extracts and displays key-value pairs and tensor details from a GGUF file in a readable format.
|
||||
|
||||
[gguf/scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
|
||||
[gguf/scripts/gguf_dump.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
|
||||
|
||||
[gguf/scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
|
||||
[gguf/scripts/gguf_set_metadata.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
|
||||
|
||||
[gguf/scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
|
||||
[gguf/scripts/gguf_convert_endian.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
|
||||
|
||||
[gguf/scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
|
||||
[gguf/scripts/gguf_new_metadata.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
|
||||
|
||||
## Development
|
||||
Maintainers who participate in development of this package are advised to install it in editable mode:
|
||||
|
@ -181,7 +181,7 @@ def element_count_rounded_notation(count: int) -> str:
|
||||
def translate_tensor_name(name):
|
||||
words = name.split(".")
|
||||
|
||||
# Source: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#standardized-tensor-names
|
||||
# Source: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#standardized-tensor-names
|
||||
abbreviation_dictionary = {
|
||||
'token_embd': 'Token embedding',
|
||||
'pos_embd': 'Position embedding',
|
||||
|
@ -47,7 +47,7 @@ def size_label(total_params: int, shared_params: int, expert_params: int, expert
|
||||
|
||||
|
||||
def naming_convention(model_name: str | None, base_name: str | None, finetune_string: str | None, version_string: str | None, size_label: str | None, output_type: str | None, model_type: Literal['vocab', 'LoRA'] | None = None) -> str:
|
||||
# Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
|
||||
# Reference: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#gguf-naming-convention
|
||||
|
||||
if base_name is not None:
|
||||
name = base_name.strip().replace(' ', '-').replace('/', '-')
|
||||
|
@ -127,7 +127,7 @@ class SpecialVocab:
|
||||
self.merges = merges
|
||||
elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str):
|
||||
# New format since transformers 4.45 to support spaces in merges
|
||||
# ref: https://github.com/ggerganov/llama.cpp/issues/9692
|
||||
# ref: https://github.com/ggml-org/llama.cpp/issues/9692
|
||||
# TODO: internally store as the new format instead of converting to old
|
||||
if any(' ' in s for pair in merges for s in pair):
|
||||
logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}')
|
||||
|
@ -9,7 +9,7 @@ packages = [
|
||||
]
|
||||
readme = "README.md"
|
||||
homepage = "https://ggml.ai"
|
||||
repository = "https://github.com/ggerganov/llama.cpp"
|
||||
repository = "https://github.com/ggml-org/llama.cpp"
|
||||
keywords = ["ggml", "gguf", "llama.cpp"]
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3",
|
||||
|
@ -98,7 +98,7 @@ This guide provides a brief overview. Check out the GBNF files in this directory
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
Grammars currently have performance gotchas (see https://github.com/ggerganov/llama.cpp/issues/4218).
|
||||
Grammars currently have performance gotchas (see https://github.com/ggml-org/llama.cpp/issues/4218).
|
||||
|
||||
### Efficient optional repetitions
|
||||
|
||||
@ -126,7 +126,7 @@ You can use GBNF grammars:
|
||||
- in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
|
||||
- in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
|
||||
|
||||
Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
|
||||
Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggml-org/llama.cpp/pull/5978, https://github.com/ggml-org/llama.cpp/pull/6659 & https://github.com/ggml-org/llama.cpp/pull/6555).
|
||||
|
||||
```bash
|
||||
llama-cli \
|
||||
@ -185,10 +185,10 @@ Here is also a list of known limitations (contributions welcome):
|
||||
- `additionalProperties` defaults to `false` (produces faster grammars + reduces hallucinations).
|
||||
- `"additionalProperties": true` may produce keys that contain unescaped newlines.
|
||||
- Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp).
|
||||
- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
|
||||
- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggml-org/llama.cpp/issues/7703)
|
||||
- [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works)
|
||||
- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number`
|
||||
- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073)
|
||||
- Nested `$ref`s are broken (https://github.com/ggml-org/llama.cpp/issues/8073)
|
||||
- [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$`
|
||||
- Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs)
|
||||
- `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email`
|
||||
|
@ -213,7 +213,7 @@ extern "C" {
|
||||
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
|
||||
};
|
||||
|
||||
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
||||
// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
||||
typedef struct llama_token_data {
|
||||
llama_token id; // token id
|
||||
float logit; // log-odds of the token
|
||||
@ -307,7 +307,7 @@ extern "C" {
|
||||
};
|
||||
|
||||
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
|
||||
// https://github.com/ggerganov/llama.cpp/pull/7544
|
||||
// https://github.com/ggml-org/llama.cpp/pull/7544
|
||||
struct llama_context_params {
|
||||
uint32_t n_ctx; // text context, 0 = from model
|
||||
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
|
||||
@ -320,7 +320,7 @@ extern "C" {
|
||||
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
||||
enum llama_attention_type attention_type; // attention type to use for embeddings
|
||||
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/2054
|
||||
float rope_freq_base; // RoPE base frequency, 0 = from model
|
||||
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
|
||||
float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
|
||||
@ -385,7 +385,7 @@ extern "C" {
|
||||
struct llama_adapter_lora;
|
||||
|
||||
// Helpers for getting default parameters
|
||||
// TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
|
||||
// TODO: update API to start accepting pointers to params structs (https://github.com/ggml-org/llama.cpp/discussions/9172)
|
||||
LLAMA_API struct llama_model_params llama_model_default_params(void);
|
||||
LLAMA_API struct llama_context_params llama_context_default_params(void);
|
||||
LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
|
||||
@ -1040,7 +1040,7 @@ extern "C" {
|
||||
|
||||
/// Apply chat template. Inspired by hf apply_chat_template() on python.
|
||||
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
|
||||
/// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
||||
/// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
||||
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
|
||||
/// @param chat Pointer to a list of multiple llama_chat_message
|
||||
/// @param n_msg Number of llama_chat_message in this chat
|
||||
@ -1149,7 +1149,7 @@ extern "C" {
|
||||
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
||||
/// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
|
||||
DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
|
||||
"will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
|
||||
"will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
|
||||
|
||||
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
|
||||
@ -1157,7 +1157,7 @@ extern "C" {
|
||||
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep);
|
||||
|
||||
/// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
|
||||
/// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
|
||||
|
||||
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
||||
@ -1203,7 +1203,7 @@ extern "C" {
|
||||
const char * grammar_str,
|
||||
const char * grammar_root);
|
||||
|
||||
/// @details Lazy grammar sampler, introduced in https://github.com/ggerganov/llama.cpp/pull/9639
|
||||
/// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
|
||||
/// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future.
|
||||
/// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
|
||||
|
@ -5,7 +5,7 @@ description = "Scripts that ship with llama.cpp"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
readme = "README.md"
|
||||
homepage = "https://ggml.ai"
|
||||
repository = "https://github.com/ggerganov/llama.cpp"
|
||||
repository = "https://github.com/ggml-org/llama.cpp"
|
||||
keywords = ["ggml", "gguf", "llama.cpp"]
|
||||
packages = [{ include = "*.py", from = "." }]
|
||||
classifiers = [
|
||||
|
@ -170,7 +170,7 @@ check_convert_script examples/convert_legacy_llama.py
|
||||
for py in convert_*.py; do
|
||||
# skip convert_hf_to_gguf_update.py
|
||||
# TODO: the check is failing for some reason:
|
||||
# https://github.com/ggerganov/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920
|
||||
# https://github.com/ggml-org/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920
|
||||
[[ $py == convert_hf_to_gguf_update.py ]] && continue
|
||||
|
||||
check_convert_script "$py"
|
||||
|
@ -708,7 +708,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
||||
const auto cpts = unicode_cpts_from_utf8(text);
|
||||
|
||||
// generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
|
||||
std::string text_collapsed;
|
||||
if (need_collapse) {
|
||||
// collapse all unicode categories
|
||||
|
Loading…
x
Reference in New Issue
Block a user