llama_model_loader: support multiple split/shard GGUFs #6187

Merged 28 commits into master from hp/split/load-model on Mar 22, 2024
Commits (28)
7c64fef
split: support in llama_model_loader
phymbert Mar 19, 2024
b8feff4
Avoid copying the entire vector
phymbert Mar 21, 2024
18ff6ca
split: move llama_tensor_offset to llama_model_loader
phymbert Mar 21, 2024
60a87ae
Merge branch 'master' into hp/split/load-model
phymbert Mar 21, 2024
1892ae7
llama_model_loader: PR feedbacks:
phymbert Mar 21, 2024
00381b0
avoid copying the entire vector
phymbert Mar 21, 2024
c34a5de
Simplify this by making these optional, switch some layer creation te…
phymbert Mar 21, 2024
1c931f3
Handle optional tensors
phymbert Mar 21, 2024
d8b567d
llama_model_loader: fail if backend cannot allocate buffer
phymbert Mar 21, 2024
02020b0
fix mmap buffer management
slaren Mar 21, 2024
078a1ac
llama_model_loader: map file to backend buffer if the allocation succ…
phymbert Mar 21, 2024
69bdee9
llama_model_loader: only map tensors included in the context
phymbert Mar 21, 2024
6df9757
llama_model_loader: minor, use same variable name for consistency, fi…
phymbert Mar 21, 2024
f9a2973
llama_model_loader: fail if any of backend buffer cannot be allocated
phymbert Mar 21, 2024
0fd652e
spacing
phymbert Mar 21, 2024
1a179bf
fix loop over pointer
phymbert Mar 21, 2024
7cbe1ea
llama_model_loader: if n_tensors declared not equals to loaded tensor…
phymbert Mar 22, 2024
9940df4
llama_model_loader: ensure mappings vector has the expected size
phymbert Mar 22, 2024
ec372c6
llama_model_loader: use at instead of operator[] if this should neve…
phymbert Mar 22, 2024
a9e88c6
llama_model_loader: immediately add the backend buffer to the model b…
phymbert Mar 22, 2024
b19af36
llama_model_loader: be sure the model mappings has enough capacity be…
phymbert Mar 22, 2024
4c04400
llama_model_loader: fix map -> unordered map
phymbert Mar 22, 2024
e474e45
llama_split_prefix: use a clearer version, not pass split path len bu…
phymbert Mar 22, 2024
8326607
llama : minor
ggerganov Mar 22, 2024
dbc35ac
llama : introduce some typedef helpers
ggerganov Mar 22, 2024
f616b38
docs: add model shard in hot topic
phymbert Mar 22, 2024
1f38759
llama_model_loader: put mapping in a unique_ptr from the moment it is…
phymbert Mar 22, 2024
764c7af
fix llama_split_prefix
ngxson Mar 22, 2024
1 change: 1 addition & 0 deletions README.md
@@ -21,6 +21,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187

----

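For context, a typical workflow with the gguf-split example looks like this (a sketch: the flag names are taken from this example's argument parser, and the shard count shown is made up):

    # split, keeping at most 128 tensors per shard (the default)
    gguf-split --split --split-max-tensors 128 ggml-model-f16.gguf ggml-model-f16
    # -> ggml-model-f16-00001-of-00003.gguf, ggml-model-f16-00002-of-00003.gguf, ...

    # merge back into a single file, starting from the first shard
    gguf-split --merge ggml-model-f16-00001-of-00003.gguf ggml-model-f16-merged.gguf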
149 changes: 66 additions & 83 deletions examples/gguf-split/gguf-split.cpp
@@ -1,31 +1,34 @@
#include "llama.h"
#include "ggml.h"
#include "common.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <ios>
#include <string>
#include <vector>

#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <climits>
#include <stdexcept>

#if defined(_WIN32)
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif

enum split_operation : uint8_t {
SPLIT_OP_SPLIT,
SPLIT_OP_MERGE,
};

static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split";
static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count";

static const int SPLIT_FILENAME_MAX = 256;

static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf";
static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

struct split_params {
split_operation operation = SPLIT_OP_SPLIT;
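
The split.* keys above replace the earlier general.split/general.split_count pair, widening the split counters from u8 to u16 and additionally recording the total tensor count. The shard naming convention itself is unchanged: 1-based split numbers, zero-padded to five digits. A quick illustration of SPLIT_FILENAME_FORMAT (a fragment that assumes the surrounding file's includes; prefix and counts are made up):

    char path[PATH_MAX] = {0};
    // shard 0 of 3 for prefix "ggml-model-f16" (split numbers are printed 1-based):
    snprintf(path, sizeof(path), SPLIT_FILENAME_FORMAT, "ggml-model-f16", 0 + 1, 3);
    // path == "ggml-model-f16-00001-of-00003.gguf"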
@@ -116,13 +119,13 @@ static bool split_params_parse(int argc, const char ** argv, split_params & para
try {
if (!split_params_parse_ex(argc, argv, params)) {
split_print_usage(argv[0]);
exit(1);
exit(EXIT_FAILURE);
}
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
split_print_usage(argv[0]);
exit(1);
exit(EXIT_FAILURE);
}
return result;
}
@@ -134,12 +137,6 @@ static void zeros(std::ofstream & file, size_t n) {
}
}

static std::string split_file_name(const std::string & path, int i_split, int n_split) {
char f_split[SPLIT_FILENAME_MAX] = {0};
snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split);
return std::string(f_split);
}

struct split_strategy {
const split_params params;
std::ifstream & f_input;
@@ -180,19 +177,21 @@ struct split_strategy {
if (i_split == 0) {
gguf_set_kv(ctx_out, ctx_gguf);
}
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);

// populate the original tensors, so we get an initial metadata
for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
gguf_add_tensor(ctx_out, meta);
}

auto split_name = split_file_name(params.output, i_split, n_split);
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);

fprintf(stderr, "%s: %s ...", __func__, split_name.c_str());
fout = std::ofstream(split_name, std::ios::binary);
fprintf(stderr, "%s: %s ...", __func__, split_path);
fout = std::ofstream(split_path, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors

auto meta_size = gguf_get_meta_size(ctx_out);
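
Two details in the block above are worth spelling out. Tensors are assigned to shards in contiguous chunks of n_split_tensors (for example, 291 tensors at the default of 128 per shard produce three files holding 128, 128 and 35 tensors), and every shard now carries split.no, split.count and split.tensors.count, so a reader can verify it has all the pieces. A minimal read-back sketch using the gguf API from ggml.h (the file name is made up):

    #include "ggml.h"
    #include <cstdio>
    #include <cstdlib>

    int main() {
        struct ggml_context * ctx_meta = NULL;
        struct gguf_init_params params = {
            /*.no_alloc = */ true, // metadata only, do not load tensor data
            /*.ctx      = */ &ctx_meta,
        };
        struct gguf_context * ctx_gguf = gguf_init_from_file("ggml-model-f16-00001-of-00003.gguf", params);
        if (!ctx_gguf) {
            return EXIT_FAILURE;
        }
        int key_no    = gguf_find_key(ctx_gguf, "split.no");    // LLM_KV_SPLIT_NO
        int key_count = gguf_find_key(ctx_gguf, "split.count"); // LLM_KV_SPLIT_COUNT
        if (key_no >= 0 && key_count >= 0) {
            printf("shard %d of %d\n", gguf_get_val_u16(ctx_gguf, key_no) + 1, gguf_get_val_u16(ctx_gguf, key_count));
        }
        gguf_free(ctx_gguf);
        if (ctx_meta) {
            ggml_free(ctx_meta);
        }
        return EXIT_SUCCESS;
    }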
@@ -250,19 +249,23 @@ static void gguf_split(const split_params & split_params) {
std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
exit(1);
exit(EXIT_FAILURE);
}

auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(1);
exit(EXIT_FAILURE);
}

split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);

char first_split_path[PATH_MAX] = {0};
llama_split_path(first_split_path, sizeof(first_split_path),
split_params.output.c_str(), strategy.i_split, strategy.n_split);
fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
__func__, split_params.input.c_str(),
split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(),
first_split_path,
split_params.n_split_tensors);

strategy.split_start();
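
The llama_split_path call above uses one of the two helpers this PR adds to the public llama.h API (they replace the local split_file_name shown removed earlier). A round-trip sketch, with made-up paths:

    #include "llama.h"
    #include <cstdio>
    #include <climits>

    int main() {
        // build "/models/ggml-model-q4_0-00002-of-00004.gguf" from a prefix (split_no is 0-based)
        char split_path[PATH_MAX] = {0};
        llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 1, 4);

        // recover the prefix from a shard path; returns 0 if the file name does not
        // end in "-00002-of-00004.gguf" for this split_no/split_count
        char split_prefix[PATH_MAX] = {0};
        if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 1, 4)) {
            printf("%s -> prefix %s\n", split_path, split_prefix);
        }
        return 0;
    }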
@@ -298,7 +301,9 @@ static void gguf_merge(const split_params & split_params) {
std::vector<ggml_context *> ctx_metas;
std::vector<gguf_context *> ctx_ggufs;

std::string split_prefix;
char split_path[PATH_MAX] = {0};
strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
char split_prefix[PATH_MAX] = {0};

// First pass to find KV and tensors metadata
for (int i_split = 0; i_split < n_split; i_split++) {
@@ -309,87 +314,64 @@
/*.ctx = */ &ctx_meta,
};

auto split_name = split_params.input;
if (i_split > 0) {
split_name = split_file_name(split_prefix, i_split, n_split);
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
}
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str());
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);

auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params);
auto * ctx_gguf = gguf_init_from_file(split_path, params);
if (!ctx_gguf) {
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(1);
exit(EXIT_FAILURE);
}
ctx_ggufs.push_back(ctx_gguf);
ctx_metas.push_back(ctx_meta);

if (i_split == 0) {
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
if (key_n_split < 0) {
fprintf(stderr,
"\n%s: input file does not contain %s metadata\n",
__func__,
LLM_KV_GENERAL_SPLIT_N_SPLIT);
LLM_KV_SPLIT_COUNT);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(1);
exit(EXIT_FAILURE);
}

n_split = gguf_get_val_u8(ctx_gguf, key_n_split);
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
if (n_split < 1) {
fprintf(stderr,
"\n%s: input file does not contain a valid split count %d\n",
__func__,
n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(1);
exit(EXIT_FAILURE);
}

// Do not trigger merge if we try to merge again the output
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);

// Set metadata from the first split
gguf_set_kv(ctx_out, ctx_gguf);
}

// Verify the file naming
{
int i_split_file = 0;
int n_split_file = 0;
const char * i_split_format = "-00000-of-00000.gguf";

if (split_name.size() < strlen(i_split_format)) {
fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str());
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
}
// Verify the file naming and extract split_prefix
if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) {
fprintf(stderr, "\n%s: unexpected input file name: %s"
" i_split=%d"
" n_split=%d\n", __func__,
split_path, i_split, n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(1);
exit(EXIT_FAILURE);
}

split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format));

const char * split_name_c_str = split_name.c_str();
int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file);
// Do not trigger merge if we try to merge again the output
gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);

if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) {
fprintf(stderr, "\n%s: unexpected input file name: %s"
" i_split=%d i_split_file=%d"
" n_split=%d n_split_file=%d\n", __func__,
split_params.input.c_str(),
i_split, i_split_file,
n_split, n_split_file);
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
}
gguf_free(ctx_out);
fout.close();
exit(1);
}
// Set metadata from the first split
gguf_set_kv(ctx_out, ctx_gguf);
}

auto n_tensors = gguf_get_n_tensors(ctx_gguf);
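
Note the guard above: split.count is rewritten to 0 in the first shard's metadata before gguf_set_kv copies it into ctx_out, so the merged output file is not itself tagged as a split, and a later merge run on it fails fast on the "valid split count" check instead of silently re-merging.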
@@ -411,18 +393,19 @@

// Write tensors data
for (int i_split = 0; i_split < n_split; i_split++) {
auto split_name = split_file_name(split_prefix, i_split, n_split);
std::ifstream f_input(split_name.c_str(), std::ios::binary);
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
std::ifstream f_input(split_path, std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str());
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path);
for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
gguf_free(ctx_ggufs[i]);
ggml_free(ctx_metas[i]);
}
gguf_free(ctx_out);
fout.close();
exit(1);
exit(EXIT_FAILURE);
}
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str());
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);

auto * ctx_gguf = ctx_ggufs[i_split];
auto * ctx_meta = ctx_metas[i_split];
@@ -481,8 +464,8 @@ int main(int argc, const char ** argv) {
break;
case SPLIT_OP_MERGE: gguf_merge(params);
break;
default:split_print_usage(argv[0]);
exit(1);
default: split_print_usage(argv[0]);
exit(EXIT_FAILURE);
}

return 0;