
llama_model_loader: support multiple split/shard GGUFs #6187

Merged · 28 commits · Mar 22, 2024

Changes from 1 commit (the full list of 28 commits follows):
7c64fef
split: support in llama_model_loader
phymbert Mar 19, 2024
b8feff4
Avoid copying the entire vector
phymbert Mar 21, 2024
18ff6ca
split: move llama_tensor_offset to llama_model_loader
phymbert Mar 21, 2024
60a87ae
Merge branch 'master' into hp/split/load-model
phymbert Mar 21, 2024
1892ae7
llama_model_loader: PR feedbacks:
phymbert Mar 21, 2024
00381b0
avoid copying the entire vector
phymbert Mar 21, 2024
c34a5de
Simplify this by making these optional, switch some layer creation te…
phymbert Mar 21, 2024
1c931f3
Handle optional tensors
phymbert Mar 21, 2024
d8b567d
llama_model_loader: fail if backend cannot allocate buffer
phymbert Mar 21, 2024
02020b0
fix mmap buffer management
slaren Mar 21, 2024
078a1ac
llama_model_loader: map file to backend buffer if the allocation succ…
phymbert Mar 21, 2024
69bdee9
llama_model_loader: only map tensors included in the context
phymbert Mar 21, 2024
6df9757
llama_model_loader: minor, use same variable name for consistency, fi…
phymbert Mar 21, 2024
f9a2973
llama_model_loader: fail if any of backend buffer cannot be allocated
phymbert Mar 21, 2024
0fd652e
spacing
phymbert Mar 21, 2024
1a179bf
fix loop over pointer
phymbert Mar 21, 2024
7cbe1ea
llama_model_loader: if n_tensors declared not equals to loaded tensor…
phymbert Mar 22, 2024
9940df4
llama_model_loader: ensure mappings vector has the expected size
phymbert Mar 22, 2024
ec372c6
llama_model_loader: use at instead of operator[] if this should neve…
phymbert Mar 22, 2024
a9e88c6
llama_model_loader: immediately add the backend buffer to the model b…
phymbert Mar 22, 2024
b19af36
llama_model_loader: be sure the model mappings has enough capacity be…
phymbert Mar 22, 2024
4c04400
llama_model_loader: fix map -> unordered map
phymbert Mar 22, 2024
e474e45
llama_split_prefix: use a clearer version, not pass split path len bu…
phymbert Mar 22, 2024
8326607
llama : minor
ggerganov Mar 22, 2024
dbc35ac
llama : introduce some typedef helpers
ggerganov Mar 22, 2024
f616b38
docs: add model shard in hot topic
phymbert Mar 22, 2024
1f38759
llama_model_loader: put mapping in a unique_ptr from the moment it is…
phymbert Mar 22, 2024
764c7af
fix llama_split_prefix
ngxson Mar 22, 2024
llama_model_loader: map file to backend buffer if the allocation succeeds only
phymbert committed Mar 21, 2024
commit 078a1aca0648204c2abaec097b04c1bac8cf3795
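In short, this commit keys the mmap-backed backend buffers by file index instead of keeping a vector padded with nullptr placeholders, and only inserts an entry when a buffer was actually created for that file. Below is a small, self-contained sketch of that bookkeeping pattern; every type and function in it is a stand-in for illustration, not the llama.cpp/ggml API.

```cpp
// Sketch only: a sparse map from split-file index to backend buffer.
// An entry exists only for files whose buffer allocation (or mmap) succeeded,
// so lookups use count()/at() rather than indexing a nullptr-padded vector.
#include <cstdint>
#include <cstdio>
#include <map>
#include <stdexcept>

struct dummy_buffer { size_t size; };        // stand-in for a ggml backend buffer
using buffer_ptr = dummy_buffer *;

// Pretend per-file allocation that fails for one of the split files.
static buffer_ptr try_allocate(uint32_t file_no) {
    static dummy_buffer buffers[3] = {{100}, {200}, {300}};
    return file_no == 1 ? nullptr : &buffers[file_no];
}

int main() {
    const uint32_t n_files = 3;

    std::map<uint32_t, buffer_ptr> bufs;     // file index -> buffer
    for (uint32_t file_no = 0; file_no < n_files; file_no++) {
        if (buffer_ptr buf = try_allocate(file_no)) {
            bufs.emplace(file_no, buf);      // only successful allocations are recorded
        }
    }
    if (bufs.empty()) {
        throw std::runtime_error("failed to allocate buffer");
    }

    // Later, a tensor knows which file it lives in (w.idx in the diff below).
    for (uint32_t idx = 0; idx < n_files; idx++) {
        buffer_ptr buf = nullptr;
        if (bufs.count(idx)) {               // sparse lookup replaces "idx < bufs.size()"
            buf = bufs.at(idx);
        }
        std::printf("file %u -> %s\n", idx, buf ? "buffer" : "no buffer");
    }
    return 0;
}
```

The diff below applies this pattern to bufs_mmap in load_all_data and to bufs in llm_load_tensors.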
llama.cpp — 43 changes: 19 additions & 24 deletions
@@ -3192,7 +3192,7 @@ struct llama_model_loader {
 
     void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
         GGML_ASSERT(!mappings.empty());
-        const auto & mapping = mappings[idx];
+        const auto & mapping = mappings.at(idx);
 
         *first = mapping->size;
         *last = 0;
@@ -3211,7 +3211,7 @@
     void load_data_for(struct ggml_tensor * cur) const {
         const auto & w = get_weights(ggml_get_name(cur));
 
-        if (use_mmap && w.idx < mappings.size()) {
+        if (use_mmap) {
             const auto & mapping = mappings.at(w.idx);
             if (cur->data == nullptr) {
                 cur->data = (uint8_t *)mapping->addr + w.offs;
@@ -3232,7 +3232,7 @@
     std::vector<std::pair<size_t, size_t>> mmaps_used;
 
     // Returns false if cancelled by progress_callback
-    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::vector<ggml_backend_buffer_t> bufs_mmap, std::vector<std::unique_ptr<llama_mlock>> * lmlocks) {
+    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::map<uint32_t, ggml_backend_buffer *> bufs_mmap, std::vector<std::unique_ptr<llama_mlock>> * lmlocks) {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
         std::vector<no_init<uint8_t>> read_buf;
@@ -3246,9 +3246,12 @@
             const auto & w = get_weights(ggml_get_name(cur));
             size_t n_size = ggml_nbytes(cur);
 
-            if (use_mmap && w.idx < mappings.size()) {
+            if (use_mmap) {
                 const auto & mapping = mappings.at(w.idx);
-                ggml_backend_buffer_t buf_mmap = bufs_mmap.size() > w.idx ? bufs_mmap.at(w.idx) : nullptr;
+                ggml_backend_buffer_t buf_mmap = nullptr;
+                if (bufs_mmap.count(w.idx)) {
+                    buf_mmap = bufs_mmap.at(w.idx);
+                }
                 GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
                     ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs);
@@ -3283,7 +3286,7 @@
         // check if this is the last call and do final cleanup
         if (size_done >= size_data) {
             // unmap offloaded tensors and metadata
-            if (use_mmap && !mappings.empty()) {
+            if (use_mmap) {
                 for (uint32_t file_no = 0; file_no < mappings.size(); file_no++) {
                     const auto & mmap_used = mmaps_used[file_no];
                     auto & mapping = mappings.at(file_no);
@@ -5129,12 +5132,12 @@ static bool llm_load_tensors(
     ml.init_mappings(true, &model.mlock_mmaps);
 
     // create the backend buffers
-    std::vector<std::pair<ggml_context *, std::vector<ggml_backend_buffer_t>>> ctx_bufs;
+    std::vector<std::pair<ggml_context *, std::map<uint32_t, ggml_backend_buffer_t>>> ctx_bufs;
 
     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
-        std::vector<ggml_backend_buffer_t> bufs;
+        std::map<uint32_t, ggml_backend_buffer_t> bufs;
 
         // only the mmap region containing the tensors in the model is mapped to the backend buffer
         // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
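An aside, not part of the diff: the comment above relies on ml.get_mapping_range(), which reports the smallest byte range of a file's mmap that covers the tensors of one context, so only that region is wrapped in a backend buffer. A simplified standalone sketch of that range computation, with illustrative names rather than the real llama.cpp types:

```cpp
// Sketch only: compute the byte range [first, last) of a mapped file that
// covers a set of tensors, mirroring the *first/*last pattern in get_mapping_range().
#include <cstddef>
#include <cstdio>
#include <vector>

struct tensor_span {
    size_t offs;   // offset of the tensor data within the file
    size_t nbytes; // size of the tensor data in bytes
};

static void get_range(const std::vector<tensor_span> & tensors, size_t file_size,
                      size_t * first, size_t * last) {
    *first = file_size; // start past the end and shrink to the lowest offset
    *last  = 0;         // start at zero and grow to the end of the highest tensor
    for (const auto & t : tensors) {
        if (t.offs < *first)           { *first = t.offs; }
        if (t.offs + t.nbytes > *last) { *last  = t.offs + t.nbytes; }
    }
}

int main() {
    const std::vector<tensor_span> tensors = {{4096, 1024}, {8192, 2048}};
    size_t first = 0, last = 0;
    get_range(tensors, /*file_size=*/1 << 20, &first, &last);
    std::printf("map bytes [%zu, %zu) of the file\n", first, last); // [4096, 10240)
    return 0;
}
```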
@@ -5145,21 +5148,18 @@
                 size_t first, last;
                 ml.get_mapping_range(&first, &last, &addr, file_no, ctx);
                 if (first >= last) {
-                    bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync
                     continue;
                 }
                 ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *)addr + first, last - first);
                 if (buf != nullptr) {
-                    bufs.push_back(buf);
+                    bufs.emplace(file_no, buf);
 #ifdef GGML_USE_CUBLAS
                     if (n_layer >= n_gpu_layers) {
                         ggml_backend_cuda_register_host_buffer(
                             ggml_backend_buffer_get_base(buf),
                             ggml_backend_buffer_get_size(buf));
                     }
 #endif
-                } else {
-                    throw std::runtime_error("failed to allocate cpu buffer");
                 }
             }
         }
@@ -5176,9 +5176,7 @@
                 }
                 ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
                 if (buf != nullptr) {
-                    bufs.push_back(buf);
-                } else {
-                    throw std::runtime_error("failed to allocate metal buffer");
+                    bufs.emplace(file_no, buf);
                 }
             }
         }
@@ -5192,22 +5190,19 @@
                     mlock_buf->init(ggml_backend_buffer_get_base(buf));
                     mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
                 }
-                bufs.push_back(buf);
-            } else {
-                throw std::runtime_error("failed to allocate backend buffer");
+                for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) {
+                    bufs.emplace(file_no, buf);
+                }
             }
         }
+        if (bufs.empty()) {
+            throw std::runtime_error("failed to allocate buffer");
+        }
         // indicate that this buffer contains weights
         // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
-        for (ggml_backend_buffer_t buf : bufs) {
-            if (buf == nullptr) {
-                continue;
-            }
-            ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-            model.bufs.push_back(buf);
+        for (auto & buf : bufs) {
+            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            model.bufs.push_back(buf.second);
         }
 
         ctx_bufs.emplace_back(ctx, bufs);
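For context on the PR title: the loader now accepts a model stored as several GGUF shards. The sketch below illustrates the shard naming convention as I understand it, prefix-00001-of-00003.gguf and so on; make_split_path is a hypothetical helper written for this example, not the llama_split_prefix function referenced in the commit list.

```cpp
// Sketch only: build shard paths under the assumed naming convention
// <prefix>-<split_no>-of-<split_count>.gguf with zero-padded, 1-based numbers.
#include <cstdio>
#include <string>
#include <vector>

static std::string make_split_path(const std::string & prefix, int split_no, int split_count) {
    char buf[512];
    std::snprintf(buf, sizeof(buf), "%s-%05d-of-%05d.gguf", prefix.c_str(), split_no, split_count);
    return buf;
}

int main() {
    const int split_count = 3;
    std::vector<std::string> paths;
    for (int i = 1; i <= split_count; i++) {
        paths.push_back(make_split_path("ggml-model-q4_0", i, split_count));
    }
    for (const auto & p : paths) {
        std::printf("%s\n", p.c_str()); // ggml-model-q4_0-00001-of-00003.gguf, ...
    }
    return 0;
}
```

A loader that supports splits then opens each shard in turn and records, for every tensor, which file index it came from — the w.idx used throughout the diff above.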