Introduction to ggml
Table of contents
Open Table of contents
Terminology and concepts
- ggml_context: a container that holds objects such as tensors, graphs, and optionally data.
- ggml_cgraph: represents a computational graph.
- ggml_backend: represents an interface for executing computation graphs.
- ggml_backend_buffer_type: represents a buffer type. It acts as a “memory allocator” connected to each ggml_backend.
- ggml_backend_buffer: represents a buffer allocated by a buffer_type. A buffer can hold the data of multiple tensors.
- ggml_gallocr: represents a graph memory allocator, used to efficiently allocate the tensors used in a computation graph.
- ggml_backend_sched: a scheduler that enables concurrent use of multiple backends. It can distribute computation across different hardware (e.g. GPU and CPU) when dealing with large models or multiple GPUs.
Simple Example
cd ggml
# Create the example directory first: touch does not create missing parent directories.
mkdir -p examples/demo
touch examples/demo/demo.c
touch examples/demo/CMakeLists.txt
#include "ggml.h"
#include "ggml-cpu.h"
#include <string.h>
#include <stdio.h>
int main(void) {
// initialize data of matrices to perform matrix multiplication
const int rows_A = 4, cols_A = 2;
float matrix_A[rows_A * cols_A] = {
2, 8,
5, 1,
4, 2,
8, 6
};
const int rows_B = 3, cols_B = 2;
float matrix_B[rows_B * cols_B] = {
10, 5,
9, 9,
5, 4
};
// 1. Allocate `ggml_context` to store tensor data
// Calculate the size needed to allocate
size_t ctx_size = 0;
ctx_size += rows_A * cols_A * ggml_type_size(GGML_TYPE_F32); // tensor a
ctx_size += rows_B * cols_B * ggml_type_size(GGML_TYPE_F32); // tensor b
ctx_size += rows_A * rows_B * ggml_type_size(GGML_TYPE_F32); // result
ctx_size += 3 * ggml_tensor_overhead(); // metadata for 3 tensors
ctx_size += ggml_graph_overhead(); // compute graph
ctx_size += 1024; // some overhead (exact calculation omitted for simplicity)
// Allocate `ggml_context` to store tensor data
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx = ggml_init(params);
// 2. Create tensors and set data
struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, cols_A, rows_A);
struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, cols_B, rows_B);
memcpy(tensor_a->data, matrix_A, ggml_nbytes(tensor_a));
memcpy(tensor_b->data, matrix_B, ggml_nbytes(tensor_b));
// 3. Create a `ggml_cgraph` for mul_mat operation
struct ggml_cgraph * gf = ggml_new_graph(ctx);
// result = a*b^T
// Pay attention: ggml_mul_mat(A, B) ==> B will be transposed internally
// the result is transposed
struct ggml_tensor * result = ggml_mul_mat(ctx, tensor_a, tensor_b);
// Mark the "result" tensor to be computed
ggml_build_forward_expand(gf, result);
// 4. Run the computation
int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
ggml_graph_compute_with_ctx(ctx, gf, n_threads);
// 5. Retrieve results (output tensors)
float * result_data = (float *) result->data;
printf("mul mat (%d x %d) (transposed result):\n[", (int) result->ne[0], (int) result->ne[1]);
for (int j = 0; j < result->ne[1] /* rows */; j++) {
if (j > 0) {
printf("\n");
}
for (int i = 0; i < result->ne[0] /* cols */; i++) {
printf(" %.2f", result_data[j * result->ne[0] + i]);
}
}
printf(" ]\n");
// 6. Free memory and exit
ggml_free(ctx);
return 0;
}
# Build the `demo` executable from demo.c and link it against the ggml library.
set(TEST_TARGET demo)
# The source file is named with its extension: under policy CMP0115 (CMake 3.20+)
# source file extensions must be explicit.
add_executable(${TEST_TARGET} demo.c)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)