Server-side implementation of graph_compute
rgerganov committed Mar 8, 2024
1 parent 355e1de commit 1c6551e
Showing 2 changed files with 108 additions and 13 deletions.
48 changes: 36 additions & 12 deletions examples/rpc/client.cpp
@@ -82,19 +82,11 @@ struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr)
     // allocate tensors
     ggml_gallocr_alloc_graph(allocr, gf);
 
-    int n_threads = 1; // number of threads to perform some operations with multi-threading
-
-    if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-    }
-
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(model.backend)) {
-        ggml_backend_metal_set_n_cb(model.backend, n_threads);
+    ggml_status status = ggml_backend_graph_compute(model.backend, gf);
+    if (status != GGML_STATUS_SUCCESS) {
+        fprintf(stderr, "%s: ggml_backend_graph_compute() failed\n", __func__);
+        exit(1);
     }
-#endif
-
-    ggml_backend_graph_compute(model.backend, gf);
 
     // in this case, the output tensor is the last one in the graph
     return gf->nodes[gf->n_nodes - 1];
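
As an aside, ggml_backend_graph_compute reports failures through the ggml_status it returns, and the individual status values can be told apart if more detailed error reporting is wanted. A minimal sketch of such handling, not part of this commit, using the status values defined by ggml:

    // Sketch only: distinguish the possible ggml_status values instead of
    // treating every non-success result the same way.
    ggml_status status = ggml_backend_graph_compute(model.backend, gf);
    switch (status) {
        case GGML_STATUS_SUCCESS:
            break;
        case GGML_STATUS_ABORTED:
            fprintf(stderr, "%s: graph compute was aborted\n", __func__);
            exit(1);
        case GGML_STATUS_ALLOC_FAILED:
            fprintf(stderr, "%s: graph allocation failed\n", __func__);
            exit(1);
        default: // GGML_STATUS_FAILED
            fprintf(stderr, "%s: graph compute failed\n", __func__);
            exit(1);
    }
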
@@ -154,5 +146,37 @@ int main(int argc, char * argv[])
// perform computation
struct ggml_tensor * result = compute(model, allocr);

// create a array to print result
std::vector<float> out_data(ggml_nelements(result));

// bring the data from the backend memory
ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result));

// expected result:
// [ 60.00 110.00 54.00 29.00
// 55.00 90.00 126.00 28.00
// 50.00 54.00 42.00 64.00 ]

printf("mul mat (%d x %d) (transposed result):\n[", (int) result->ne[0], (int) result->ne[1]);
for (int j = 0; j < result->ne[1] /* rows */; j++) {
if (j > 0) {
printf("\n");
}

for (int i = 0; i < result->ne[0] /* cols */; i++) {
printf(" %.2f", out_data[i * result->ne[1] + j]);
}
}
printf(" ]\n");

// release backend memory used for computation
ggml_gallocr_free(allocr);

// free memory
ggml_free(model.ctx);

// release backend memory and free backend
ggml_backend_buffer_free(model.buffer);
ggml_backend_free(model.backend);
return 0;
}
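
A side note on reading results (not part of the diff): ggml_backend_tensor_get can also fetch a single element instead of the whole buffer, using the tensor's byte strides nb[] to compute the offset. A small sketch, assuming the F32 result tensor from above:

    // Sketch only: read element (i0, i1) of the result straight from backend memory.
    // nb[0] and nb[1] are the byte strides of the first and second dimension.
    const int64_t i0 = 0, i1 = 1;
    float value = 0.0f;
    ggml_backend_tensor_get(result, &value, i0 * result->nb[0] + i1 * result->nb[1], sizeof(value));
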
73 changes: 72 additions & 1 deletion src/ggml-rpc.cpp
@@ -353,7 +353,8 @@ BackendImpl::BackendImpl() {
     };
     dummy_context = ggml_init(params);
     dummy_tensor = ggml_new_tensor_2d(dummy_context, GGML_TYPE_F32, 1, 1);
-    dummy_tensor->backend = GGML_BACKEND_TYPE_GPU;
+    // TODO: do not hardcode the backend type
+    dummy_tensor->backend = GGML_BACKEND_TYPE_CPU;
 }
 
 BackendImpl::~BackendImpl() {
@@ -406,6 +407,76 @@ grpc::Status BackendImpl::GetTensor(grpc::ServerContext* context, const ggml::Ge
return grpc::Status::OK;
}

static struct ggml_tensor * create_node(uint64_t id,
struct ggml_context * ctx,
const ggml::GraphComputeRequest* request,
std::unordered_map<uint64_t, struct ggml_tensor*> & tensor_map) {
if (id == 0) {
return nullptr;
}
if (tensor_map.find(id) != tensor_map.end()) {
return tensor_map[id];
}
for (int i = 0; i < request->tensors_size(); i++) {
if (request->tensors(i).id() == id) {
GGML_PRINT_DEBUG("create node: %lx\n", id);
const ggml::Tensor & tensor = request->tensors(i);
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor.type(),
tensor.ne(0), tensor.ne(1), tensor.ne(2), tensor.ne(3));
result->backend = (ggml_backend_type) tensor.backend();
result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor.bufptr());
result->op = (ggml_op) tensor.op();
for (int i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
result->op_params[i] = tensor.op_params(i);
}
result->flags = tensor.flags();
for (int i = 0; i < GGML_MAX_SRC; i++) {
result->src[i] = create_node(tensor.src(i), ctx, request, tensor_map);
}
result->view_src = create_node(tensor.view_src(), ctx, request, tensor_map);
result->view_offs = tensor.view_offs();
result->data = reinterpret_cast<void *>(tensor.data());
strncpy(result->name, tensor.name().c_str(), GGML_MAX_NAME);
tensor_map[id] = result;
return result;
}
}
fprintf(stderr, "tensor not found: %lu\n", id);
return nullptr;
}

grpc::Status BackendImpl::GraphCompute(grpc::ServerContext* context, const ggml::GraphComputeRequest* request, ggml::GraphComputeReply* reply) {
GGML_PRINT_DEBUG("GraphCompute\n");

int num_tensors = request->tensors_size();
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
struct ggml_context * ctx = ggml_init(params);
std::unordered_map<uint64_t, ggml_tensor*> tensor_map;

int num_nodes = request->nodes_size();
static size_t buf_size = ggml_tensor_overhead()*num_nodes + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);

struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ true,
};

struct ggml_context * ctx0 = ggml_init(params0);
struct ggml_cgraph * graph = ggml_new_graph_custom(ctx0, num_nodes, false);
graph->n_nodes = num_nodes;
for (int i = 0; i < num_nodes; i++) {
graph->nodes[i] = create_node(request->nodes(i), ctx, request, tensor_map);
}
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
ggml_status status = ggml_backend_graph_compute(backend, graph);
reply->set_status(status);
ggml_free(ctx);
ggml_free(ctx0);
return grpc::Status::OK;
}
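
For context, create_node reconstructs tensors on the server from the flat list in the request, resolving sources by their 64-bit id (0 meaning null) and memoizing them in tensor_map so that shared sources are materialized only once. The client-side serialization is not part of this diff; the sketch below shows what flattening one tensor into the request might look like, assuming protobuf-generated setters that mirror the accessors used above and assuming the id is simply the tensor's address:

// Hypothetical client-side serialization (not in this commit). The setter names and
// the pointer-as-id scheme are assumptions based on the accessors used by create_node.
static void serialize_tensor(const struct ggml_tensor * t, ggml::Tensor * out) {
    out->set_id(reinterpret_cast<uint64_t>(t));
    out->set_type(t->type);
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        out->add_ne(t->ne[i]);
    }
    out->set_backend(t->backend);
    out->set_bufptr(reinterpret_cast<uint64_t>(t->buffer));
    out->set_op(t->op);
    for (int i = 0; i < GGML_MAX_OP_PARAMS / (int) sizeof(int32_t); i++) {
        out->add_op_params(t->op_params[i]);
    }
    out->set_flags(t->flags);
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        out->add_src(t->src[i] ? reinterpret_cast<uint64_t>(t->src[i]) : 0);
    }
    out->set_view_src(t->view_src ? reinterpret_cast<uint64_t>(t->view_src) : 0);
    out->set_view_offs(t->view_offs);
    out->set_data(reinterpret_cast<uint64_t>(t->data));
    out->set_name(t->name);
}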

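On the other end of this RPC, the reply's status field carries the ggml_status that the server obtained from ggml_backend_graph_compute. A sketch of how a client might map it back, again an assumption rather than code from this commit (in particular the service name Backend, and therefore the generated ggml::Backend::Stub type, is guessed):

// Hypothetical client-side call (not in this commit).
static ggml_status remote_graph_compute(ggml::Backend::Stub * stub, const ggml::GraphComputeRequest & request) {
    ggml::GraphComputeReply reply;
    grpc::ClientContext client_ctx;
    grpc::Status rpc_status = stub->GraphCompute(&client_ctx, request, &reply);
    if (!rpc_status.ok()) {
        return GGML_STATUS_FAILED;
    }
    // the server stores the result of ggml_backend_graph_compute() in the reply
    return (ggml_status) reply.status();
}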