llama : remove MPI backend (#7395)
slaren committed May 19, 2024
1 parent 1ea2a00 commit d359f30
Showing 9 changed files with 2 additions and 425 deletions.
1 change: 0 additions & 1 deletion .devops/nix/package.nix
@@ -214,7 +214,6 @@ effectiveStdenv.mkDerivation (
(cmakeBool "LLAMA_CUDA" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "LLAMA_VULKAN" useVulkan)
(cmakeBool "LLAMA_STATIC" enableStatic)
]
34 changes: 0 additions & 34 deletions .github/workflows/build.yml
@@ -306,40 +306,6 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-mpi:
runs-on: ubuntu-latest

continue-on-error: true

strategy:
matrix:
mpi_library: [mpich, libopenmpi-dev]

steps:
- name: Clone
id: checkout
uses: actions/checkout@v4

- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential ${{ matrix.mpi_library }}
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake -DLLAMA_MPI=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose
ubuntu-latest-cmake-rpc:
runs-on: ubuntu-latest

33 changes: 1 addition & 32 deletions CMakeLists.txt
@@ -122,7 +122,6 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"llama: metal minimum macOS version")
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_RPC "llama: use RPC" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
@@ -466,35 +465,6 @@ if (LLAMA_CUDA)
endif()
endif()

if (LLAMA_MPI)
cmake_minimum_required(VERSION 3.10)
find_package(MPI)
if (MPI_C_FOUND)
message(STATUS "MPI found")

set(GGML_HEADERS_MPI ggml-mpi.h)
set(GGML_SOURCES_MPI ggml-mpi.c)

add_compile_definitions(GGML_USE_MPI)
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})

if (NOT MSVC)
add_compile_options(-Wno-cast-qual)
endif()

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})

# Even if you're only using the C header, C++ programs may bring in MPI
# C++ functions, so more linkage is needed
if (MPI_CXX_FOUND)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
endif()
else()
message(WARNING "MPI not found")
endif()
endif()

if (LLAMA_RPC)
add_compile_definitions(GGML_USE_RPC)

@@ -1218,7 +1188,6 @@ add_library(ggml OBJECT
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
@@ -1306,7 +1275,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake

set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")

set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER)
12 changes: 0 additions & 12 deletions Makefile
@@ -399,13 +399,6 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif # LLAMA_NO_ACCELERATE

ifdef LLAMA_MPI
MK_CPPFLAGS += -DGGML_USE_MPI
MK_CFLAGS += -Wno-cast-qual
MK_CXXFLAGS += -Wno-cast-qual
OBJS += ggml-mpi.o
endif # LLAMA_MPI

ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -629,11 +622,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
endif
endif # LLAMA_METAL

ifdef LLAMA_MPI
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI

ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
39 changes: 0 additions & 39 deletions README.md
@@ -382,45 +382,6 @@ To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or th
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument.

### MPI Build

MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.

First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).

Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):

- Using `make`:

```bash
make CC=mpicc CXX=mpicxx LLAMA_MPI=1
```

- Using `CMake`:

```bash
cmake -S . -B build -DLLAMA_MPI=ON
```

Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.

Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".

Here is an example hostfile:

```
192.168.0.1:2
malvolio.local:1
```

The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.

Finally, you're ready to run a computation using `mpirun`:

```bash
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
```

### BLAS Build

Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
216 changes: 0 additions & 216 deletions ggml-mpi.c

This file was deleted.

