smp: allocate hugepages eagerly when kernel support is available
Instead of deferring merging pages into hugepages to the transparent
hugepage scanner, advise the kernel to do so immediately using the new
MADV_POPULATE_WRITE and MADV_COLLAPSE advice flags.

Refactor the prefaulter to first attempt MADV_POPULATE_WRITE to fault
in a whole hugepage's worth of memory. This should fault the range in
as a hugepage, but for good measure use MADV_COLLAPSE too (which should
be a no-op if the work was already done by MADV_POPULATE_WRITE).
avikivity committed May 18, 2024
1 parent a7fb8ff commit 99d2465
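A minimal sketch of the pattern this commit relies on (illustrative only, not the patched Seastar code): advise MADV_POPULATE_WRITE over a hugepage-sized range, fall back to touching pages by hand if the kernel rejects the flag, and follow up with a best-effort MADV_COLLAPSE. The helper name populate_range is hypothetical; MADV_POPULATE_WRITE needs Linux 5.14+ and MADV_COLLAPSE needs Linux 6.1+.

// Illustrative sketch only -- the real logic lives in the prefaulter diff below.
#include <sys/mman.h>
#include <cerrno>

// Hypothetical helper: returns false if the kernel lacks MADV_POPULATE_WRITE,
// in which case the caller must fall back to touching each page itself.
static bool populate_range(char* start, char* end) {
#ifdef MADV_POPULATE_WRITE
    if (madvise(start, end - start, MADV_POPULATE_WRITE) == -1 && errno == EINVAL) {
        return false;   // advice unknown to this kernel (pre-5.14)
    }
    // Other errors are ignored; prefaulting is only an optimization.
#ifdef MADV_COLLAPSE
    // Best-effort collapse into a hugepage; usually a no-op if the
    // populate above already produced one (Linux 6.1+).
    madvise(start, end - start, MADV_COLLAPSE);
#endif
    return true;
#else
    (void)start; (void)end;
    return false;
#endif
}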
Showing 2 changed files with 72 additions and 19 deletions.
3 changes: 2 additions & 1 deletion src/core/prefault.hh
@@ -22,6 +22,7 @@

#include <atomic>
#include <map>
#include <optional>

#include <seastar/core/posix.hh>
#include <seastar/core/resource.hh>
@@ -40,7 +41,7 @@ public:
    explicit memory_prefaulter(const resource::resources& res, memory::internal::numa_layout layout);
    ~memory_prefaulter();
private:
    void work(std::vector<memory::internal::memory_range>& ranges, size_t page_size);
    void work(std::vector<memory::internal::memory_range>& ranges, size_t page_size, std::optional<size_t> huge_page_size_opt);
};


88 changes: 70 additions & 18 deletions src/core/smp.cc
@@ -25,6 +25,10 @@ module;
#include <boost/range/algorithm/find_if.hpp>
#include <memory>
#include <vector>
#include <regex>
#include <sys/mman.h>
#include <unistd.h>
#include <fcntl.h>

#ifdef SEASTAR_MODULE
module seastar;
@@ -35,6 +39,8 @@ module seastar;
#include <seastar/core/semaphore.hh>
#include <seastar/core/print.hh>
#include <seastar/core/on_internal_error.hh>
#include <seastar/core/posix.hh>
#include <seastar/core/align.hh>
#include "prefault.hh"
#endif

@@ -173,11 +179,32 @@ smp::setup_prefaulter(const seastar::resource::resources& res, seastar::memory::
#endif
}

static
std::optional<size_t>
get_huge_page_size() {
    auto meminfo_fd = file_desc::open("/proc/meminfo", O_RDONLY | O_CLOEXEC);
    std::string meminfo;
    char buf[4096];
    while (auto size_opt = meminfo_fd.read(buf, sizeof(buf))) {
        if (!*size_opt) {
            break;
        }
        meminfo.append(buf, *size_opt);
    }
    static std::regex re(R"(Hugepagesize:\s*(\d+) kB)");
    auto m = std::smatch{};
    if (std::regex_search(meminfo, m, re)) {
        return std::stoi(m[1]) * size_t(1024);
    }
    return std::nullopt;
}

internal::memory_prefaulter::memory_prefaulter(const resource::resources& res, memory::internal::numa_layout layout) {
    for (auto& range : layout.ranges) {
        _layout_by_node_id[range.numa_node_id].push_back(std::move(range));
    }
    auto page_size = getpagesize();
    auto huge_page_size_opt = get_huge_page_size();
    for (auto& numa_node_id_and_ranges : _layout_by_node_id) {
        auto& numa_node_id = numa_node_id_and_ranges.first;
        auto& ranges = numa_node_id_and_ranges.second;
@@ -191,8 +218,8 @@ internal::memory_prefaulter::memory_prefaulter(const resource::resources& res, m
            }
            a.set(cpuset);
        }
        _worker_threads.emplace_back(a, [this, &ranges, page_size] {
            work(ranges, page_size);
        _worker_threads.emplace_back(a, [this, &ranges, page_size, huge_page_size_opt] {
            work(ranges, page_size, huge_page_size_opt);
        });
    }
}
@@ -205,12 +232,33 @@ internal::memory_prefaulter::~memory_prefaulter() {
}

void
internal::memory_prefaulter::work(std::vector<memory::internal::memory_range>& ranges, size_t page_size) {
internal::memory_prefaulter::work(std::vector<memory::internal::memory_range>& ranges, size_t page_size,
        std::optional<size_t> huge_page_size_opt) {
    sched_param param = { .sched_priority = 0 };
    // SCHED_IDLE doesn't work via thread attributes
    pthread_setschedparam(pthread_self(), SCHED_IDLE, &param);
    size_t current_range = 0;
    const size_t batch_size = 512; // happens to match huge page size on x86, though not critical
    const size_t batch_size = huge_page_size_opt.value_or(512*4096);
    auto populate_memory_madvise = [works = true] (char* start, char* end) mutable {
#ifdef MADV_POPULATE_WRITE
        if (works) {
            auto result = madvise(start, end - start, MADV_POPULATE_WRITE);
            if (result == -1 && errno == EINVAL) {
                // Unsupported by kernel
                works = false;
                return false;
            }
            // Ignore other errors. This is just an optimization anyway.
#ifdef MADV_COLLAPSE
            madvise(start, end - start, MADV_COLLAPSE);
            // Also ignore problems with MADV_COLLAPSE, it's just an optimization.
#endif
            return true;
        };
#endif
        (void)works; // suppress warning if block above is elided
        return false;
    };
    auto fault_in_memory = [] (char* p) {
        // Touch the page for write, but be sure not to modify anything
        // The compilers tend to optimize things, so prefer assembly
@@ -227,25 +275,29 @@ internal::memory_prefaulter::work(std::vector<memory::internal::memory_range>& r
        p1->fetch_or(0, std::memory_order_relaxed);
#endif
    };
    while (!_stop_request.load(std::memory_order_relaxed) && !ranges.empty()) {
        auto& range = ranges[current_range];
        // copy everything into locals, or the optimizer will worry about them due
        // to the cast below and not optimize anything
        auto start = range.start;
        auto end = range.end;
        for (size_t i = 0; i < batch_size && start < end; ++i) {
    auto populate_memory = [&] (char* start, char* end) {
        if (populate_memory_madvise(start, end)) {
            return;
        }
        while (start < end) {
            fault_in_memory(start);
            start += page_size;
        }
    };
    while (!_stop_request.load(std::memory_order_relaxed) && !ranges.empty()) {
        auto& range = ranges[current_range];

        auto batch_end = std::min(align_up(range.start + 1, batch_size), range.end);
        populate_memory(range.start, batch_end);
        range.start = batch_end;

        // An end-to-start scan for applications that manage two heaps that
        // grow towards each other.
        for (size_t i = 0; i < batch_size && start < end; ++i) {
            fault_in_memory(end - 1);
            end -= page_size;
        }
        range.start = start;
        range.end = end;
        if (start >= end) {
        auto batch_start = std::max(align_down(range.end - 1, batch_size), range.start);
        populate_memory(batch_start, range.end);
        range.end = batch_start;

        if (range.start >= range.end) {
            ranges.erase(ranges.begin() + current_range);
            current_range = 0;
        }
