Cheng Shao pushed to branch wip/hugepages at Glasgow Haskell Compiler / GHC
Commits:
4cfbe8ff by Teo Camarasu at 2026-03-11T04:46:04+00:00
rts: Implement support for 2MB hugepages
We enable/disable it through a runtime flag (-xH).
When enabled we ensure we only (de)allocate in aligned multiples of 2MB.
Relates to #24760
Co-authored-by: Matthew Pickering
Co-authored-by: Ben Gamari
- - - - -
9 changed files:
- docs/users_guide/runtime_control.rst
- rts/RtsFlags.c
- rts/configure.ac
- rts/include/rts/Flags.h
- rts/posix/OSMem.c
- rts/sm/BlockAlloc.c
- rts/sm/OSMem.h
- testsuite/tests/rts/all.T
- + testsuite/tests/rts/testhugepagesmblockalloc.c
Changes:
=====================================
docs/users_guide/runtime_control.rst
=====================================
@@ -384,6 +384,16 @@ Miscellaneous RTS options
If given, instruct the runtime linker to try to continue linking in the
presence of an unresolved symbol.
+.. rts-flag:: -xH
+
+ This option enables using huge pages to back memory allocations.
+ Use of huge pages can make memory lookups more efficient for applications
+ with high memory usage.
+ Currently we only support 2MB hugepages on Linux.
+
+ If huge pages aren't available to back allocations, then we fall back to
+ regular pages.
+
.. _rts-options-gc:
RTS options to control the garbage collector
=====================================
rts/RtsFlags.c
=====================================
@@ -182,6 +182,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.GcFlags.allocLimitGrace = (100*1024) / BLOCK_SIZE;
RtsFlags.GcFlags.numa = false;
RtsFlags.GcFlags.numaMask = 1;
+ RtsFlags.GcFlags.hugepages = false;
RtsFlags.GcFlags.ringBell = false;
RtsFlags.GcFlags.longGCSync = 0; /* detection turned off */
@@ -570,7 +571,10 @@ usage_text[] = {
#endif
" -xq The allocation limit given to a thread after it receives",
" an AllocationLimitExceeded exception. (default: 100k)",
+#if defined(HUGEPAGE_FLAGS)
+" -xH Try to use hugepages to allocate memory.",
"",
+#endif
#if defined(USE_LARGE_ADDRESS_SPACE)
" -xr The size of virtual memory address space reserved by the",
" two step allocator (default: 1T)",
@@ -1848,11 +1852,11 @@ error = true;
*/
case 'q':
- OPTION_UNSAFE;
- RtsFlags.GcFlags.allocLimitGrace
- = decodeSize(rts_argv[arg], 3, BLOCK_SIZE, HS_INT_MAX)
- / BLOCK_SIZE;
- break;
+ OPTION_UNSAFE;
+ RtsFlags.GcFlags.allocLimitGrace
+ = decodeSize(rts_argv[arg], 3, BLOCK_SIZE, HS_INT_MAX)
+ / BLOCK_SIZE;
+ break;
case 'r':
OPTION_UNSAFE;
@@ -1860,7 +1864,16 @@ error = true;
= decodeSize(rts_argv[arg], 3, MBLOCK_SIZE, HS_WORD64_MAX);
break;
- default:
+ case 'H':
+ OPTION_UNSAFE;
+#if defined(HUGEPAGE_FLAGS)
+ RtsFlags.GcFlags.hugepages = true;
+#else
+ errorBelch("Program not compiled with hugepages support.");
+#endif
+ break;
+
+ default:
OPTION_SAFE;
errorBelch("unknown RTS option: %s",rts_argv[arg]);
error = true;
=====================================
rts/configure.ac
=====================================
@@ -96,7 +96,7 @@ dnl off_t, because it will affect the result of that test.
AC_SYS_LARGEFILE
dnl ** check for specific header (.h) files that we are interested in
-AC_CHECK_HEADERS([ctype.h dlfcn.h errno.h fcntl.h limits.h locale.h nlist.h pthread.h signal.h sys/param.h sys/mman.h sys/resource.h sys/select.h sys/time.h sys/timeb.h sys/timerfd.h sys/timers.h sys/times.h sys/utsname.h sys/wait.h termios.h utime.h windows.h winsock.h sched.h])
+AC_CHECK_HEADERS([ctype.h dlfcn.h errno.h fcntl.h limits.h locale.h nlist.h pthread.h signal.h sys/param.h sys/mman.h linux/mman.h sys/resource.h sys/select.h sys/time.h sys/timeb.h sys/timerfd.h sys/timers.h sys/times.h sys/utsname.h sys/wait.h termios.h utime.h windows.h winsock.h sched.h])
dnl sys/cpuset.h needs sys/param.h to be included first on FreeBSD 9.1; #7708
AC_CHECK_HEADERS([sys/cpuset.h], [], [],
=====================================
rts/include/rts/Flags.h
=====================================
@@ -91,6 +91,7 @@ typedef struct _GC_FLAGS {
StgWord numaMask;
StgWord64 addressSpaceSize; /* large address space size in bytes */
+ bool hugepages; /* Enable hugepages support */
} GC_FLAGS;
/* See Note [Synchronization of flags and base APIs] */
=====================================
rts/posix/OSMem.c
=====================================
@@ -73,6 +73,11 @@
# endif
#endif
+#if defined(HUGEPAGE_FLAGS)
+static int huge_tried = 0;
+static int huge_failed = 0;
+#endif
+
static void *next_request = 0;
void osMemInit(void)
@@ -233,12 +238,28 @@ my_mmap (void *addr, W_ size, int operation)
errorBelch("my_mmap(,,MEM_RESERVE) not supported on this platform");
# endif
} else if (operation == MEM_COMMIT) {
- flags = MAP_FIXED | MAP_ANON | MAP_PRIVATE;
+ flags = MAP_FIXED | MAP_ANONYMOUS | MAP_PRIVATE;
+#if defined(HUGEPAGE_FLAGS)
+ if ( RtsFlags.GcFlags.hugepages &&
+ (size & (HUGEPAGE_SIZE - 1)) == 0) {
+ huge_tried += 1;
+ flags |= HUGEPAGE_FLAGS;
+ }
+#endif /* defined(HUGEPAGE_FLAGS) */
} else {
flags = MAP_ANON | MAP_PRIVATE;
}
ret = mmap(addr, size, prot, flags, -1, 0);
+#if defined(HUGEPAGE_FLAGS)
+ // If the mmap failed, and we tried with HUGEPAGE_FLAGS
+ // then retry without.
+ if (ret == MAP_FAILED && flags & HUGEPAGE_FLAGS){
+ huge_failed += 1;
+ flags &= ~HUGEPAGE_FLAGS;
+ ret = mmap(addr, size, prot, flags, -1, 0);
+ }
+#endif
# if defined(linux_HOST_OS)
if (ret == MAP_FAILED && errno == EPERM) {
// Linux may return EPERM if it tried to give us
@@ -457,6 +478,7 @@ StgWord64 getPhysicalMemorySize (void)
#if defined(USE_LARGE_ADDRESS_SPACE)
+
static void *
osTryReserveHeapMemory (W_ len, void *hint)
{
@@ -470,6 +492,7 @@ osTryReserveHeapMemory (W_ len, void *hint)
and then we discard what we don't need */
base = my_mmap(hint, len + MBLOCK_SIZE, MEM_RESERVE);
+
if (base == NULL)
return NULL;
=====================================
rts/sm/BlockAlloc.c
=====================================
@@ -25,7 +25,8 @@
#include
-static void initMBlock(void *mblock, uint32_t node);
+static void initMBlock(void *mblock, uint32_t node);
+static void free_mega_group (bdescr *mg);
/*
* By default the DEBUG RTS is built with block allocator assertions
@@ -505,13 +506,30 @@ alloc_mega_group (uint32_t node, StgWord mblocks)
else
{
void *mblock;
+ StgWord hugepage_mblocks;
+ if(RtsFlags.GcFlags.hugepages) {
+ // Round up allocation to hugepage size
+ hugepage_mblocks = MBLOCK_ROUND_UP_HUGEPAGE(mblocks);
+ }
+ else {
+ hugepage_mblocks = mblocks;
+ }
+
if (RtsFlags.GcFlags.numa) {
- mblock = getMBlocksOnNode(node, mblocks);
+ mblock = getMBlocksOnNode(node, hugepage_mblocks);
} else {
- mblock = getMBlocks(mblocks);
+ mblock = getMBlocks(hugepage_mblocks);
}
initMBlock(mblock, node); // only need to init the 1st one
bd = FIRST_BDESCR(mblock);
+
+ // Free the slop
+ if(hugepage_mblocks > mblocks) {
+ bdescr *mblock_slop_bd = FIRST_BDESCR((uintptr_t)mblock + (uintptr_t)mblocks*MBLOCK_SIZE);
+ initMBlock(MBLOCK_ROUND_DOWN(mblock_slop_bd), node);
+ mblock_slop_bd->blocks = MBLOCK_GROUP_BLOCKS(hugepage_mblocks - mblocks);
+ free_mega_group(mblock_slop_bd);
+ }
}
bd->blocks = MBLOCK_GROUP_BLOCKS(mblocks);
return bd;
@@ -839,7 +857,7 @@ coalesce_mblocks (bdescr *p)
return q;
}
-static void
+void
free_mega_group (bdescr *mg)
{
bdescr *bd, *prev;
@@ -1226,10 +1244,17 @@ uint32_t returnMemoryToOS(uint32_t n /* megablocks */)
return 0;
#else
bdescr *bd;
+ bdescr *rejects;
+ bdescr *next;
uint32_t node;
- StgWord size;
+ StgWord size, unaligned_size, freeable_size;
uint32_t init_n;
init_n = n;
+ if(RtsFlags.GcFlags.hugepages) {
+ // Invariant: n is always a multiple of the hugepage size
+ // as we can only free whole hugepages.
+ n = MBLOCK_ROUND_DOWN_HUGEPAGE(n);
+ }
// TODO: This is inefficient because this loop will essentially result in
// quadratic runtime behavior: for each call to `freeMBlocks`, the
@@ -1242,22 +1267,72 @@ uint32_t returnMemoryToOS(uint32_t n /* megablocks */)
// ToDo: not fair, we free all the memory starting with node 0.
for (node = 0; n > 0 && node < n_numa_nodes; node++) {
bd = free_mblock_list[node];
+ // 'rejects' is a reversed list of mblocks that need to go back on the
+ // free list.
+ rejects = NULL;
while ((n > 0) && (bd != NULL)) {
size = BLOCKS_TO_MBLOCKS(bd->blocks);
- if (size > n) {
- StgWord newSize = size - n;
- char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
- freeAddr += newSize * MBLOCK_SIZE;
- bd->blocks = MBLOCK_GROUP_BLOCKS(newSize);
- freeMBlocks(freeAddr, n);
- n = 0;
+ next = bd->link;
+ char *aligned_start;
+
+ if(RtsFlags.GcFlags.hugepages) {
+ // we can only free hugepage aligned mblock groups
+ aligned_start = (char*)MBLOCK_ROUND_DOWN(bd) + ((uintptr_t)MBLOCK_ROUND_DOWN(bd) & HUGEPAGE_MASK);
+ unaligned_size = (aligned_start - (char*)MBLOCK_ROUND_DOWN(bd)) / MBLOCK_SIZE;
+ freeable_size = MBLOCK_ROUND_DOWN_HUGEPAGE(size - unaligned_size);
}
else {
- char *freeAddr = MBLOCK_ROUND_DOWN(bd->start);
- n -= size;
- bd = bd->link;
- freeMBlocks(freeAddr, size);
+ aligned_start = (char*)MBLOCK_ROUND_DOWN(bd);
+ unaligned_size = 0;
+ freeable_size = size;
}
+
+ // We cannot free more than n
+ // Note: n is a multiple of the hugepage size,
+ // so freeable_size will also continue to be a multiple.
+ freeable_size = stg_min(n, freeable_size);
+
+ // Place the front unaligned section back on the list.
+ // If we can't free any of it then this is the entire thing.
+ if (unaligned_size > 0 || freeable_size == 0) {
+ bd->link = rejects;
+ rejects = bd;
+ // We are freeing some mblocks from the middle
+ if (freeable_size > 0) {
+ bd->blocks = MBLOCK_GROUP_BLOCKS(unaligned_size);
+ bdescr *aligned_bd;
+ aligned_bd = FIRST_BDESCR(aligned_start);
+ aligned_bd->blocks = MBLOCK_GROUP_BLOCKS(freeable_size);
+ }
+ }
+
+ if(freeable_size > 0) {
+ // Free the mblocks
+ n -= freeable_size;
+ freeMBlocks(aligned_start, freeable_size);
+ // add the slop to the rejects list
+ if (size - unaligned_size - freeable_size > 0)
+ {
+ void *slop = aligned_start + freeable_size * MBLOCK_SIZE;
+ bdescr* slop_bd = FIRST_BDESCR(slop);
+ slop_bd->blocks = MBLOCK_GROUP_BLOCKS(size - unaligned_size - freeable_size);
+ slop_bd->link = rejects;
+ initMBlock(slop, node);
+ rejects = slop_bd;
+ }
+ }
+ bd = next;
+ }
+ // Place the rejected mblocks back on the free list.
+ // Note: this preserves the order.
+ while(rejects) {
+ // pop the top of the rejects list.
+ next = rejects;
+ rejects = next->link;
+ // place it back on the free list.
+ next->link = bd;
+ ASSERT(next < bd || bd == NULL);
+ bd = next;
}
free_mblock_list[node] = bd;
}
=====================================
rts/sm/OSMem.h
=====================================
@@ -8,7 +8,21 @@
#pragma once
+#if defined(HAVE_LINUX_MMAN_H)
+#include <linux/mman.h>
+
+#define HUGEPAGE_SHIFT 21
+#define HUGEPAGE_FLAGS (MAP_HUGETLB | MAP_HUGE_2MB)
+#else
+#define HUGEPAGE_SHIFT MBLOCK_SHIFT
+#endif
+
#include "BeginPrivate.h"
+GHC_STATIC_ASSERT(HUGEPAGE_SHIFT >= MBLOCK_SHIFT, "mblock size must not exceed 2MB huge page size");
/* Size in bytes of one huge page, and the mask selecting the byte offset
 * inside a huge page. Both are derived from HUGEPAGE_SHIFT. */
#define HUGEPAGE_SIZE (1 << HUGEPAGE_SHIFT)
#define HUGEPAGE_MASK ((1 << HUGEPAGE_SHIFT) - 1)
/* Mask selecting the index of an mblock inside its huge page, for quantities
 * counted in mblocks. It is (mblocks per huge page) - 1, and is 0 when
 * HUGEPAGE_SHIFT == MBLOCK_SHIFT. Requires HUGEPAGE_SHIFT >= MBLOCK_SHIFT. */
#define HUGEPAGE_MBLOCK_MASK ((1 << (HUGEPAGE_SHIFT - MBLOCK_SHIFT)) - 1)
/* Round an mblock count x down / up to a whole number of huge pages.
 * The mask must be built from the number of mblocks per huge page; using the
 * shift difference itself as the mask is only right when that difference is
 * 0 or 1. Each macro evaluates x exactly once. */
#define MBLOCK_ROUND_DOWN_HUGEPAGE(x) ((x) & ~HUGEPAGE_MBLOCK_MASK)
#define MBLOCK_ROUND_UP_HUGEPAGE(x) (((x) + HUGEPAGE_MBLOCK_MASK) & ~HUGEPAGE_MBLOCK_MASK)
void osMemInit(void);
void *osGetMBlocks(uint32_t n);
=====================================
testsuite/tests/rts/all.T
=====================================
@@ -15,6 +15,12 @@ test('testmblockalloc',
# which will crash because the mblocks we allocate are not in a state
# the leak detector is expecting.
+# A variant of the above that tries to use hugepages
+test('testhugepagesmblockalloc',
+ [c_src, only_ways(['normal','threaded1']), extra_run_opts('+RTS -I0 -xr0.125T -xH'),
+ unless(opsys('linux'), skip)], # Huge pages are only currently supported on Linux
+ compile_and_run, [''])
+
# See bug #101, test requires +RTS -c (or equivalently +RTS -M<something>)
# only GHCi triggers the bug, but we run the test all ways for completeness.
=====================================
testsuite/tests/rts/testhugepagesmblockalloc.c
=====================================
@@ -0,0 +1,75 @@
+#include "Rts.h"
+
+#include <stdlib.h>
+
+// 16 * 64 == max 1GB
+const int MAXALLOC = 16;
+const int ARRSIZE = 64;
+
+const int LOOPS = 1000;
+const int SEED = 0xf00f00;
+
+extern StgWord mblocks_allocated;
+
+int main (int argc, char *argv[])
+{
+ int i, j, b;
+
+ void *a[ARRSIZE];
+ uint32_t sizes[ARRSIZE];
+
+ srand(SEED);
+
+ {
+ RtsConfig conf = defaultRtsConfig;
+ conf.rts_opts_enabled = RtsOptsAll;
+ hs_init_ghc(&argc, &argv, conf);
+ }
+
+ // repeatedly sweep through the array, allocating new random-sized
+ // objects and deallocating the old ones.
+ for (i=0; i < LOOPS; i++)
+ {
+ for (j=0; j < ARRSIZE; j++)
+ {
+ if (i > 0)
+ {
+ freeMBlocks(a[j], sizes[j]);
+ }
+ b = (rand() % MAXALLOC) + 1;
+ a[j] = getMBlocks(b);
+ sizes[j] = b;
+ }
+ }
+
+ releaseFreeMemory();
+
+ for (j=0; j < ARRSIZE; j++)
+ {
+ freeMBlocks(a[j], sizes[j]);
+ }
+
+ releaseFreeMemory();
+
+ // this time, sweep forwards allocating new blocks, and then
+ // backwards deallocating them.
+ for (i=0; i < LOOPS; i++)
+ {
+ for (j=0; j < ARRSIZE; j++)
+ {
+ b = (rand() % MAXALLOC) + 1;
+ a[j] = getMBlocks(b);
+ sizes[j] = b;
+ }
+ for (j=ARRSIZE-1; j >= 0; j--)
+ {
+ freeMBlocks(a[j], sizes[j]);
+ }
+ }
+
+ releaseFreeMemory();
+
+ hs_exit(); // will do a memory leak test
+
+ exit(0);
+}
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/4cfbe8ff8a57dbf657a34b95ff0cc58c...
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/4cfbe8ff8a57dbf657a34b95ff0cc58c...
You're receiving this email because of your account on gitlab.haskell.org.