Skip to content

Commit c058a4c

Browse files
d-netto authored and RAI CI (GitHub Action Automation) committed
add a compile-time option to enable 4k page sizes (JuliaLang#52229)
We're suffering from heavy fragmentation in some of our workloads. Add a build-time option to enable 4k pages (instead of 16k) in the GC, since that improves memory utilization considerably for us. The drawback is that this may increase the number of `madvise` system calls in the sweeping phase by a factor of 4, but concurrent page sweeping should help with some of that.
1 parent d155ffd commit c058a4c

File tree

4 files changed

+68
-18
lines changed

4 files changed

+68
-18
lines changed

src/gc.h

+23-1
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,12 @@
3333
extern "C" {
3434
#endif
3535

36+
#ifdef GC_SMALL_PAGE
37+
#define GC_PAGE_LG2 12 // log2(size of a page)
38+
#else
3639
#define GC_PAGE_LG2 14 // log2(size of a page)
37-
#define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k
40+
#endif
41+
#define GC_PAGE_SZ (1 << GC_PAGE_LG2)
3842
#define GC_PAGE_OFFSET (JL_HEAP_ALIGNMENT - (sizeof(jl_taggedvalue_t) % JL_HEAP_ALIGNMENT))
3943

4044
#define jl_malloc_tag ((void*)0xdeadaa01)
@@ -227,6 +231,23 @@ typedef struct {
227231
_Atomic(size_t) n_pages_allocd;
228232
} gc_fragmentation_stat_t;
229233

234+
#ifdef GC_SMALL_PAGE
235+
#ifdef _P64
236+
#define REGION0_PG_COUNT (1 << 16)
237+
#define REGION1_PG_COUNT (1 << 18)
238+
#define REGION2_PG_COUNT (1 << 18)
239+
#define REGION0_INDEX(p) (((uintptr_t)(p) >> 12) & 0xFFFF) // shift by GC_PAGE_LG2
240+
#define REGION1_INDEX(p) (((uintptr_t)(p) >> 28) & 0x3FFFF)
241+
#define REGION_INDEX(p) (((uintptr_t)(p) >> 46) & 0x3FFFF)
242+
#else
243+
#define REGION0_PG_COUNT (1 << 10)
244+
#define REGION1_PG_COUNT (1 << 10)
245+
#define REGION2_PG_COUNT (1 << 0)
246+
#define REGION0_INDEX(p) (((uintptr_t)(p) >> 12) & 0x3FF) // shift by GC_PAGE_LG2
247+
#define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF)
248+
#define REGION_INDEX(p) (0)
249+
#endif
250+
#else
230251
#ifdef _P64
231252
#define REGION0_PG_COUNT (1 << 16)
232253
#define REGION1_PG_COUNT (1 << 16)
@@ -242,6 +263,7 @@ typedef struct {
242263
#define REGION1_INDEX(p) (((uintptr_t)(p) >> 22) & 0x3FF)
243264
#define REGION_INDEX(p) (0)
244265
#endif
266+
#endif
245267

246268
// define the representation of the levels of the page-table (0 to 2)
247269
typedef struct {

src/julia_internal.h

+37-8
Original file line numberDiff line numberDiff line change
@@ -375,24 +375,48 @@ static const int jl_gc_sizeclasses[] = {
375375
144, 160, 176, 192, 208, 224, 240, 256,
376376

377377
// the following tables are computed for maximum packing efficiency via the formula:
378-
// pg = 2^14
378+
// pg = GC_SMALL_PAGE ? 2^12 : 2^14
379379
// sz = (div.(pg-8, rng).÷16)*16; hcat(sz, (pg-8).÷sz, pg .- (pg-8).÷sz.*sz)'
380380

381+
#ifdef GC_SMALL_PAGE
382+
// rng = 15:-1:2 (14 pools)
383+
272, 288, 304, 336, 368, 400, 448, 496, 576, 672, 816, 1008, 1360, 2032
384+
// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, /pool
385+
// 16, 64, 144, 64, 48, 96, 64, 128, 64, 64, 16, 64, 16, 32, bytes lost
386+
#else
381387
// rng = 60:-4:32 (8 pools)
382388
272, 288, 304, 336, 368, 400, 448, 496,
383-
// 60, 56, 53, 48, 44, 40, 36, 33, /pool
384-
// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost
389+
// 60, 56, 53, 48, 44, 40, 36, 33, /pool
390+
// 64, 256, 272, 256, 192, 384, 256, 16, bytes lost
385391

386392
// rng = 30:-2:16 (8 pools)
387393
544, 576, 624, 672, 736, 816, 896, 1008,
388-
// 30, 28, 26, 24, 22, 20, 18, 16, /pool
389-
// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost
394+
// 30, 28, 26, 24, 22, 20, 18, 16, /pool
395+
// 64, 256, 160, 256, 192, 64, 256, 256, bytes lost
390396

391397
// rng = 15:-1:8 (8 pools)
392398
1088, 1168, 1248, 1360, 1488, 1632, 1808, 2032
393-
// 15, 14, 13, 12, 11, 10, 9, 8, /pool
394-
// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost
399+
// 15, 14, 13, 12, 11, 10, 9, 8, /pool
400+
// 64, 32, 160, 64, 16, 64, 112, 128, bytes lost
401+
#endif
395402
};
403+
#ifdef GC_SMALL_PAGE
404+
#ifdef _P64
405+
# define JL_GC_N_POOLS 39
406+
#elif MAX_ALIGN == 8
407+
# define JL_GC_N_POOLS 40
408+
#else
409+
# define JL_GC_N_POOLS 41
410+
#endif
411+
#else
412+
#ifdef _P64
413+
# define JL_GC_N_POOLS 49
414+
#elif MAX_ALIGN == 8
415+
# define JL_GC_N_POOLS 50
416+
#else
417+
# define JL_GC_N_POOLS 51
418+
#endif
419+
#endif
396420
static_assert(sizeof(jl_gc_sizeclasses) / sizeof(jl_gc_sizeclasses[0]) == JL_GC_N_POOLS, "");
397421

398422
STATIC_INLINE int jl_gc_alignment(size_t sz) JL_NOTSAFEPOINT
@@ -419,7 +443,12 @@ JL_DLLEXPORT int jl_alignment(size_t sz) JL_NOTSAFEPOINT;
419443

420444
// the following table is computed as:
421445
// [searchsortedfirst(jl_gc_sizeclasses, i) - 1 for i = 0:16:jl_gc_sizeclasses[end]]
422-
static const uint8_t szclass_table[] = {0, 1, 3, 5, 7, 9, 11, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 44, 44, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48};
446+
static const uint8_t szclass_table[] =
447+
#ifdef GC_SMALL_PAGE
448+
{0,1,3,5,7,9,11,13,15,17,18,19,20,21,22,23,24,25,26,27,28,28,29,29,30,30,31,31,31,32,32,32,33,33,33,33,33,34,34,34,34,34,34,35,35,35,35,35,35,35,35,35,36,36,36,36,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38,38};
449+
#else
450+
{0,1,3,5,7,9,11,13,15,17,18,19,20,21,22,23,24,25,26,27,28,28,29,29,30,30,31,31,31,32,32,32,33,33,33,34,34,35,35,35,36,36,36,37,37,37,37,38,38,38,38,38,39,39,39,39,39,40,40,40,40,40,40,40,41,41,41,41,41,42,42,42,42,42,43,43,43,43,43,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,46,46,46,46,46,46,46,46,46,47,47,47,47,47,47,47,47,47,47,47,48,48,48,48,48,48,48,48,48,48,48,48,48,48};
451+
#endif
423452
static_assert(sizeof(szclass_table) == 128, "");
424453

425454
STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass(unsigned sz) JL_NOTSAFEPOINT

src/julia_threads.h

+3-9
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
#ifndef JL_THREADS_H
55
#define JL_THREADS_H
66

7-
#include "work-stealing-queue.h"
87
#include "julia_atomics.h"
8+
#include "work-stealing-queue.h"
99
#ifndef _OS_WINDOWS_
1010
#include "pthread.h"
1111
#endif
@@ -161,14 +161,8 @@ typedef struct {
161161
arraylist_t *last_remset;
162162

163163
// variables for allocating objects from pools
164-
#ifdef _P64
165-
# define JL_GC_N_POOLS 49
166-
#elif MAX_ALIGN == 8
167-
# define JL_GC_N_POOLS 50
168-
#else
169-
# define JL_GC_N_POOLS 51
170-
#endif
171-
jl_gc_pool_t norm_pools[JL_GC_N_POOLS];
164+
#define JL_GC_N_MAX_POOLS 51 // conservative. must be kept in sync with `src/julia_internal.h`
165+
jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS];
172166

173167
#define JL_N_STACK_POOLS 16
174168
small_arraylist_t free_stacks[JL_N_STACK_POOLS];

src/options.h

+5
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,11 @@
7878
// OBJPROFILE counts objects by type
7979
// #define OBJPROFILE
8080

81+
// pool allocator configuration options
82+
83+
// GC_SMALL_PAGE allocates objects in 4k pages
84+
// #define GC_SMALL_PAGE
85+
8186

8287
// method dispatch profiling --------------------------------------------------
8388

0 commit comments

Comments
 (0)