[release-branch.go1.21] runtime: avoid MADV_HUGEPAGE for heap memory

author Michael Anthony Knyszek <mknyszek@google.com>

Mon, 7 Aug 2023 19:09:59 +0000 (19:09 +0000)

committer Gopher Robot <gobot@golang.org>

Wed, 30 Aug 2023 21:59:35 +0000 (21:59 +0000)
author Michael Anthony Knyszek <mknyszek@google.com>
Mon, 7 Aug 2023 19:09:59 +0000 (19:09 +0000)
committer Gopher Robot <gobot@golang.org>
Wed, 30 Aug 2023 21:59:35 +0000 (21:59 +0000)
diff --git a/src/runtime/defs_linux_386.go b/src/runtime/defs_linux_386.go

index 72339f4aa5315f6e1a3c1cee7cfcc230b7be9e82..5fef55610f39d2965933fd8d677fce56af357720 100644 (file)
--- a/src/runtime/defs_linux_386.go
+++ b/src/runtime/defs_linux_386.go
@@ -23,6 +23,7 @@ const (
         _MADV_FREE       = 0x8
         _MADV_HUGEPAGE   = 0xe
         _MADV_NOHUGEPAGE = 0xf
+       _MADV_COLLAPSE   = 0x19
  
         _SA_RESTART  = 0x10000000
         _SA_ONSTACK  = 0x8000000
diff --git a/src/runtime/defs_linux_amd64.go b/src/runtime/defs_linux_amd64.go

index 298f3ebf7caf81d2016cd0c1db483c28290e131a..dce7799b6adc9cb2cdc1e7001775109a72943eb0 100644 (file)
--- a/src/runtime/defs_linux_amd64.go
+++ b/src/runtime/defs_linux_amd64.go
@@ -23,6 +23,7 @@ const (
         _MADV_FREE       = 0x8
         _MADV_HUGEPAGE   = 0xe
         _MADV_NOHUGEPAGE = 0xf
+       _MADV_COLLAPSE   = 0x19
  
         _SA_RESTART  = 0x10000000
         _SA_ONSTACK  = 0x8000000
diff --git a/src/runtime/defs_linux_arm.go b/src/runtime/defs_linux_arm.go

index 6fee57dacf70ad03cc480749f9d8ca898c2f34aa..71cf8c6d50e7609bb5fb492197afe3821e88c294 100644 (file)
--- a/src/runtime/defs_linux_arm.go
+++ b/src/runtime/defs_linux_arm.go
@@ -25,6 +25,7 @@ const (
         _MADV_FREE       = 0x8
         _MADV_HUGEPAGE   = 0xe
         _MADV_NOHUGEPAGE = 0xf
+       _MADV_COLLAPSE   = 0x19
  
         _SA_RESTART     = 0x10000000
         _SA_ONSTACK     = 0x8000000
diff --git a/src/runtime/defs_linux_arm64.go b/src/runtime/defs_linux_arm64.go

index 02160963011fef19e1ad9c7b92fca57074751a8c..606cd70494e37ad9b23f81b2e8e3b286267f5269 100644 (file)
--- a/src/runtime/defs_linux_arm64.go
+++ b/src/runtime/defs_linux_arm64.go
@@ -23,6 +23,7 @@ const (
         _MADV_FREE       = 0x8
         _MADV_HUGEPAGE   = 0xe
         _MADV_NOHUGEPAGE = 0xf
+       _MADV_COLLAPSE   = 0x19
  
         _SA_RESTART  = 0x10000000
         _SA_ONSTACK  = 0x8000000
diff --git a/src/runtime/defs_linux_loong64.go b/src/runtime/defs_linux_loong64.go

index 6eca18bdae24d905ec8d494778407633717e0f5d..692d8c78e9a4d2d3f02d7748b56cb30baac04f8a 100644 (file)
--- a/src/runtime/defs_linux_loong64.go
+++ b/src/runtime/defs_linux_loong64.go
@@ -24,6 +24,7 @@ const (
         _MADV_FREE       = 0x8
         _MADV_HUGEPAGE   = 0xe
         _MADV_NOHUGEPAGE = 0xf
+       _MADV_COLLAPSE   = 0x19
  
         _SA_RESTART  = 0x10000000
         _SA_ONSTACK  = 0x8000000
diff --git a/src/runtime/defs_linux_mips64x.go b/src/runtime/defs_linux_mips64x.go

index 2e8c4056bae3f540f4633d422e5f9791d4ae78d7..8a0af41234f22629343f24ebb0958f58d7078d33 100644 (file)
--- a/src/runtime/defs_linux_mips64x.go
+++ b/src/runtime/defs_linux_mips64x.go
@@ -26,6 +26,7 @@ const (
         _MADV_FREE       = 0x8
         _MADV_HUGEPAGE   = 0xe
         _MADV_NOHUGEPAGE = 0xf
+       _MADV_COLLAPSE   = 0x19
  
         _SA_RESTART = 0x10000000
         _SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_mipsx.go b/src/runtime/defs_linux_mipsx.go

index 7593600cc6af385218b06e45c0f981ff376eae9d..8322beab2b1c20e536b425182fe8630c059078bb 100644 (file)
--- a/src/runtime/defs_linux_mipsx.go
+++ b/src/runtime/defs_linux_mipsx.go
@@ -26,6 +26,7 @@ const (
         _MADV_FREE       = 0x8
         _MADV_HUGEPAGE   = 0xe
         _MADV_NOHUGEPAGE = 0xf
+       _MADV_COLLAPSE   = 0x19
  
         _SA_RESTART = 0x10000000
         _SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_ppc64.go b/src/runtime/defs_linux_ppc64.go

index bb3ac016e59d04237980ea237b244f0214d16d49..f87924affe65ef190e95da2cf7bf768cb2c58c1e 100644 (file)
--- a/src/runtime/defs_linux_ppc64.go
+++ b/src/runtime/defs_linux_ppc64.go
@@ -23,6 +23,7 @@ const (
         _MADV_FREE       = 0x8
         _MADV_HUGEPAGE   = 0xe
         _MADV_NOHUGEPAGE = 0xf
+       _MADV_COLLAPSE   = 0x19
  
         _SA_RESTART = 0x10000000
         _SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_ppc64le.go b/src/runtime/defs_linux_ppc64le.go

index bb3ac016e59d04237980ea237b244f0214d16d49..f87924affe65ef190e95da2cf7bf768cb2c58c1e 100644 (file)
--- a/src/runtime/defs_linux_ppc64le.go
+++ b/src/runtime/defs_linux_ppc64le.go
@@ -23,6 +23,7 @@ const (
         _MADV_FREE       = 0x8
         _MADV_HUGEPAGE   = 0xe
         _MADV_NOHUGEPAGE = 0xf
+       _MADV_COLLAPSE   = 0x19
  
         _SA_RESTART = 0x10000000
         _SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_riscv64.go b/src/runtime/defs_linux_riscv64.go

index ce4a7f36cd57d4f9155c93c1eb45c91d00ab9dc8..29b1ef2a50a94c9f6dd6be96b4e08ed03c4d4dad 100644 (file)
--- a/src/runtime/defs_linux_riscv64.go
+++ b/src/runtime/defs_linux_riscv64.go
@@ -24,6 +24,7 @@ const (
         _MADV_FREE       = 0x8
         _MADV_HUGEPAGE   = 0xe
         _MADV_NOHUGEPAGE = 0xf
+       _MADV_COLLAPSE   = 0x19
  
         _SA_RESTART  = 0x10000000
         _SA_ONSTACK  = 0x8000000
diff --git a/src/runtime/defs_linux_s390x.go b/src/runtime/defs_linux_s390x.go

index 36497dd40dc6409f22112b90ee9ccc4aac97cacd..b0280213b3d12f6207ad44b83dd88edded9bd066 100644 (file)
--- a/src/runtime/defs_linux_s390x.go
+++ b/src/runtime/defs_linux_s390x.go
@@ -24,6 +24,7 @@ const (
         _MADV_FREE       = 0x8
         _MADV_HUGEPAGE   = 0xe
         _MADV_NOHUGEPAGE = 0xf
+       _MADV_COLLAPSE   = 0x19
  
         _SA_RESTART = 0x10000000
         _SA_ONSTACK = 0x8000000
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go

index 5641005401aa41e8fa7cd547b2905446d40f1920..979eb74332148016e845f81fecc0418db9c4aef7 100644 (file)
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -1819,8 +1819,8 @@ func (s *ScavengeIndex) SetEmpty(ci ChunkIdx) {
         s.i.setEmpty(chunkIdx(ci))
  }
  
-func (s *ScavengeIndex) SetNoHugePage(ci ChunkIdx) bool {
-       return s.i.setNoHugePage(chunkIdx(ci))
+func (s *ScavengeIndex) SetNoHugePage(ci ChunkIdx) {
+       s.i.setNoHugePage(chunkIdx(ci))
  }
  
  func CheckPackScavChunkData(gen uint32, inUse, lastInUse uint16, flags uint8) bool {
diff --git a/src/runtime/mem.go b/src/runtime/mem.go

index 7b019052247d942e70405fd6085ab21368744582..22688d51d5e3fe7b618b9669ea6c31e8661cce5d 100644 (file)
--- a/src/runtime/mem.go
+++ b/src/runtime/mem.go
@@ -91,6 +91,12 @@ func sysNoHugePage(v unsafe.Pointer, n uintptr) {
         sysNoHugePageOS(v, n)
  }
  
+// sysHugePageCollapse attempts to immediately back the provided memory region
+// with huge pages. It is best-effort and may fail silently.
+func sysHugePageCollapse(v unsafe.Pointer, n uintptr) {
+       sysHugePageCollapseOS(v, n)
+}
+
  // sysFree transitions a memory region from any state to None. Therefore, it
  // returns memory unconditionally. It is used if an out-of-memory error has been
  // detected midway through an allocation or to carve out an aligned section of
diff --git a/src/runtime/mem_aix.go b/src/runtime/mem_aix.go

index deae61635cddd42f0fb85c99be1840b58d46609e..dff2756d971ab1d36d948a8246b58d1aa1fd990c 100644 (file)
--- a/src/runtime/mem_aix.go
+++ b/src/runtime/mem_aix.go
@@ -41,6 +41,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
  func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
  }
  
+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
  // Don't split the stack as this function may be invoked without a valid G,
  // which prevents us from allocating more stack.
  //
diff --git a/src/runtime/mem_bsd.go b/src/runtime/mem_bsd.go

index a9025ad015338098f09f4b6cd19e4d85ee076276..78128aedf7b069e1028a6cc7fec03905cc5abbc1 100644 (file)
--- a/src/runtime/mem_bsd.go
+++ b/src/runtime/mem_bsd.go
@@ -39,6 +39,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
  func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
  }
  
+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
  // Don't split the stack as this function may be invoked without a valid G,
  // which prevents us from allocating more stack.
  //
diff --git a/src/runtime/mem_darwin.go b/src/runtime/mem_darwin.go

index 1e3e53d45b94a42fd0788a2b46ca957ea3676d95..ae8487127cfdd7aabd903cc06125795d6c8849fd 100644 (file)
--- a/src/runtime/mem_darwin.go
+++ b/src/runtime/mem_darwin.go
@@ -39,6 +39,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
  func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
  }
  
+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
  // Don't split the stack as this function may be invoked without a valid G,
  // which prevents us from allocating more stack.
  //
diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go

index bdfab13fed9c8255a504f0a6fb0715c3178a2635..c9823d30113310b29c7c40495dd9b5ba6b1bb6b8 100644 (file)
--- a/src/runtime/mem_linux.go
+++ b/src/runtime/mem_linux.go
@@ -116,6 +116,31 @@ func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
         madvise(v, n, _MADV_NOHUGEPAGE)
  }
  
+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+       if uintptr(v)&(physPageSize-1) != 0 {
+               // The Linux implementation requires that the address
+               // addr be page-aligned, and allows length to be zero.
+               throw("unaligned sysHugePageCollapseOS")
+       }
+       if physHugePageSize == 0 {
+               return
+       }
+       // N.B. If you find yourself debugging this code, note that
+       // this call can fail with EAGAIN because it's best-effort.
+       // Also, when it returns an error, it's only for the last
+       // huge page in the region requested.
+       //
+       // It can also sometimes return EINVAL if the corresponding
+       // region hasn't been backed by physical memory. This is
+       // difficult to guarantee in general, and it also means
+       // there's no way to distinguish whether this syscall is
+       // actually available. Oops.
+       //
+       // Anyway, that's why this call just doesn't bother checking
+       // any errors.
+       madvise(v, n, _MADV_COLLAPSE)
+}
+
  // Don't split the stack as this function may be invoked without a valid G,
  // which prevents us from allocating more stack.
  //
diff --git a/src/runtime/mem_sbrk.go b/src/runtime/mem_sbrk.go

index c8f50e7bd57e6b392c8e5168d52d66d0789c11f0..dc0a764a2cac86f0c9ce9d6b46880e0ee1be3efe 100644 (file)
--- a/src/runtime/mem_sbrk.go
+++ b/src/runtime/mem_sbrk.go
@@ -163,6 +163,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
  func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
  }
  
+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
  func sysMapOS(v unsafe.Pointer, n uintptr) {
  }
  
diff --git a/src/runtime/mem_windows.go b/src/runtime/mem_windows.go

index c11abc17addd092f9894bf7408b9842d5fa63836..477d8988702c67e2b6cb9a705bfd1701b0473e37 100644 (file)
--- a/src/runtime/mem_windows.go
+++ b/src/runtime/mem_windows.go
@@ -97,6 +97,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
  func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
  }
  
+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
  // Don't split the stack as this function may be invoked without a valid G,
  // which prevents us from allocating more stack.
  //
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go

index 82a94be22a7ea51479136617db23f89ac873f40c..4c6d6be4f04b754412fdd2337537d1b957c47bd7 100644 (file)
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -771,7 +771,7 @@ func (p *pageAlloc) scavengeOne(ci chunkIdx, searchIdx uint, max uintptr) uintpt
  
                         // Grab whether the chunk is hugepage backed and if it is,
                         // clear it. We're about to break up this huge page.
-                       shouldNoHugePage := p.scav.index.setNoHugePage(ci)
+                       p.scav.index.setNoHugePage(ci)
  
                         // With that done, it's safe to unlock.
                         unlock(p.mheapLock)
@@ -781,9 +781,6 @@ func (p *pageAlloc) scavengeOne(ci chunkIdx, searchIdx uint, max uintptr) uintpt
  
                                 // Only perform sys* operations if we're not in a test.
                                 // It's dangerous to do so otherwise.
-                               if shouldNoHugePage {
-                                       sysNoHugePage(unsafe.Pointer(chunkBase(ci)), pallocChunkBytes)
-                               }
                                 sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)
  
                                 // Update global accounting only when not in test, otherwise
@@ -1134,17 +1131,34 @@ func (s *scavengeIndex) find(force bool) (chunkIdx, uint) {
  }
  
  // alloc updates metadata for chunk at index ci with the fact that
-// an allocation of npages occurred.
+// an allocation of npages occurred. It also eagerly attempts to collapse
+// the chunk's memory into a huge page if the chunk has become sufficiently
+// dense and we're not allocating the whole chunk at once (which suggests
+// the allocation is part of a bigger one and it's probably not worth
+// eagerly collapsing).
  //
  // alloc may only run concurrently with find.
  func (s *scavengeIndex) alloc(ci chunkIdx, npages uint) {
         sc := s.chunks[ci].load()
         sc.alloc(npages, s.gen)
         if !sc.isHugePage() && sc.inUse > scavChunkHiOccPages {
-               // Mark dense chunks as specifically backed by huge pages.
+               // Mark that we're considering this chunk as backed by huge pages.
                 sc.setHugePage()
-               if !s.test {
-                       sysHugePage(unsafe.Pointer(chunkBase(ci)), pallocChunkBytes)
+
+               // Collapse dense chunks into huge pages and mark that
+               // we did that, but only if we're not allocating to
+               // use the entire chunk. If we're allocating an entire chunk,
+               // this is likely part of a much bigger allocation. For
+               // instance, if the caller is allocating a 1 GiB slice of bytes, we
+               // don't want to go and manually collapse all those pages; we want
+               // them to be demand-paged. If the caller is actually going to use
+               // all that memory, it'll naturally get backed by huge pages later.
+               //
+               // This also avoids having sysHugePageCollapse fail. On Linux,
+               // the call requires that some part of the huge page being collapsed
+               // is already paged in.
+               if !s.test && npages < pallocChunkPages {
+                       sysHugePageCollapse(unsafe.Pointer(chunkBase(ci)), pallocChunkBytes)
                 }
         }
         s.chunks[ci].store(sc)
@@ -1204,14 +1218,13 @@ func (s *scavengeIndex) setEmpty(ci chunkIdx) {
  // Returns true if the set was successful (not already backed by huge pages).
  //
  // setNoHugePage may only run concurrently with find.
-func (s *scavengeIndex) setNoHugePage(ci chunkIdx) bool {
+func (s *scavengeIndex) setNoHugePage(ci chunkIdx) {
         val := s.chunks[ci].load()
         if !val.isHugePage() {
-               return false
+               return
         }
         val.setNoHugePage()
         s.chunks[ci].store(val)
-       return true
  }
  
  // atomicScavChunkData is an atomic wrapper around a scavChunkData
@@ -1282,8 +1295,9 @@ const (
         // file. The reason we say "HasFree" here is so the zero value is
         // correct for a newly-grown chunk. (New memory is scavenged.)
         scavChunkHasFree scavChunkFlags = 1 << iota
-       // scavChunkNoHugePage indicates whether this chunk has been marked
-       // sysNoHugePage. If not set, it means the chunk is marked sysHugePage.
+       // scavChunkNoHugePage indicates whether this chunk has had any huge
+       // pages broken by the scavenger.
+       //
         // The negative here is unfortunate, but necessary to make it so that
         // the zero value of scavChunkData accurately represents the state of
         // a newly-grown chunk. (New memory is marked as backed by huge pages.)
diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go

index ed53a5672b8d6177b97b049439abec39219d54bb..3e789ab85cc0192587dde791cc8fab5f113a2cee 100644 (file)
--- a/src/runtime/mpagealloc.go
+++ b/src/runtime/mpagealloc.go
@@ -426,11 +426,6 @@ func (p *pageAlloc) grow(base, size uintptr) {
         // we need to ensure this newly-free memory is visible in the
         // summaries.
         p.update(base, size/pageSize, true, false)
-
-       // Mark all new memory as huge page eligible.
-       if !p.test {
-               sysHugePage(unsafe.Pointer(base), size)
-       }
  }
  
  // enableChunkHugePages enables huge pages for the chunk bitmap mappings (disabled by default).
author	Michael Anthony Knyszek <mknyszek@google.com>
	Mon, 7 Aug 2023 19:09:59 +0000 (19:09 +0000)
committer	Gopher Robot <gobot@golang.org>
	Wed, 30 Aug 2023 21:59:35 +0000 (21:59 +0000)
src/runtime/defs_linux_386.go		patch \| blob \| history
src/runtime/defs_linux_amd64.go		patch \| blob \| history
src/runtime/defs_linux_arm.go		patch \| blob \| history
src/runtime/defs_linux_arm64.go		patch \| blob \| history
src/runtime/defs_linux_loong64.go		patch \| blob \| history
src/runtime/defs_linux_mips64x.go		patch \| blob \| history
src/runtime/defs_linux_mipsx.go		patch \| blob \| history
src/runtime/defs_linux_ppc64.go		patch \| blob \| history
src/runtime/defs_linux_ppc64le.go		patch \| blob \| history
src/runtime/defs_linux_riscv64.go		patch \| blob \| history
src/runtime/defs_linux_s390x.go		patch \| blob \| history
src/runtime/export_test.go		patch \| blob \| history
src/runtime/mem.go		patch \| blob \| history
src/runtime/mem_aix.go		patch \| blob \| history
src/runtime/mem_bsd.go		patch \| blob \| history
src/runtime/mem_darwin.go		patch \| blob \| history
src/runtime/mem_linux.go		patch \| blob \| history
src/runtime/mem_sbrk.go		patch \| blob \| history
src/runtime/mem_windows.go		patch \| blob \| history
src/runtime/mgcscavenge.go		patch \| blob \| history
src/runtime/mpagealloc.go		patch \| blob \| history