!189 golang-1.15: fix a deadlock issue when a signal is received

From: @hcnbxx 
Reviewed-by: @jing-rui 
Signed-off-by: @jing-rui
Committed by openeuler-ci-bot on 2023-04-14 09:47:17 +00:00 via Gitee
commit af8ef0f258
3 changed files with 547 additions and 1 deletion
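
The deadlock class patch 1 below removes is the usual non-reentrant-lock pattern: runtime locks, like sync.Mutex, cannot be re-acquired by the thread that already holds them. A minimal stand-alone illustration in ordinary Go (hypothetical names, not runtime code):

package main

import "sync"

var mu sync.Mutex // stands in for the runtime's allglock

// report walks shared state under mu -- analogous to the old
// tracebackothers taking allglock.
func report() {
	mu.Lock() // never returns if the caller already holds mu
	defer mu.Unlock()
}

func main() {
	mu.Lock() // analogous to a fatal throw raised while allglock is held
	report()  // with only one goroutine, the runtime aborts with
	          // "fatal error: all goroutines are asleep - deadlock!"
}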

View File: 0094-runtime-don-t-take-allglock-in-tracebackothers.patch

@@ -0,0 +1,199 @@
From 2113c53920d02e7805372b543d2b658394fcd636 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Tue, 17 Nov 2020 11:55:53 -0500
Subject: [PATCH 1/2] runtime: don't take allglock in tracebackothers
tracebackothers is called from fatal throw/panic.
A fatal throw may be taken with allglock held (notably in the allocator
when allglock is held), which would cause a deadlock in tracebackothers
when we try to take allglock again. Locking allglock here is also often
a lock order violation w.r.t. the locks held when throw was called.
Avoid the deadlock and ordering issues by skipping locking altogether.
It is OK to miss concurrently created Gs (which are generally avoided by
freezetheworld(), and which were possible previously anyways if created
after the loop).
Fatal throw/panic freezetheworld(), which should freeze other threads
that may be racing to modify allgs. However, freezetheworld() does _not_
guarantee that it stops all other threads, so we can't simply drop the
lock.
Fixes #42669
Updates #43175
Change-Id: I657aec46ed35fd5d1b3f1ba25b500128ab26b088
Reviewed-on: https://go-review.googlesource.com/c/go/+/270861
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Trust: Michael Pratt <mpratt@google.com>
Conflict: NA
Reference: https://go-review.googlesource.com/c/go/+/270861
---
src/runtime/mgcmark.go | 1 -
src/runtime/proc.go | 43 ++++++++++++++++++++++++++++++++++++----
src/runtime/runtime2.go | 1 -
src/runtime/traceback.go | 27 +++++++++++++++----------
4 files changed, 56 insertions(+), 16 deletions(-)
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index b2fbcd4105..6ecc467475 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -143,7 +143,6 @@ fail:
println("gp", gp, "goid", gp.goid,
"status", readgstatus(gp),
"gcscandone", gp.gcscandone)
- unlock(&allglock) // Avoid self-deadlock with traceback.
throw("scan missed a g")
}
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 7fa19d867b..db49fbf385 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -460,8 +460,29 @@ func lockedOSThread() bool {
}
var (
- allgs []*g
+ // allgs contains all Gs ever created (including dead Gs), and thus
+ // never shrinks.
+ //
+ // Access via the slice is protected by allglock or stop-the-world.
+ // Readers that cannot take the lock may (carefully!) use the atomic
+ // variables below.
allglock mutex
+ allgs []*g
+
+ // allglen and allgptr are atomic variables that contain len(allg) and
+ // &allg[0] respectively. Proper ordering depends on totally-ordered
+ // loads and stores. Writes are protected by allglock.
+ //
+ // allgptr is updated before allglen. Readers should read allglen
+ // before allgptr to ensure that allglen is always <= len(allgptr). New
+ // Gs appended during the race can be missed. For a consistent view of
+ // all Gs, allglock must be held.
+ //
+ // allgptr copies should always be stored as a concrete type or
+ // unsafe.Pointer, not uintptr, to ensure that GC can still reach it
+ // even if it points to a stale array.
+ allglen uintptr
+ allgptr **g
)
func allgadd(gp *g) {
@@ -471,10 +492,25 @@ func allgadd(gp *g) {
lock(&allglock)
allgs = append(allgs, gp)
- allglen = uintptr(len(allgs))
+ if &allgs[0] != allgptr {
+ atomicstorep(unsafe.Pointer(&allgptr), unsafe.Pointer(&allgs[0]))
+ }
+ atomic.Storeuintptr(&allglen, uintptr(len(allgs)))
unlock(&allglock)
}
+// atomicAllG returns &allgs[0] and len(allgs) for use with atomicAllGIndex.
+func atomicAllG() (**g, uintptr) {
+ length := atomic.Loaduintptr(&allglen)
+ ptr := (**g)(atomic.Loadp(unsafe.Pointer(&allgptr)))
+ return ptr, length
+}
+
+// atomicAllGIndex returns ptr[i] with the allgptr returned from atomicAllG.
+func atomicAllGIndex(ptr **g, i uintptr) *g {
+ return *(**g)(add(unsafe.Pointer(ptr), i*sys.PtrSize))
+}
+
const (
// Number of goroutine ids to grab from sched.goidgen to local per-P cache at once.
// 16 seems to provide enough amortization, but other than that it's mostly arbitrary number.
@@ -3913,7 +3949,7 @@ func badunlockosthread() {
}
func gcount() int32 {
- n := int32(allglen) - sched.gFree.n - int32(atomic.Load(&sched.ngsys))
+ n := int32(atomic.Loaduintptr(&allglen)) - sched.gFree.n - int32(atomic.Load(&sched.ngsys))
for _, _p_ := range allp {
n -= _p_.gFree.n
}
@@ -4583,7 +4619,6 @@ func checkdead() {
case _Grunnable,
_Grunning,
_Gsyscall:
- unlock(&allglock)
print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n")
throw("checkdead: runnable g")
}
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 814364aa42..c82678e0ed 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -1031,7 +1031,6 @@ func (w waitReason) String() string {
}
var (
- allglen uintptr
allm *m
allp []*p // len(allp) == gomaxprocs; may change at safe points, otherwise immutable
allpLock mutex // Protects P-less reads of allp and all writes
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index 944c8473d2..bf4e02da0e 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -918,17 +918,25 @@ func tracebackothers(me *g) {
level, _, _ := gotraceback()
// Show the current goroutine first, if we haven't already.
- g := getg()
- gp := g.m.curg
- if gp != nil && gp != me {
+ curgp := getg().m.curg
+ if curgp != nil && curgp != me {
print("\n")
- goroutineheader(gp)
- traceback(^uintptr(0), ^uintptr(0), 0, gp)
+ goroutineheader(curgp)
+ traceback(^uintptr(0), ^uintptr(0), 0, curgp)
}
- lock(&allglock)
- for _, gp := range allgs {
- if gp == me || gp == g.m.curg || readgstatus(gp) == _Gdead || isSystemGoroutine(gp, false) && level < 2 {
+ // We can't take allglock here because this may be during fatal
+ // throw/panic, where locking allglock could be out-of-order or a
+ // direct deadlock.
+ //
+ // Instead, use atomic access to allgs which requires no locking. We
+ // don't lock against concurrent creation of new Gs, but even with
+ // allglock we may miss Gs created after this loop.
+ ptr, length := atomicAllG()
+ for i := uintptr(0); i < length; i++ {
+ gp := atomicAllGIndex(ptr, i)
+
+ if gp == me || gp == curgp || readgstatus(gp) == _Gdead || isSystemGoroutine(gp, false) && level < 2 {
continue
}
print("\n")
@@ -937,14 +945,13 @@ func tracebackothers(me *g) {
// called from a signal handler initiated during a
// systemstack call. The original G is still in the
// running state, and we want to print its stack.
- if gp.m != g.m && readgstatus(gp)&^_Gscan == _Grunning {
+ if gp.m != getg().m && readgstatus(gp)&^_Gscan == _Grunning {
print("\tgoroutine running on other thread; stack unavailable\n")
printcreatedby(gp)
} else {
traceback(^uintptr(0), ^uintptr(0), 0, gp)
}
}
- unlock(&allglock)
}
// tracebackHexdump hexdumps part of stk around frame.sp and frame.fp
--
2.33.0
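
The allgadd/atomicAllG protocol this patch introduces can be exercised outside the runtime. Below is a minimal sketch with hypothetical names (a user-level registry, not runtime code) of the same publication order: the writer appends under a lock and publishes the backing-array pointer before the length; a lock-free reader loads the length before the pointer, so it can never observe a length larger than the array it dereferences.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"unsafe"
)

// registry mirrors the allgs/allgptr/allglen scheme with hypothetical names.
type registry struct {
	mu  sync.Mutex     // plays the role of allglock
	all []*int         // full slice; append only with mu held
	ptr unsafe.Pointer // published &all[0] (an **int), like allgptr
	n   uintptr        // published len(all), like allglen
}

func (r *registry) add(p *int) {
	r.mu.Lock()
	r.all = append(r.all, p)
	// Pointer first, then length -- same order as allgadd.
	if unsafe.Pointer(&r.all[0]) != atomic.LoadPointer(&r.ptr) {
		atomic.StorePointer(&r.ptr, unsafe.Pointer(&r.all[0]))
	}
	atomic.StoreUintptr(&r.n, uintptr(len(r.all)))
	r.mu.Unlock()
}

// snapshot is the lock-free reader: length before pointer, like atomicAllG.
func (r *registry) snapshot() (**int, uintptr) {
	n := atomic.LoadUintptr(&r.n)
	p := (**int)(atomic.LoadPointer(&r.ptr))
	return p, n
}

// index is the analogue of atomicAllGIndex.
func index(p **int, i uintptr) *int {
	return *(**int)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + i*unsafe.Sizeof(p)))
}

func main() {
	var r registry
	for i := 0; i < 4; i++ {
		v := i
		r.add(&v)
	}
	p, n := r.snapshot()
	for i := uintptr(0); i < n; i++ {
		fmt.Print(*index(p, i), " ")
	}
	fmt.Println() // 0 1 2 3
}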

View File: 0095-runtime-encapsulate-access-to-allgs.patch

@@ -0,0 +1,339 @@
From e1c2b3b3da33448e41a39aabd0cc8cbcea0e1450 Mon Sep 17 00:00:00 2001
From: Michael Pratt <mpratt@google.com>
Date: Wed, 23 Dec 2020 15:05:37 -0500
Subject: [PATCH 2/2] runtime: encapsulate access to allgs
Correctly accessing allgs is a bit hairy. Some paths need to lock
allglock, some don't. Those that don't are safest using atomicAllG, but
usage is not consistent.
Rather than doing this ad-hoc, move all access* through forEachG /
forEachGRace, the locking and atomic versions, respectively. This will
make it easier to ensure safe access.
* markroot is the only exception, as it has a far-removed guarantee of
safe access via an atomic load of allglen far before actual use.
Change-Id: Ie1c7a8243e155ae2b4bc3143577380c695680e89
Reviewed-on: https://go-review.googlesource.com/c/go/+/279994
Trust: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Pratt <mpratt@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Conflict: NA
Reference: https://go-review.googlesource.com/c/go/+/279994
---
src/runtime/heapdump.go | 5 ++---
src/runtime/mgc.go | 8 +++-----
src/runtime/mgcmark.go | 32 +++++++++++++++++++-------------
src/runtime/mprof.go | 35 +++++++++++++++++++----------------
src/runtime/proc.go | 40 +++++++++++++++++++++++++++++-----------
src/runtime/trace.go | 5 +++--
src/runtime/traceback.go | 21 +++++++++------------
7 files changed, 84 insertions(+), 62 deletions(-)
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index cfd5c251b4..93add0ad5a 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -393,8 +393,7 @@ func dumpgoroutine(gp *g) {
func dumpgs() {
// goroutines & stacks
- for i := 0; uintptr(i) < allglen; i++ {
- gp := allgs[i]
+ forEachG(func(gp *g) {
status := readgstatus(gp) // The world is stopped so gp will not be in a scan state.
switch status {
default:
@@ -407,7 +406,7 @@ func dumpgs() {
_Gwaiting:
dumpgoroutine(gp)
}
- }
+ })
}
func finq_callback(fn *funcval, obj unsafe.Pointer, nret uintptr, fint *_type, ot *ptrtype) {
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index b3499516f6..f7198822aa 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -2207,14 +2207,12 @@ func gcSweep(mode gcMode) {
//
//go:systemstack
func gcResetMarkState() {
- // This may be called during a concurrent phase, so make sure
+ // This may be called during a concurrent phase, so lock to make sure
// allgs doesn't change.
- lock(&allglock)
- for _, gp := range allgs {
+ forEachG(func(gp *g) {
gp.gcscandone = false // set to true in gcphasework
gp.gcAssistBytes = 0
- }
- unlock(&allglock)
+ })
// Clear page marks. This is just 1MB per 64GB of heap, so the
// time here is pretty trivial.
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 6ecc467475..233c4976eb 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -127,23 +127,26 @@ func gcMarkRootCheck() {
throw("left over markroot jobs")
}
- lock(&allglock)
// Check that stacks have been scanned.
- var gp *g
- for i := 0; i < work.nStackRoots; i++ {
- gp = allgs[i]
+ //
+ // We only check the first nStackRoots Gs that we should have scanned.
+ // Since we don't care about newer Gs (see comment in
+ // gcMarkRootPrepare), no locking is required.
+ i := 0
+ forEachGRace(func(gp *g) {
+ if i >= work.nStackRoots {
+ return
+ }
+
if !gp.gcscandone {
- goto fail
+ println("gp", gp, "goid", gp.goid,
+ "status", readgstatus(gp),
+ "gcscandone", gp.gcscandone)
+ throw("scan missed a g")
}
- }
- unlock(&allglock)
- return
-fail:
- println("gp", gp, "goid", gp.goid,
- "status", readgstatus(gp),
- "gcscandone", gp.gcscandone)
- throw("scan missed a g")
+ i++
+ })
}
// ptrmask for an allocation containing a single pointer.
@@ -200,6 +203,9 @@ func markroot(gcw *gcWork, i uint32) {
// the rest is scanning goroutine stacks
var gp *g
if baseStacks <= i && i < end {
+ // N.B. Atomic read of allglen in gcMarkRootPrepare
+ // acts as a barrier to ensure that allgs must be large
+ // enough to contain all relevant Gs.
gp = allgs[i-baseStacks]
} else {
throw("markroot: bad index")
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index 128498d69b..c94b8f7cae 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -731,12 +731,13 @@ func goroutineProfileWithLabels(p []StackRecord, labels []unsafe.Pointer) (n int
stopTheWorld("profile")
+ // World is stopped, no locking required.
n = 1
- for _, gp1 := range allgs {
+ forEachGRace(func(gp1 *g) {
if isOK(gp1) {
n++
}
- }
+ })
if n <= len(p) {
ok = true
@@ -757,21 +758,23 @@ func goroutineProfileWithLabels(p []StackRecord, labels []unsafe.Pointer) (n int
}
// Save other goroutines.
- for _, gp1 := range allgs {
- if isOK(gp1) {
- if len(r) == 0 {
- // Should be impossible, but better to return a
- // truncated profile than to crash the entire process.
- break
- }
- saveg(^uintptr(0), ^uintptr(0), gp1, &r[0])
- if labels != nil {
- lbl[0] = gp1.labels
- lbl = lbl[1:]
- }
- r = r[1:]
+ forEachGRace(func(gp1 *g) {
+ if !isOK(gp1) {
+ return
}
- }
+
+ if len(r) == 0 {
+ // Should be impossible, but better to return a
+ // truncated profile than to crash the entire process.
+ return
+ }
+ saveg(^uintptr(0), ^uintptr(0), gp1, &r[0])
+ if labels != nil {
+ lbl[0] = gp1.labels
+ lbl = lbl[1:]
+ }
+ r = r[1:]
+ })
}
startTheWorld()
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index db49fbf385..e1aafffc93 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -511,6 +511,30 @@ func atomicAllGIndex(ptr **g, i uintptr) *g {
return *(**g)(add(unsafe.Pointer(ptr), i*sys.PtrSize))
}
+// forEachG calls fn on every G from allgs.
+//
+// forEachG takes a lock to exclude concurrent addition of new Gs.
+func forEachG(fn func(gp *g)) {
+ lock(&allglock)
+ for _, gp := range allgs {
+ fn(gp)
+ }
+ unlock(&allglock)
+}
+
+// forEachGRace calls fn on every G from allgs.
+//
+// forEachGRace avoids locking, but does not exclude addition of new Gs during
+// execution, which may be missed.
+func forEachGRace(fn func(gp *g)) {
+ ptr, length := atomicAllG()
+ for i := uintptr(0); i < length; i++ {
+ gp := atomicAllGIndex(ptr, i)
+ fn(gp)
+ }
+ return
+}
+
const (
// Number of goroutine ids to grab from sched.goidgen to local per-P cache at once.
// 16 seems to provide enough amortization, but other than that it's mostly arbitrary number.
@@ -4605,11 +4629,9 @@ func checkdead() {
}
grunning := 0
- lock(&allglock)
- for i := 0; i < len(allgs); i++ {
- gp := allgs[i]
+ forEachG(func(gp *g) {
if isSystemGoroutine(gp, false) {
- continue
+ return
}
s := readgstatus(gp)
switch s &^ _Gscan {
@@ -4622,8 +4644,7 @@ func checkdead() {
print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n")
throw("checkdead: runnable g")
}
- }
- unlock(&allglock)
+ })
if grunning == 0 { // possible if main goroutine calls runtime·Goexit()
unlock(&sched.lock) // unlock so that GODEBUG=scheddetail=1 doesn't hang
throw("no goroutines (main called runtime.Goexit) - deadlock!")
@@ -4993,9 +5014,7 @@ func schedtrace(detailed bool) {
print(" M", mp.id, ": p=", id1, " curg=", id2, " mallocing=", mp.mallocing, " throwing=", mp.throwing, " preemptoff=", mp.preemptoff, ""+" locks=", mp.locks, " dying=", mp.dying, " spinning=", mp.spinning, " blocked=", mp.blocked, " lockedg=", id3, "\n")
}
- lock(&allglock)
- for gi := 0; gi < len(allgs); gi++ {
- gp := allgs[gi]
+ forEachG(func(gp *g) {
mp := gp.m
lockedm := gp.lockedm.ptr()
id1 := int64(-1)
@@ -5007,8 +5026,7 @@ func schedtrace(detailed bool) {
id2 = lockedm.id
}
print(" G", gp.goid, ": status=", readgstatus(gp), "(", gp.waitreason.String(), ") m=", id1, " lockedm=", id2, "\n")
- }
- unlock(&allglock)
+ })
unlock(&sched.lock)
}
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index 169b650eb4..b8794d383a 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -220,7 +220,8 @@ func StartTrace() error {
stackID := traceStackID(mp, stkBuf, 2)
releasem(mp)
- for _, gp := range allgs {
+ // World is stopped, no need to lock.
+ forEachGRace(func(gp *g) {
status := readgstatus(gp)
if status != _Gdead {
gp.traceseq = 0
@@ -240,7 +241,7 @@ func StartTrace() error {
} else {
gp.sysblocktraced = false
}
- }
+ })
traceProcStart()
traceGoStart()
// Note: ticksStart needs to be set after we emit traceEvGoInSyscall events.
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index bf4e02da0e..b57e61712c 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -925,19 +925,16 @@ func tracebackothers(me *g) {
traceback(^uintptr(0), ^uintptr(0), 0, curgp)
}
- // We can't take allglock here because this may be during fatal
- // throw/panic, where locking allglock could be out-of-order or a
- // direct deadlock.
+ // We can't call locking forEachG here because this may be during fatal
+ // throw/panic, where locking could be out-of-order or a direct
+ // deadlock.
//
- // Instead, use atomic access to allgs which requires no locking. We
- // don't lock against concurrent creation of new Gs, but even with
- // allglock we may miss Gs created after this loop.
- ptr, length := atomicAllG()
- for i := uintptr(0); i < length; i++ {
- gp := atomicAllGIndex(ptr, i)
-
+ // Instead, use forEachGRace, which requires no locking. We don't lock
+ // against concurrent creation of new Gs, but even with allglock we may
+ // miss Gs created after this loop.
+ forEachGRace(func(gp *g) {
if gp == me || gp == curgp || readgstatus(gp) == _Gdead || isSystemGoroutine(gp, false) && level < 2 {
- continue
+ return
}
print("\n")
goroutineheader(gp)
@@ -951,7 +948,7 @@ func tracebackothers(me *g) {
} else {
traceback(^uintptr(0), ^uintptr(0), 0, gp)
}
- }
+ })
}
// tracebackHexdump hexdumps part of stk around frame.sp and frame.fp
--
2.33.0
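
Patch 2's forEachG/forEachGRace pair maps onto the registry sketch shown after patch 1 (same hypothetical names, continuing that example): forEach takes the lock and therefore sees a consistent set; forEachRace touches only the atomic snapshot, so it is safe on crash paths where locking could deadlock, but may miss concurrently added entries.

// forEach calls fn on every element, locking out concurrent add calls
// (the forEachG analogue).
func (r *registry) forEach(fn func(*int)) {
	r.mu.Lock()
	for _, p := range r.all {
		fn(p)
	}
	r.mu.Unlock()
}

// forEachRace calls fn on a racy snapshot: no locking, so it is usable
// where taking r.mu could deadlock, but elements added after the
// snapshot is taken may be missed (the forEachGRace analogue).
func (r *registry) forEachRace(fn func(*int)) {
	p, n := r.snapshot()
	for i := uintptr(0); i < n; i++ {
		fn(index(p, i))
	}
}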

View File: golang.spec

@@ -58,7 +58,7 @@
Name: golang
Version: 1.15.7
- Release: 25
+ Release: 26
Summary: The Go Programming Language
License: BSD and Public Domain
URL: https://golang.org/
@@ -235,6 +235,8 @@ Patch6090: 0090-release-branch.go1.19-html-template-disallow-actions.patch
Patch6091: 0091-release-branch.go1.19-mime-multipart-avoid-excessive.patch
Patch6092: 0092-release-branch.go1.19-net-textproto-mime-multipart-i.patch
Patch6093: 0093-release-branch.go1.19-mime-multipart-limit-parsed-mi.patch
+ Patch6094: 0094-runtime-don-t-take-allglock-in-tracebackothers.patch
+ Patch6095: 0095-runtime-encapsulate-access-to-allgs.patch
Patch9001: 0001-drop-hard-code-cert.patch
Patch9002: 0002-fix-patch-cmd-go-internal-modfetch-do-not-sho.patch
@@ -474,6 +476,12 @@ fi
%files devel -f go-tests.list -f go-misc.list -f go-src.list
%changelog
+ * Fri Apr 14 2023 hanchao <hanchao47@huawei.com> - 1.15.7-26
+ - Type:bugfix
+ - CVE:
+ - SUG:NA
+ - DESC:fix a deadlock issue when a signal is received.
* Thu Apr 13 2023 hanchao <hanchao47@huawei.com> - 1.15.7-25
- Type:CVE
- CVE:CVE-2023-24534,CVE-2023-24536,CVE-2023-24537,CVE-2023-24538