Fix shutdown waitgroup panic (#35676)

This PR fixes a WaitGroup panic that could occur when Gitea was shut down
with Ctrl+C.
The server now tracks all of its active connections itself and forcibly
closes any that remain when the hammer shutdown is invoked.
Shutdown remains graceful: the normal shutdown sequence runs before the
hammer is triggered, and existing connections are given a timeout period
to finish.
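
The failure mode is easy to reproduce in isolation: the old hammer loop called `srv.wg.Done()` repeatedly to force `srv.wg.Wait()` to return, and a `sync.WaitGroup` panics as soon as its counter goes negative. A minimal standalone reproduction (illustrative Go, not Gitea code):

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	var wg sync.WaitGroup
	wg.Add(1) // one active connection is being tracked

	defer func() {
		// prints: recovered: sync: negative WaitGroup counter
		fmt.Println("recovered:", recover())
	}()

	wg.Done() // the connection finishes and calls Done() itself...
	wg.Done() // ...then the hammer loop calls Done() again: panic
}
```

The fix replaces the `WaitGroup` with a connection counter guarded by the server's mutex plus a `sync.Cond`, so the hammer can zero the counter and wake the waiter without ever driving a counter negative (see the `modules/graceful/server.go` diff below).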

This PR also fixes the `no logger writer` problem: the log manager is now
closed only when the command exits.
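
To see why the ordering matters, here is a small sketch with a hypothetical stand-in for the queued log manager (not Gitea's actual `log` package): messages are buffered until `Close` flushes them, and anything logged after `Close` finds no writer and is dropped.

```go
package main

import "fmt"

// queuedLogger is a hypothetical stand-in for an asynchronous log manager:
// messages are queued and only written out by Close; after Close there is
// no writer left, so later messages are silently dropped.
type queuedLogger struct {
	queue  []string
	closed bool
}

func (l *queuedLogger) Info(msg string) {
	if l.closed {
		return // "no logger writer": the message is lost
	}
	l.queue = append(l.queue, msg)
}

func (l *queuedLogger) Close() {
	for _, m := range l.queue {
		fmt.Println(m) // flush the queue
	}
	l.queue, l.closed = nil, true
}

func main() {
	l := &queuedLogger{}
	l.Info("PID: 123 Gitea Web Finished")
	l.Close()                    // old behavior: closed inside the serve function
	l.Info("post-serve cleanup") // dropped; the fix defers Close to the end of main()
}
```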

- Fixes #35468
- Fixes #35551
- Fixes #35559
- Replaces #35578

---------

Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
Author: Lunny Xiao
Committed: 2025-10-25 00:02:58 -07:00 (committed by GitHub)
Parent: cb72c901b3
Commit: 304d836a61
4 changed files with 72 additions and 83 deletions

cmd/web.go

```diff
@@ -156,7 +156,6 @@ func serveInstall(cmd *cli.Command) error {
 	case <-graceful.GetManager().IsShutdown():
 		<-graceful.GetManager().Done()
 		log.Info("PID: %d Gitea Web Finished", os.Getpid())
-		log.GetManager().Close()
 		return err
 	default:
 	}
@@ -231,7 +230,6 @@ func serveInstalled(c *cli.Command) error {
 
 	err := listen(webRoutes, true)
 	<-graceful.GetManager().Done()
 	log.Info("PID: %d Gitea Web Finished", os.Getpid())
-	log.GetManager().Close()
 	return err
 }
```

main.go

```diff
@@ -44,6 +44,7 @@ func main() {
 	}
 
 	app := cmd.NewMainApp(cmd.AppVersion{Version: Version, Extra: formatBuiltWith()})
 	_ = cmd.RunMainApp(app, os.Args...) // all errors should have been handled by the RunMainApp
+	// flush the queued logs before exiting, it is a MUST, otherwise there will be log loss
 	log.GetManager().Close()
 }
```

modules/graceful/server.go

```diff
@@ -11,7 +11,6 @@ import (
 	"os"
 	"strings"
 	"sync"
-	"sync/atomic"
 	"syscall"
 	"time"
 
@@ -30,12 +29,15 @@ type ServeFunction = func(net.Listener) error
 // Server represents our graceful server
 type Server struct {
-	network              string
-	address              string
-	listener             net.Listener
-	wg                   sync.WaitGroup
-	state                state
-	lock                 *sync.RWMutex
+	network       string
+	address       string
+	listener      net.Listener
+	lock          sync.RWMutex
+	state         state
+	connCounter   int64
+	connEmptyCond *sync.Cond
+
 	BeforeBegin          func(network, address string)
 	OnShutdown           func()
 	PerWriteTimeout      time.Duration
@@ -50,14 +52,13 @@ func NewServer(network, address, name string) *Server {
 		log.Info("Starting new %s server: %s:%s on PID: %d", name, network, address, os.Getpid())
 	}
 	srv := &Server{
-		wg:                   sync.WaitGroup{},
 		state:                stateInit,
-		lock:                 &sync.RWMutex{},
 		network:              network,
 		address:              address,
 		PerWriteTimeout:      setting.PerWriteTimeout,
 		PerWritePerKbTimeout: setting.PerWritePerKbTimeout,
 	}
+	srv.connEmptyCond = sync.NewCond(&srv.lock)
 	srv.BeforeBegin = func(network, addr string) {
 		log.Debug("Starting server on %s:%s (PID: %d)", network, addr, syscall.Getpid())
 	}
@@ -154,7 +155,7 @@ func (srv *Server) Serve(serve ServeFunction) error {
 	GetManager().RegisterServer()
 	err := serve(srv.listener)
 	log.Debug("Waiting for connections to finish... (PID: %d)", syscall.Getpid())
-	srv.wg.Wait()
+	srv.waitForActiveConnections()
 	srv.setState(stateTerminate)
 	GetManager().ServerDone()
 	// use of closed means that the listeners are closed - i.e. we should be shutting down - return nil
@@ -178,16 +179,62 @@ func (srv *Server) setState(st state) {
 	srv.state = st
 }
 
+func (srv *Server) waitForActiveConnections() {
+	srv.lock.Lock()
+	for srv.connCounter > 0 {
+		srv.connEmptyCond.Wait()
+	}
+	srv.lock.Unlock()
+}
+
+func (srv *Server) wrapConnection(c net.Conn) (net.Conn, error) {
+	srv.lock.Lock()
+	defer srv.lock.Unlock()
+
+	if srv.state != stateRunning {
+		_ = c.Close()
+		return nil, syscall.EINVAL // same as AcceptTCP
+	}
+
+	srv.connCounter++
+	return &wrappedConn{Conn: c, server: srv}, nil
+}
+
+func (srv *Server) removeConnection(_ *wrappedConn) {
+	srv.lock.Lock()
+	defer srv.lock.Unlock()
+
+	srv.connCounter--
+	if srv.connCounter <= 0 {
+		srv.connEmptyCond.Broadcast()
+	}
+}
+
+// closeAllConnections forcefully closes all active connections
+func (srv *Server) closeAllConnections() {
+	srv.lock.Lock()
+	if srv.connCounter > 0 {
+		log.Warn("After graceful shutdown period, %d connections are still active. Forcefully close.", srv.connCounter)
+		srv.connCounter = 0 // OS will close all the connections after the process exits, so we just assume there is no active connection now
+	}
+	srv.lock.Unlock()
+	srv.connEmptyCond.Broadcast()
+}
+
 type filer interface {
 	File() (*os.File, error)
 }
 
 type wrappedListener struct {
 	net.Listener
-	stopped bool
-	server  *Server
+	server *Server
 }
 
+var (
+	_ net.Listener = (*wrappedListener)(nil)
+	_ filer        = (*wrappedListener)(nil)
+)
+
 func newWrappedListener(l net.Listener, srv *Server) *wrappedListener {
 	return &wrappedListener{
 		Listener: l,
@@ -195,46 +242,24 @@ func newWrappedListener(l net.Listener, srv *Server) *wrappedListener {
 	}
 }
 
-func (wl *wrappedListener) Accept() (net.Conn, error) {
-	var c net.Conn
-	// Set keepalive on TCPListeners connections.
+func (wl *wrappedListener) Accept() (c net.Conn, err error) {
 	if tcl, ok := wl.Listener.(*net.TCPListener); ok {
+		// Set keepalive on TCPListeners connections if possible, see http.tcpKeepAliveListener
 		tc, err := tcl.AcceptTCP()
 		if err != nil {
 			return nil, err
 		}
-		_ = tc.SetKeepAlive(true)                  // see http.tcpKeepAliveListener
-		_ = tc.SetKeepAlivePeriod(3 * time.Minute) // see http.tcpKeepAliveListener
+		_ = tc.SetKeepAlive(true)
+		_ = tc.SetKeepAlivePeriod(3 * time.Minute)
 		c = tc
 	} else {
-		var err error
 		c, err = wl.Listener.Accept()
 		if err != nil {
 			return nil, err
 		}
 	}
-
-	closed := int32(0)
-
-	c = &wrappedConn{
-		Conn:                 c,
-		server:               wl.server,
-		closed:               &closed,
-		perWriteTimeout:      wl.server.PerWriteTimeout,
-		perWritePerKbTimeout: wl.server.PerWritePerKbTimeout,
-	}
-
-	wl.server.wg.Add(1)
-	return c, nil
-}
-
-func (wl *wrappedListener) Close() error {
-	if wl.stopped {
-		return syscall.EINVAL
-	}
-
-	wl.stopped = true
-	return wl.Listener.Close()
+	return wl.server.wrapConnection(c)
 }
 
 func (wl *wrappedListener) File() (*os.File, error) {
@@ -244,17 +269,14 @@ func (wl *wrappedListener) File() (*os.File, error) {
 }
 
 type wrappedConn struct {
 	net.Conn
-	server               *Server
-	closed               *int32
-	deadline             time.Time
-	perWriteTimeout      time.Duration
-	perWritePerKbTimeout time.Duration
+	server   *Server
+	deadline time.Time
 }
 
 func (w *wrappedConn) Write(p []byte) (n int, err error) {
-	if w.perWriteTimeout > 0 {
-		minTimeout := time.Duration(len(p)/1024) * w.perWritePerKbTimeout
-		minDeadline := time.Now().Add(minTimeout).Add(w.perWriteTimeout)
+	if w.server.PerWriteTimeout > 0 {
+		minTimeout := time.Duration(len(p)/1024) * w.server.PerWritePerKbTimeout
+		minDeadline := time.Now().Add(minTimeout).Add(w.server.PerWriteTimeout)
 		w.deadline = w.deadline.Add(minTimeout)
 		if minDeadline.After(w.deadline) {
@@ -266,19 +288,6 @@ func (w *wrappedConn) Write(p []byte) (n int, err error) {
 }
 
 func (w *wrappedConn) Close() error {
-	if atomic.CompareAndSwapInt32(w.closed, 0, 1) {
-		defer func() {
-			if err := recover(); err != nil {
-				select {
-				case <-GetManager().IsHammer():
-					// Likely deadlocked request released at hammertime
-					log.Warn("Panic during connection close! %v. Likely there has been a deadlocked request which has been released by forced shutdown.", err)
-				default:
-					log.Error("Panic during connection close! %v", err)
-				}
-			}
-		}()
-		w.server.wg.Done()
-	}
+	w.server.removeConnection(w)
 	return w.Conn.Close()
 }
```

modules/graceful/server_hooks.go

```diff
@@ -5,7 +5,6 @@ package graceful
 
 import (
 	"os"
-	"runtime"
 
 	"code.gitea.io/gitea/modules/log"
 )
@@ -48,26 +47,8 @@ func (srv *Server) doShutdown() {
 }
 
 func (srv *Server) doHammer() {
-	defer func() {
-		// We call srv.wg.Done() until it panics.
-		// This happens if we call Done() when the WaitGroup counter is already at 0
-		// So if it panics -> we're done, Serve() will return and the
-		// parent will goroutine will exit.
-		if r := recover(); r != nil {
-			log.Error("WaitGroup at 0: Error: %v", r)
-		}
-	}()
 	if srv.getState() != stateShuttingDown {
 		return
 	}
 	log.Warn("Forcefully shutting down parent")
-	for {
-		if srv.getState() == stateTerminate {
-			break
-		}
-		srv.wg.Done()
-
-		// Give other goroutines a chance to finish before we forcibly stop them.
-		runtime.Gosched()
-	}
+	srv.closeAllConnections()
 }
```
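
Taken together, the new shutdown path hands off cleanly: `Serve` blocks in `waitForActiveConnections`, and when the grace period expires the hammer's `closeAllConnections` zeroes the counter and broadcasts on the condition variable, so the waiter wakes up and `Serve` returns instead of panicking. A condensed, runnable sketch of that handoff (simplified from the diff above; a plain `sync.Mutex` stands in for the server's `sync.RWMutex`):

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

type server struct {
	lock          sync.Mutex
	connCounter   int64
	connEmptyCond *sync.Cond
}

// waitForActiveConnections blocks until the connection counter drops to zero.
func (srv *server) waitForActiveConnections() {
	srv.lock.Lock()
	for srv.connCounter > 0 {
		srv.connEmptyCond.Wait()
	}
	srv.lock.Unlock()
}

// closeAllConnections is the hammer: it assumes the OS will close the
// sockets at process exit, zeroes the counter, and wakes the waiter.
func (srv *server) closeAllConnections() {
	srv.lock.Lock()
	if srv.connCounter > 0 {
		fmt.Printf("%d connections still active after grace period, forcing shutdown\n", srv.connCounter)
		srv.connCounter = 0
	}
	srv.lock.Unlock()
	srv.connEmptyCond.Broadcast()
}

func main() {
	srv := &server{connCounter: 3} // pretend three requests are deadlocked
	srv.connEmptyCond = sync.NewCond(&srv.lock)

	go func() {
		time.Sleep(100 * time.Millisecond) // the graceful period elapses
		srv.closeAllConnections()          // hammer shutdown
	}()

	srv.waitForActiveConnections() // where Serve() waits; no WaitGroup to panic
	fmt.Println("Serve() returns cleanly")
}
```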