	Fix shutdown waitgroup panic (#35676)
This PR fixes a WaitGroup panic that occurs when Gitea is shut down with Ctrl+C. The server now tracks all active connections and forcibly closes them when the hammer shutdown is invoked. The process remains graceful: the normal shutdown sequence runs before the hammer is triggered, and existing connections are given a timeout period to finish. This PR also fixes the `no logger writer` problem: the logger is now closed only when the command exits.

- Fixes #35468
- Fixes #35551
- Fixes #35559
- Replaces #35578

Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
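The fix replaces the server's sync.WaitGroup with a connection counter guarded by a mutex and paired with a sync.Cond, as the diff below shows: the hammer can then zero the counter and wake the waiter instead of calling Done() until something panics. A minimal, self-contained sketch of that pattern follows; the names are illustrative, not Gitea's actual API:

package main

import (
	"fmt"
	"sync"
	"time"
)

// connTracker is a stand-in for the counter the PR adds to graceful.Server.
type connTracker struct {
	mu    sync.Mutex
	count int64
	empty *sync.Cond
}

func newConnTracker() *connTracker {
	t := &connTracker{}
	t.empty = sync.NewCond(&t.mu)
	return t
}

func (t *connTracker) add() {
	t.mu.Lock()
	t.count++
	t.mu.Unlock()
}

func (t *connTracker) done() {
	t.mu.Lock()
	t.count--
	if t.count <= 0 {
		t.empty.Broadcast()
	}
	t.mu.Unlock()
}

// wait blocks until the counter reaches zero (the graceful path in Serve).
func (t *connTracker) wait() {
	t.mu.Lock()
	for t.count > 0 {
		t.empty.Wait()
	}
	t.mu.Unlock()
}

// forceClose zeroes the counter so wait returns even with stuck
// connections (the hammer path).
func (t *connTracker) forceClose() {
	t.mu.Lock()
	t.count = 0
	t.mu.Unlock()
	t.empty.Broadcast()
}

func main() {
	t := newConnTracker()
	t.add()
	go func() {
		time.Sleep(50 * time.Millisecond)
		t.done()
	}()
	t.wait()
	fmt.Println("all connections finished")
}

Unlike WaitGroup.Done(), forceClose cannot over-release: zeroing the counter is idempotent, so a racing done() at worst broadcasts one more time.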
@@ -156,7 +156,6 @@ func serveInstall(cmd *cli.Command) error {
 	case <-graceful.GetManager().IsShutdown():
 		<-graceful.GetManager().Done()
 		log.Info("PID: %d Gitea Web Finished", os.Getpid())
-		log.GetManager().Close()
 		return err
 	default:
 	}
@@ -231,7 +230,6 @@ func serveInstalled(c *cli.Command) error {
 	err := listen(webRoutes, true)
 	<-graceful.GetManager().Done()
 	log.Info("PID: %d Gitea Web Finished", os.Getpid())
-	log.GetManager().Close()
 	return err
 }

main.go
@@ -44,6 +44,7 @@ func main() {
 	}
 	app := cmd.NewMainApp(cmd.AppVersion{Version: Version, Extra: formatBuiltWith()})
 	_ = cmd.RunMainApp(app, os.Args...) // all errors should have been handled by the RunMainApp
+	// flush the queued logs before exiting, it is a MUST, otherwise there will be log loss
 	log.GetManager().Close()
 }

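The main.go change pairs with the removals above: log.GetManager().Close() now runs exactly once, after RunMainApp returns, instead of inside each serve path. The added comment gives the reason: queued log writes would otherwise be dropped at exit. As a generic illustration of that flush-on-exit pattern (a toy logger, not Gitea's log package):

package main

import (
	"fmt"
	"sync"
)

// asyncLogger queues messages and writes them on a background goroutine.
// Close drains the queue before returning.
type asyncLogger struct {
	ch   chan string
	done sync.WaitGroup
}

func newAsyncLogger() *asyncLogger {
	l := &asyncLogger{ch: make(chan string, 64)}
	l.done.Add(1)
	go func() {
		defer l.done.Done()
		for msg := range l.ch {
			fmt.Println(msg)
		}
	}()
	return l
}

func (l *asyncLogger) Info(msg string) { l.ch <- msg }

// Close flushes queued messages; without it, messages still sitting in
// the channel are lost when main returns.
func (l *asyncLogger) Close() {
	close(l.ch)
	l.done.Wait()
}

func main() {
	log := newAsyncLogger()
	log.Info("shutting down")
	log.Close() // flush before exit, as the patched main() does
}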
@@ -11,7 +11,6 @@ import (
 	"os"
 	"strings"
 	"sync"
-	"sync/atomic"
 	"syscall"
 	"time"

@@ -33,9 +32,12 @@ type Server struct {
-	network              string
-	address              string
-	listener             net.Listener
-	wg                   sync.WaitGroup
-	state                state
-	lock                 *sync.RWMutex
+	network  string
+	address  string
+	listener net.Listener
+
+	lock          sync.RWMutex
+	state         state
+	connCounter   int64
+	connEmptyCond *sync.Cond
+
 	BeforeBegin          func(network, address string)
 	OnShutdown           func()
 	PerWriteTimeout      time.Duration
@@ -50,14 +52,13 @@ func NewServer(network, address, name string) *Server {
 		log.Info("Starting new %s server: %s:%s on PID: %d", name, network, address, os.Getpid())
 	}
 	srv := &Server{
-		wg:                   sync.WaitGroup{},
 		state:                stateInit,
-		lock:                 &sync.RWMutex{},
 		network:              network,
 		address:              address,
 		PerWriteTimeout:      setting.PerWriteTimeout,
 		PerWritePerKbTimeout: setting.PerWritePerKbTimeout,
 	}
+	srv.connEmptyCond = sync.NewCond(&srv.lock)

 	srv.BeforeBegin = func(network, addr string) {
 		log.Debug("Starting server on %s:%s (PID: %d)", network, addr, syscall.Getpid())
@@ -154,7 +155,7 @@ func (srv *Server) Serve(serve ServeFunction) error {
 	GetManager().RegisterServer()
 	err := serve(srv.listener)
 	log.Debug("Waiting for connections to finish... (PID: %d)", syscall.Getpid())
-	srv.wg.Wait()
+	srv.waitForActiveConnections()
 	srv.setState(stateTerminate)
 	GetManager().ServerDone()
 	// use of closed means that the listeners are closed - i.e. we should be shutting down - return nil
@@ -178,16 +179,62 @@ func (srv *Server) setState(st state) {
 	srv.state = st
 }

+func (srv *Server) waitForActiveConnections() {
+	srv.lock.Lock()
+	for srv.connCounter > 0 {
+		srv.connEmptyCond.Wait()
+	}
+	srv.lock.Unlock()
+}
+
+func (srv *Server) wrapConnection(c net.Conn) (net.Conn, error) {
+	srv.lock.Lock()
+	defer srv.lock.Unlock()
+
+	if srv.state != stateRunning {
+		_ = c.Close()
+		return nil, syscall.EINVAL // same as AcceptTCP
+	}
+
+	srv.connCounter++
+	return &wrappedConn{Conn: c, server: srv}, nil
+}
+
+func (srv *Server) removeConnection(_ *wrappedConn) {
+	srv.lock.Lock()
+	defer srv.lock.Unlock()
+
+	srv.connCounter--
+	if srv.connCounter <= 0 {
+		srv.connEmptyCond.Broadcast()
+	}
+}
+
+// closeAllConnections forcefully closes all active connections
+func (srv *Server) closeAllConnections() {
+	srv.lock.Lock()
+	if srv.connCounter > 0 {
+		log.Warn("After graceful shutdown period, %d connections are still active. Forcefully close.", srv.connCounter)
+		srv.connCounter = 0 // OS will close all the connections after the process exits, so we just assume there is no active connection now
+	}
+	srv.lock.Unlock()
+	srv.connEmptyCond.Broadcast()
+}
+
 type filer interface {
 	File() (*os.File, error)
 }

 type wrappedListener struct {
 	net.Listener
-	stopped bool
-	server  *Server
+	server *Server
 }

+var (
+	_ net.Listener = (*wrappedListener)(nil)
+	_ filer        = (*wrappedListener)(nil)
+)
+
 func newWrappedListener(l net.Listener, srv *Server) *wrappedListener {
 	return &wrappedListener{
 		Listener: l,
@@ -195,46 +242,24 @@ func newWrappedListener(l net.Listener, srv *Server) *wrappedListener {
 	}
 }

-func (wl *wrappedListener) Accept() (net.Conn, error) {
-	var c net.Conn
-	// Set keepalive on TCPListeners connections.
+func (wl *wrappedListener) Accept() (c net.Conn, err error) {
 	if tcl, ok := wl.Listener.(*net.TCPListener); ok {
+		// Set keepalive on TCPListeners connections if possible, see http.tcpKeepAliveListener
 		tc, err := tcl.AcceptTCP()
 		if err != nil {
 			return nil, err
 		}
-		_ = tc.SetKeepAlive(true)                  // see http.tcpKeepAliveListener
-		_ = tc.SetKeepAlivePeriod(3 * time.Minute) // see http.tcpKeepAliveListener
+		_ = tc.SetKeepAlive(true)
+		_ = tc.SetKeepAlivePeriod(3 * time.Minute)
 		c = tc
 	} else {
-		var err error
 		c, err = wl.Listener.Accept()
 		if err != nil {
 			return nil, err
 		}
 	}

-	closed := int32(0)
-
-	c = &wrappedConn{
-		Conn:                 c,
-		server:               wl.server,
-		closed:               &closed,
-		perWriteTimeout:      wl.server.PerWriteTimeout,
-		perWritePerKbTimeout: wl.server.PerWritePerKbTimeout,
-	}
-
-	wl.server.wg.Add(1)
-	return c, nil
+	return wl.server.wrapConnection(c)
 }

-func (wl *wrappedListener) Close() error {
-	if wl.stopped {
-		return syscall.EINVAL
-	}
-
-	wl.stopped = true
-	return wl.Listener.Close()
-}
-
 func (wl *wrappedListener) File() (*os.File, error) {
@@ -245,16 +270,13 @@ func (wl *wrappedListener) File() (*os.File, error) {
 type wrappedConn struct {
 	net.Conn
-	server               *Server
-	closed               *int32
-	deadline             time.Time
-	perWriteTimeout      time.Duration
-	perWritePerKbTimeout time.Duration
+	server   *Server
+	deadline time.Time
 }

 func (w *wrappedConn) Write(p []byte) (n int, err error) {
-	if w.perWriteTimeout > 0 {
-		minTimeout := time.Duration(len(p)/1024) * w.perWritePerKbTimeout
-		minDeadline := time.Now().Add(minTimeout).Add(w.perWriteTimeout)
+	if w.server.PerWriteTimeout > 0 {
+		minTimeout := time.Duration(len(p)/1024) * w.server.PerWritePerKbTimeout
+		minDeadline := time.Now().Add(minTimeout).Add(w.server.PerWriteTimeout)

 		w.deadline = w.deadline.Add(minTimeout)
 		if minDeadline.After(w.deadline) {
@@ -266,19 +288,6 @@ func (w *wrappedConn) Write(p []byte) (n int, err error) {
 }

 func (w *wrappedConn) Close() error {
-	if atomic.CompareAndSwapInt32(w.closed, 0, 1) {
-		defer func() {
-			if err := recover(); err != nil {
-				select {
-				case <-GetManager().IsHammer():
-					// Likely deadlocked request released at hammertime
-					log.Warn("Panic during connection close! %v. Likely there has been a deadlocked request which has been released by forced shutdown.", err)
-				default:
-					log.Error("Panic during connection close! %v", err)
-				}
-			}
-		}()
-		w.server.wg.Done()
-	}
+	w.server.removeConnection(w)
 	return w.Conn.Close()
 }
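To see how the counter hooks into the accept path, here is a compact sketch of a listener that wraps every accepted net.Conn so that its Close() reports back to the server, and that rejects connections once shutdown has begun, mirroring wrapConnection and removeConnection above. The names and types here are illustrative, not Gitea's code:

package example

import (
	"errors"
	"net"
	"sync"
)

type server struct {
	mu      sync.Mutex
	running bool
	conns   int64
	empty   *sync.Cond
}

func newServer() *server {
	s := &server{running: true}
	s.empty = sync.NewCond(&s.mu)
	return s
}

type countingListener struct {
	net.Listener
	srv *server
}

// Accept wraps each connection so the server can track it; once the
// server is no longer running, new connections are closed immediately.
func (l *countingListener) Accept() (net.Conn, error) {
	c, err := l.Listener.Accept()
	if err != nil {
		return nil, err
	}
	l.srv.mu.Lock()
	defer l.srv.mu.Unlock()
	if !l.srv.running {
		_ = c.Close()
		return nil, errors.New("server is shutting down")
	}
	l.srv.conns++
	return &countedConn{Conn: c, srv: l.srv}, nil
}

type countedConn struct {
	net.Conn
	srv *server
}

// Close decrements the counter exactly where the old code called
// wg.Done(), waking the shutdown waiter when the count reaches zero.
func (c *countedConn) Close() error {
	c.srv.mu.Lock()
	c.srv.conns--
	if c.srv.conns <= 0 {
		c.srv.empty.Broadcast()
	}
	c.srv.mu.Unlock()
	return c.Conn.Close()
}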
@@ -5,7 +5,6 @@ package graceful

 import (
 	"os"
-	"runtime"

 	"code.gitea.io/gitea/modules/log"
 )
@@ -48,26 +47,8 @@ func (srv *Server) doShutdown() {
 }

 func (srv *Server) doHammer() {
-	defer func() {
-		// We call srv.wg.Done() until it panics.
-		// This happens if we call Done() when the WaitGroup counter is already at 0
-		// So if it panics -> we're done, Serve() will return and the
-		// parent will goroutine will exit.
-		if r := recover(); r != nil {
-			log.Error("WaitGroup at 0: Error: %v", r)
-		}
-	}()
 	if srv.getState() != stateShuttingDown {
 		return
 	}
 	log.Warn("Forcefully shutting down parent")
-	for {
-		if srv.getState() == stateTerminate {
-			break
-		}
-		srv.wg.Done()
-
-		// Give other goroutines a chance to finish before we forcibly stop them.
-		runtime.Gosched()
-	}
+	srv.closeAllConnections()
 }
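The deleted doHammer loop leaned on WaitGroup's failure mode: it called srv.wg.Done() until the counter went negative and panicked, treating the recovered panic as its signal that Serve() could return. Any Done() arriving concurrently from a closing connection could then drive the counter negative in a goroutine with no recover, which is the class of crash reported in the linked issues. A WaitGroup always panics when its counter drops below zero, as this tiny program demonstrates:

package main

import (
	"fmt"
	"sync"
)

func main() {
	defer func() {
		if r := recover(); r != nil {
			fmt.Println("recovered:", r) // sync: negative WaitGroup counter
		}
	}()
	var wg sync.WaitGroup
	wg.Add(1)
	wg.Done()
	wg.Done() // one Done too many: the counter goes negative and panics
}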