mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
enhance: Remove CPU profile to prevent blocking stop progress (#40460)
issue: #39735
related to #39726

- Removed CPU profile dump from util.go's pprof collection
- Avoid potential blocking in StopCPUProfile() during shutdown
- Maintain goroutine/heap/block/mutex profiles for diagnostics
- Ensure safe shutdown timeout handling without profile stalls

---------

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
This commit is contained in:
parent
5735c3ef19
commit
972e47043a
@ -20,9 +20,14 @@ var errStopTimeout = errors.New("stop timeout")
|
|||||||
|
|
||||||
// exitWhenStopTimeout stops a component with timeout and exit progress when timeout.
|
// exitWhenStopTimeout stops a component with timeout and exit progress when timeout.
|
||||||
func exitWhenStopTimeout(stop func() error, timeout time.Duration) error {
|
func exitWhenStopTimeout(stop func() error, timeout time.Duration) error {
|
||||||
err := dumpPprof(func() error { return stopWithTimeout(stop, timeout) })
|
err := stopWithTimeout(stop, timeout)
|
||||||
if errors.Is(err, errStopTimeout) {
|
if errors.Is(err, errStopTimeout) {
|
||||||
log.Info("stop progress timeout, force exit")
|
start := time.Now()
|
||||||
|
dumpPprof()
|
||||||
|
log.Info("stop progress timeout, force exit",
|
||||||
|
zap.String("component", paramtable.GetRole()),
|
||||||
|
zap.Duration("cost", time.Since(start)),
|
||||||
|
zap.Error(err))
|
||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
@ -34,7 +39,7 @@ func stopWithTimeout(stop func() error, timeout time.Duration) error {
|
|||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
future := conc.Go(func() (struct{}, error) {
|
future := conc.Go(func() (struct{}, error) {
|
||||||
return struct{}{}, dumpPprof(stop)
|
return struct{}{}, stop()
|
||||||
})
|
})
|
||||||
select {
|
select {
|
||||||
case <-future.Inner():
|
case <-future.Inner():
|
||||||
@ -51,14 +56,26 @@ type profileType struct {
|
|||||||
dump func(*os.File) error // Function to dump the profile data
|
dump func(*os.File) error // Function to dump the profile data
|
||||||
}
|
}
|
||||||
|
|
||||||
// dumpPprof wraps the execution of a function with pprof profiling
|
// dumpPprof collects various performance profiles
|
||||||
// It collects various performance profiles only if the execution fails
|
func dumpPprof() {
|
||||||
func dumpPprof(exec func() error) error {
|
|
||||||
// Get pprof directory from configuration
|
// Get pprof directory from configuration
|
||||||
pprofDir := paramtable.Get().ServiceParam.ProfileCfg.PprofPath.GetValue()
|
pprofDir := paramtable.Get().ServiceParam.ProfileCfg.PprofPath.GetValue()
|
||||||
|
|
||||||
|
// Clean existing directory if not empty
|
||||||
|
if pprofDir != "" {
|
||||||
|
if err := os.RemoveAll(pprofDir); err != nil {
|
||||||
|
log.Error("failed to clean pprof directory",
|
||||||
|
zap.String("path", pprofDir),
|
||||||
|
zap.Error(err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Recreate directory with proper permissions
|
||||||
if err := os.MkdirAll(pprofDir, 0o755); err != nil {
|
if err := os.MkdirAll(pprofDir, 0o755); err != nil {
|
||||||
log.Error("failed to create pprof directory", zap.Error(err))
|
log.Error("failed to create pprof directory",
|
||||||
return exec()
|
zap.String("path", pprofDir),
|
||||||
|
zap.Error(err))
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Generate base file path with timestamp
|
// Generate base file path with timestamp
|
||||||
@ -72,16 +89,6 @@ func dumpPprof(exec func() error) error {
|
|||||||
|
|
||||||
// Define all profile types to be collected
|
// Define all profile types to be collected
|
||||||
profiles := []profileType{
|
profiles := []profileType{
|
||||||
{
|
|
||||||
name: "cpu",
|
|
||||||
filename: baseFilePath + "_cpu.prof",
|
|
||||||
dump: func(f *os.File) error {
|
|
||||||
// Ensure no other CPU profiling is active before starting a new one.
|
|
||||||
// This prevents the "cpu profiling already in use" error.
|
|
||||||
pprof.StopCPUProfile()
|
|
||||||
return pprof.StartCPUProfile(f)
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
name: "goroutine",
|
name: "goroutine",
|
||||||
filename: baseFilePath + "_goroutine.prof",
|
filename: baseFilePath + "_goroutine.prof",
|
||||||
@ -124,7 +131,7 @@ func dumpPprof(exec func() error) error {
|
|||||||
f.Close()
|
f.Close()
|
||||||
os.Remove(filename)
|
os.Remove(filename)
|
||||||
}
|
}
|
||||||
return exec()
|
return
|
||||||
}
|
}
|
||||||
files[p.filename] = f
|
files[p.filename] = f
|
||||||
}
|
}
|
||||||
@ -135,33 +142,11 @@ func dumpPprof(exec func() error) error {
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
// Start CPU profiling
|
for _, p := range profiles {
|
||||||
cpuProfile := profiles[0]
|
if err := p.dump(files[p.filename]); err != nil {
|
||||||
if err := cpuProfile.dump(files[cpuProfile.filename]); err != nil {
|
log.Error("could not write profile",
|
||||||
log.Error("could not start CPU profiling", zap.Error(err))
|
zap.String("profile", p.name),
|
||||||
return exec()
|
zap.Error(err))
|
||||||
}
|
|
||||||
defer pprof.StopCPUProfile()
|
|
||||||
|
|
||||||
// Execute the target function
|
|
||||||
execErr := exec()
|
|
||||||
|
|
||||||
// Only save profiles and collect additional data if execution fails
|
|
||||||
if execErr != nil {
|
|
||||||
// Start from index 1 to skip CPU profile (already running)
|
|
||||||
for _, p := range profiles[1:] {
|
|
||||||
if err := p.dump(files[p.filename]); err != nil {
|
|
||||||
log.Error("could not write profile",
|
|
||||||
zap.String("profile", p.name),
|
|
||||||
zap.Error(err))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Remove all files if execution succeeds
|
|
||||||
for _, p := range profiles {
|
|
||||||
os.Remove(p.filename)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return execErr
|
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user