llvm-project/lldb/test/API/functionalities/single-thread-step/TestSingleThreadStepTimeout.py
jeffreytan81 f838fa820f
New ThreadPlanSingleThreadTimeout to resolve potential deadlock in single thread stepping (#90930)
This PR introduces a new `ThreadPlanSingleThreadTimeout` that will be
used to address potential deadlock during single-thread stepping.

While debugging a target with a non-trivial number of threads (around
5000 threads in one example target), we noticed that a simple step over
can take as long as 10 seconds. Enabling single-thread stepping mode
significantly reduces the stepping time to around 3 seconds. However,
this can introduce deadlock if we try to step over a method that depends
on other threads to release a lock.

To address this issue, we introduce a new
`ThreadPlanSingleThreadTimeout` that can be controlled by the
`target.process.thread.single-thread-plan-timeout` setting during
single-thread stepping mode. The concept involves counting the elapsed
time since the last internal stop to detect overall stepping progress.
Once a timeout occurs, we assume the target is not making progress due
to a potential deadlock, as mentioned above. We then send a new async
interrupt, resume all threads, and `ThreadPlanSingleThreadTimeout`
completes its task.

To support this design, the major changes made in this PR are:
1. `ThreadPlanSingleThreadTimeout` is popped during every internal stop
and reset (re-pushed) to the top of the stack (as a leaf node) during
resume. This is achieved by always returning `true` from
`ThreadPlanSingleThreadTimeout::DoPlanExplainsStop()` and
`ThreadPlanSingleThreadTimeout::MischiefManaged()`.
2. A new thread-specific async interrupt stop is introduced, which can
be detected/consumed by `ThreadPlanSingleThreadTimeout`.
3. The clearing of branch breakpoints in the range thread plan has been
moved from `DoPlanExplainsStop()` to `ShouldStop()`, because
`DoPlanExplainsStop()` is not guaranteed to be called.

The detailed design is discussed in the RFC below:

[https://discourse.llvm.org/t/improve-single-thread-stepping/74599](https://discourse.llvm.org/t/improve-single-thread-stepping/74599)

---------

Co-authored-by: jeffreytan81 <jeffreytan@fb.com>
2024-08-05 17:26:39 -07:00

255 lines
11 KiB
Python

"""
Test that single thread step over deadlock issue can be resolved
after timeout.
"""
import time

import lldb
from lldbsuite.test.decorators import *
from lldbsuite.test.lldbtest import *
from lldbsuite.test import lldbutil
class SingleThreadStepTimeoutTestCase(TestBase):
    """Step-over tests for single-thread stepping with a plan timeout.

    Every test stops at a marker comment in ``main.cpp``, issues
    ``StepOver(lldb.eOnlyThisThread)`` and checks where (and why) the
    thread stops, under different values of the
    ``target.process.thread.single-thread-plan-timeout`` and
    ``target.use-fast-stepping`` settings.
    """

    NO_DEBUG_INFO_TESTCASE = True

    def setUp(self):
        """Run the base-class setup, record the source file name and build."""
        TestBase.setUp(self)
        self.main_source = "main.cpp"
        self.build()

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    def _configure_stepping(self, timeout, fast_stepping=None):
        """Apply the settings under test through the command interpreter.

        timeout: integer written verbatim to
            ``target.process.thread.single-thread-plan-timeout``
            (presumably milliseconds -- confirm against the LLDB settings
            documentation).
        fast_stepping: ``True``/``False`` sets ``target.use-fast-stepping``
            accordingly; ``None`` leaves that setting untouched.
        """
        self.dbg.HandleCommand(
            "settings set target.process.thread.single-thread-plan-timeout %d"
            % timeout
        )
        if fast_stepping is not None:
            self.dbg.HandleCommand(
                "settings set target.use-fast-stepping "
                + ("true" if fast_stepping else "false")
            )

    def _stop_at_breakpoint1_and_release_worker(self):
        """Run to ``// Set breakpoint1 here`` and set ``signal_main_thread``.

        Stores the stopped thread in ``self.thread`` and returns the
        ``(target, process)`` pair for callers that need them.
        """
        (target, process, self.thread, _) = lldbutil.run_to_source_breakpoint(
            self, "// Set breakpoint1 here", lldb.SBFileSpec(self.main_source)
        )

        signal_main_thread_value = target.FindFirstGlobalVariable("signal_main_thread")
        self.assertTrue(signal_main_thread_value.IsValid())

        # Change signal_main_thread global variable to 1 so that worker thread
        # loop can terminate and move forward to signal main thread
        signal_main_thread_value.SetValueFromCString("1")
        return target, process

    def _step_over_into_inner_breakpoint(self, target, process):
        """Set the inner breakpoint, step over, and check that we stop at it.

        The breakpoint is placed on the ``// Set interrupt breakpoint here``
        line; the step-over issued from breakpoint1 is expected to be cut
        short by that breakpoint rather than completing.
        """
        # Positional arguments after the line number follow the
        # SBTarget.BreakpointCreateByLocation overload taking column, offset,
        # module list and move-to-nearest-code (confirm against the SB API).
        inner_breakpoint = target.BreakpointCreateByLocation(
            lldb.SBFileSpec(self.main_source),
            line_number(self.main_source, "// Set interrupt breakpoint here"),
            0,
            0,
            lldb.SBFileSpecList(),
            False,
        )

        # Step over will hit the inner breakpoint and stop
        self.thread.StepOver(lldb.eOnlyThisThread)
        self.assertStopReason(self.thread.GetStopReason(), lldb.eStopReasonBreakpoint)
        thread1 = lldbutil.get_one_thread_stopped_at_breakpoint(
            process, inner_breakpoint
        )
        self.assertTrue(
            thread1.IsValid(),
            "We are indeed stopped at inner breakpoint inside deadlock_func",
        )

    def verify_hit_correct_line(self, pattern):
        """Assert that frame 0 of ``self.thread`` is on the line matching *pattern*.

        pattern: text searched for in ``self.main_source`` to obtain the
            expected line number.
        """
        target_line = line_number(self.main_source, pattern)
        self.assertNotEqual(target_line, 0, "Could not find source pattern " + pattern)
        cur_line = self.thread.frames[0].GetLineEntry().GetLine()
        self.assertEqual(
            cur_line,
            target_line,
            "Stepped to line %d instead of expected %d with pattern '%s'."
            % (cur_line, target_line, pattern),
        )

    def step_over_deadlock_helper(self):
        """Step over from breakpoint1 and check the step finishes on the expected line."""
        self._stop_at_breakpoint1_and_release_worker()

        self.thread.StepOver(lldb.eOnlyThisThread)
        self.verify_hit_correct_line("// Finish step-over from breakpoint1")

    def step_over_multi_calls_helper(self):
        """Step over from breakpoint2 and check the step finishes on the expected line."""
        (target, _, self.thread, _) = lldbutil.run_to_source_breakpoint(
            self, "// Set breakpoint2 here", lldb.SBFileSpec(self.main_source)
        )
        self.thread.StepOver(lldb.eOnlyThisThread)
        self.verify_hit_correct_line("// Finish step-over from breakpoint2")

    # ------------------------------------------------------------------
    # Step over a call that deadlocks unless other threads run
    # ------------------------------------------------------------------

    @skipIfWindows
    def test_step_over_deadlock_small_timeout_fast_stepping(self):
        """Test single thread step over deadlock on other threads can be resolved after timeout with small timeout and fast stepping."""
        self._configure_stepping(10, fast_stepping=True)
        self.step_over_deadlock_helper()

    @skipIfWindows
    def test_step_over_deadlock_small_timeout_slow_stepping(self):
        """Test single thread step over deadlock on other threads can be resolved after timeout with small timeout and slow stepping."""
        self._configure_stepping(10, fast_stepping=False)
        self.step_over_deadlock_helper()

    @skipIfWindows
    def test_step_over_deadlock_large_timeout_fast_stepping(self):
        """Test single thread step over deadlock on other threads can be resolved after timeout with large timeout and fast stepping."""
        self._configure_stepping(2000, fast_stepping=True)
        self.step_over_deadlock_helper()

    @skipIfWindows
    def test_step_over_deadlock_large_timeout_slow_stepping(self):
        """Test single thread step over deadlock on other threads can be resolved after timeout with large timeout and slow stepping."""
        self._configure_stepping(2000, fast_stepping=False)
        self.step_over_deadlock_helper()

    # ------------------------------------------------------------------
    # Step over a source line containing several calls
    # ------------------------------------------------------------------

    @skipIfWindows
    def test_step_over_multi_calls_small_timeout_fast_stepping(self):
        """Test step over source line with multiple call instructions works fine with small timeout and fast stepping."""
        self._configure_stepping(10, fast_stepping=True)
        self.step_over_multi_calls_helper()

    @skipIfWindows
    def test_step_over_multi_calls_small_timeout_slow_stepping(self):
        """Test step over source line with multiple call instructions works fine with small timeout and slow stepping."""
        self._configure_stepping(10, fast_stepping=False)
        self.step_over_multi_calls_helper()

    @skipIfWindows
    def test_step_over_multi_calls_large_timeout_fast_stepping(self):
        """Test step over source line with multiple call instructions works fine with large timeout and fast stepping."""
        self._configure_stepping(2000, fast_stepping=True)
        self.step_over_multi_calls_helper()

    @skipIfWindows
    def test_step_over_multi_calls_large_timeout_slow_stepping(self):
        """Test step over source line with multiple call instructions works fine with large timeout and slow stepping."""
        self._configure_stepping(2000, fast_stepping=False)
        self.step_over_multi_calls_helper()

    # ------------------------------------------------------------------
    # Step-over interrupted by a breakpoint or by the user
    # ------------------------------------------------------------------

    @skipIfWindows
    def test_step_over_deadlock_with_inner_breakpoint_continue(self):
        """Test step over deadlock function with inner breakpoint will trigger the breakpoint
        and later continue will finish the stepping.
        """
        self._configure_stepping(2000)
        target, process = self._stop_at_breakpoint1_and_release_worker()
        self._step_over_into_inner_breakpoint(target, process)

        # Continue the process should complete the step-over
        process.Continue()
        self.assertState(process.GetState(), lldb.eStateStopped)
        self.assertStopReason(self.thread.GetStopReason(), lldb.eStopReasonPlanComplete)

        self.verify_hit_correct_line("// Finish step-over from breakpoint1")

    @skipIfWindows
    def test_step_over_deadlock_with_inner_breakpoint_step(self):
        """Test step over deadlock function with inner breakpoint will trigger the breakpoint
        and later step still works
        """
        self._configure_stepping(2000)
        target, process = self._stop_at_breakpoint1_and_release_worker()
        self._step_over_into_inner_breakpoint(target, process)

        # Step still works
        self.thread.StepOver(lldb.eOnlyThisThread)
        self.assertState(process.GetState(), lldb.eStateStopped)
        self.assertStopReason(self.thread.GetStopReason(), lldb.eStopReasonPlanComplete)

        self.verify_hit_correct_line("// Finish step-over from inner breakpoint")

    @skipIfWindows
    def test_step_over_deadlock_with_user_async_interrupt(self):
        """Test step over deadlock function with large timeout then send async interrupt
        should report correct stop reason
        """
        self._configure_stepping(2000000)
        _, process = self._stop_at_breakpoint1_and_release_worker()

        # Async mode makes StepOver return without waiting for the stop, so
        # the test can observe the running state and interrupt it.
        self.dbg.SetAsync(True)

        # With such a large timeout the step is expected to still be blocked
        # when the interrupt below is sent.
        self.thread.StepOver(lldb.eOnlyThisThread)
        time.sleep(1)

        listener = self.dbg.GetListener()
        lldbutil.expect_state_changes(self, listener, process, [lldb.eStateRunning])
        self.dbg.SetAsync(False)

        process.SendAsyncInterrupt()

        lldbutil.expect_state_changes(self, listener, process, [lldb.eStateStopped])
        self.assertStopReason(self.thread.GetStopReason(), lldb.eStopReasonSignal)