fix: #161 — wall-clock timeout for run_turn_loop; stalled turns now abort with stop_reason='timeout'
Previously, run_turn_loop was bounded only by max_turns (turn count). If
engine.submit_message stalled — slow provider, hung network, infinite
stream — the loop blocked indefinitely with no cancellation path. Claws
calling run_turn_loop in CI or orchestration had no reliable way to
enforce a deadline; the loop would hang until OS kill or human intervention.
Fix:
- Add timeout_seconds parameter to run_turn_loop (default None = legacy unbounded).
- When set, each submit_message call runs inside a ThreadPoolExecutor and is
bounded by the remaining wall-clock budget (total across all turns, not per-turn).
- On timeout, synthesize a TurnResult with stop_reason='timeout' carrying the
turn's prompt and routed matches so transcripts preserve orchestration context.
- Exhausted/negative budget short-circuits before calling submit_message.
- Legacy path (timeout_seconds=None) bypasses the executor entirely — zero
overhead for callers that don't opt in.
CLI:
- Added --timeout-seconds flag to 'turn-loop' command.
- Exit code 2 when the loop terminated on timeout (vs 0 for completed),
so shell scripts can distinguish 'done' from 'budget exhausted'.
Tests (tests/test_run_turn_loop_timeout.py, 6 tests):
- Legacy unbounded path unchanged (timeout_seconds=None never emits 'timeout')
- Hung submit_message aborted within budget (0.3s budget, 5s mock hang → exit <1.5s)
- Budget is cumulative across turns (0.6s budget, 0.4s per turn, not per-turn)
- timeout_seconds=0 short-circuits first turn without calling submit_message
- Negative timeout treated as exhausted (guard against caller bugs)
- Timeout TurnResult carries correct prompt, matches, UsageSummary shape
Full suite: 49/49 passing, zero regression.
Blocker: none. Closes ROADMAP #161.
2026-04-22 17:23:43 +09:00
|
|
|
"""Tests for run_turn_loop wall-clock timeout (ROADMAP #161).
|
|
|
|
|
|
|
|
|
|
Covers:
|
|
|
|
|
- timeout_seconds=None preserves legacy unbounded behaviour
|
|
|
|
|
- timeout_seconds=X aborts a hung turn and emits stop_reason='timeout'
|
|
|
|
|
- Timeout budget is total wall-clock across all turns, not per-turn
|
|
|
|
|
- Already-exhausted budget short-circuits before the first turn runs
|
|
|
|
|
- Legacy path still runs without a ThreadPoolExecutor in the way
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import sys
|
|
|
|
|
import time
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from unittest.mock import patch
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
|
|
|
|
|
|
|
|
from src.models import UsageSummary # noqa: E402
|
|
|
|
|
from src.query_engine import TurnResult # noqa: E402
|
|
|
|
|
from src.runtime import PortRuntime # noqa: E402
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _completed_result(prompt: str) -> TurnResult:
|
|
|
|
|
return TurnResult(
|
|
|
|
|
prompt=prompt,
|
|
|
|
|
output='ok',
|
|
|
|
|
matched_commands=(),
|
|
|
|
|
matched_tools=(),
|
|
|
|
|
permission_denials=(),
|
|
|
|
|
usage=UsageSummary(),
|
|
|
|
|
stop_reason='completed',
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestLegacyUnboundedBehaviour:
|
|
|
|
|
def test_no_timeout_preserves_existing_behaviour(self) -> None:
|
|
|
|
|
"""timeout_seconds=None must not change legacy path at all."""
|
|
|
|
|
results = PortRuntime().run_turn_loop('review MCP tool', max_turns=2)
|
|
|
|
|
assert len(results) >= 1
|
|
|
|
|
for r in results:
|
|
|
|
|
assert r.stop_reason in {'completed', 'max_turns_reached', 'max_budget_reached'}
|
|
|
|
|
assert r.stop_reason != 'timeout'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestTimeoutAbortsHungTurn:
|
|
|
|
|
def test_hung_submit_message_times_out(self) -> None:
|
|
|
|
|
"""A stalled submit_message must be aborted and emit stop_reason='timeout'."""
|
|
|
|
|
runtime = PortRuntime()
|
|
|
|
|
|
fix: #164 Stage A — cooperative cancellation via cancel_event in submit_message
Closes the #161 follow-up gap identified in review: wall-clock timeout
bounded caller-facing wait but did not cancel the underlying provider
thread, which could silently mutate mutable_messages / transcript_store /
permission_denials / total_usage after the caller had already observed
stop_reason='timeout'. A ghost turn committed post-deadline would poison
any session that got persisted afterwards.
Stage A scope (this commit): runtime + engine layer cooperative cancel.
Engine layer (src/query_engine.py):
- submit_message now accepts cancel_event: threading.Event | None = None
- Two safe checkpoints:
1. Entry (before max_turns / budget projection) — earliest possible return
2. Post-budget (after output synthesis, before mutation) — catches cancel
that arrives while output was being computed
- Both checkpoints return stop_reason='cancelled' with state UNCHANGED
(mutable_messages, transcript_store, permission_denials, total_usage
all preserved exactly as on entry)
- cancel_event=None preserves legacy behaviour with zero overhead (no
checkpoint checks at all)
Runtime layer (src/runtime.py):
- run_turn_loop creates one cancel_event per invocation when a deadline
is in play (and None otherwise, preserving legacy fast path)
- Passes the same event to every submit_message call across turns, so a
late cancel on turn N-1 affects turn N
- On timeout (either pre-call or mid-call), runtime explicitly calls
cancel_event.set() before future.cancel() + synthesizing the timeout
TurnResult. This upgrades #161's best-effort future.cancel() (which
only cancels not-yet-started futures) to cooperative mid-flight cancel.
Stop reason taxonomy after Stage A:
'completed' — turn committed, state mutated exactly once
'max_budget_reached' — overflow, state unchanged (#162)
'max_turns_reached' — capacity exceeded, state unchanged
'cancelled' — cancel_event observed, state unchanged (#164 Stage A)
'timeout' — synthesised by runtime, not engine (#161)
The 'cancelled' vs 'timeout' split matters:
- 'timeout' is the runtime's best-effort signal to the caller: deadline hit
- 'cancelled' is the engine's confirmation: cancel was observed + honoured
If the provider call wedges entirely (never reaches a checkpoint), the
caller still sees 'timeout' and the thread is leaked — but any NEXT
submit_message call on the same engine observes the event at entry and
returns 'cancelled' immediately, preventing ghost-turn accumulation.
This is the honest cooperative limit in Python threading land; true
preemption requires async-native provider IO (future work, not Stage A).
Tests (29 new tests, tests/test_submit_message_cancellation.py + tests/
test_run_turn_loop_cancellation.py):
Engine-layer (12 tests):
- TestCancellationBeforeCall (5): pre-set event returns 'cancelled' immediately;
mutable_messages, transcript_store, usage, permission_denials all preserved
- TestCancellationAfterBudgetCheck (1): cancel set mid-call (after projection,
before commit) still honoured; output synthesised but state untouched
- TestCancellationAfterCommit (2): post-commit cancel not observable (honest
limit) BUT next call on same engine observes it + returns 'cancelled'
- TestLegacyCallersUnchanged (3): cancel_event=None preserves #162 atomicity
+ max_turns contract with zero behaviour change
- TestCancellationVsOtherStopReasons (2): cancel precedes max_turns check;
cancel does not retroactively override a completed turn
Runtime-layer (5 tests):
- TestTimeoutPropagatesCancelEvent (3): submit_message receives a real Event
object when deadline is set; None in legacy mode; timeout actually calls
event.set() so in-flight threads observe at their next checkpoint
- TestCancelEventSharedAcrossTurns (1): same event object passed to every
turn (object identity check) — late cancel on turn N-1 must affect turn N
Regression: 3 existing timeout test mocks updated to accept cancel_event
kwarg (mocks that previously had signature (prompt, commands, tools, denials)
now have (prompt, commands, tools, denials, cancel_event=None) since runtime
passes cancel_event positionally on the timeout path).
Full suite: 97 → 114 passing, zero regression.
Closes ROADMAP #164 Stage A.
What's explicitly NOT in Stage A:
- Preemptive cancellation of wedged provider IO (requires asyncio-native
provider path; larger refactor)
- Timeout on the legacy unbounded run_turn_loop path (by design: legacy
callers opt out of cancellation entirely)
- CLI exposure of 'cancelled' as a distinct exit code (currently 'cancelled'
maps to the same stop_reason != 'completed' break condition as others;
CLI surface for cancel is a separate pinpoint if warranted)
2026-04-22 18:14:14 +09:00
|
|
|
# #164 Stage A: runtime now passes cancel_event as a 5th positional
|
|
|
|
|
# arg on the timeout path, so mocks must accept it (even if they ignore it).
|
|
|
|
|
def _hang(prompt, commands, tools, denials, cancel_event=None):
|
fix: #161 — wall-clock timeout for run_turn_loop; stalled turns now abort with stop_reason='timeout'
Previously, run_turn_loop was bounded only by max_turns (turn count). If
engine.submit_message stalled — slow provider, hung network, infinite
stream — the loop blocked indefinitely with no cancellation path. Claws
calling run_turn_loop in CI or orchestration had no reliable way to
enforce a deadline; the loop would hang until OS kill or human intervention.
Fix:
- Add timeout_seconds parameter to run_turn_loop (default None = legacy unbounded).
- When set, each submit_message call runs inside a ThreadPoolExecutor and is
bounded by the remaining wall-clock budget (total across all turns, not per-turn).
- On timeout, synthesize a TurnResult with stop_reason='timeout' carrying the
turn's prompt and routed matches so transcripts preserve orchestration context.
- Exhausted/negative budget short-circuits before calling submit_message.
- Legacy path (timeout_seconds=None) bypasses the executor entirely — zero
overhead for callers that don't opt in.
CLI:
- Added --timeout-seconds flag to 'turn-loop' command.
- Exit code 2 when the loop terminated on timeout (vs 0 for completed),
so shell scripts can distinguish 'done' from 'budget exhausted'.
Tests (tests/test_run_turn_loop_timeout.py, 6 tests):
- Legacy unbounded path unchanged (timeout_seconds=None never emits 'timeout')
- Hung submit_message aborted within budget (0.3s budget, 5s mock hang → exit <1.5s)
- Budget is cumulative across turns (0.6s budget, 0.4s per turn, not per-turn)
- timeout_seconds=0 short-circuits first turn without calling submit_message
- Negative timeout treated as exhausted (guard against caller bugs)
- Timeout TurnResult carries correct prompt, matches, UsageSummary shape
Full suite: 49/49 passing, zero regression.
Blocker: none. Closes ROADMAP #161.
2026-04-22 17:23:43 +09:00
|
|
|
time.sleep(5.0) # would block the loop
|
|
|
|
|
return _completed_result(prompt)
|
|
|
|
|
|
|
|
|
|
with patch('src.runtime.QueryEnginePort.from_workspace') as mock_factory:
|
|
|
|
|
engine = mock_factory.return_value
|
|
|
|
|
engine.config = None # attribute-assigned in run_turn_loop
|
|
|
|
|
engine.submit_message.side_effect = _hang
|
|
|
|
|
|
|
|
|
|
start = time.monotonic()
|
|
|
|
|
results = runtime.run_turn_loop(
|
|
|
|
|
'review MCP tool', max_turns=3, timeout_seconds=0.3
|
|
|
|
|
)
|
|
|
|
|
elapsed = time.monotonic() - start
|
|
|
|
|
|
|
|
|
|
# Must exit well under the 5s hang
|
|
|
|
|
assert elapsed < 1.5, f'run_turn_loop did not honor timeout: {elapsed:.2f}s'
|
|
|
|
|
assert len(results) == 1
|
|
|
|
|
assert results[-1].stop_reason == 'timeout'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestTimeoutBudgetIsTotal:
|
|
|
|
|
def test_budget_is_cumulative_across_turns(self) -> None:
|
fix: #163 — remove [turn N] suffix pollution from run_turn_loop; file #164 timeout-cancellation followup
#163: run_turn_loop no longer injects f'{prompt} [turn N]' into follow-up
prompts. The suffix was never defined or interpreted anywhere — not by the
engine, not by the system prompt, not by any LLM. It looked like a real
user-typed annotation in the transcript and made replay/analysis fragile.
New behaviour:
- turn 0 submits the original prompt (unchanged)
- turn > 0 submits caller-supplied continuation_prompt if provided, else
the loop stops cleanly — no fabricated user turn
- added continuation_prompt: str | None = None parameter to run_turn_loop
- added --continuation-prompt CLI flag for claws scripting multi-turn loops
- zero '[turn' strings ever appear in mutable_messages or stdout now
Behaviour change for existing callers:
- Before: run_turn_loop(prompt, max_turns=3) submitted 3 turns
('prompt', 'prompt [turn 2]', 'prompt [turn 3]')
- After: run_turn_loop(prompt, max_turns=3) submits 1 turn ('prompt')
- To preserve old multi-turn behaviour, pass continuation_prompt='Continue.'
or any structured follow-up text
One existing timeout test (test_budget_is_cumulative_across_turns) updated
to pass continuation_prompt so the cumulative-budget contract is actually
exercised across turns instead of trivially satisfied by a one-turn loop.
#164 filed: addresses reviewer feedback on #161. The wall-clock timeout
bounds the caller-facing wait, but the underlying submit_message worker
thread keeps running and can mutate engine state after the timeout
TurnResult is returned. A cooperative cancel_event pattern is sketched in
the pinpoint; real asyncio.Task.cancel() support will come once provider
IO is async-native (larger refactor).
Tests (tests/test_run_turn_loop_continuation.py, 8 tests):
- TestNoTurnSuffixInjection (2): zero '[turn' strings in any submitted
prompt, both default and explicit-continuation paths
- TestContinuationDefaultStopsAfterTurnZero (2): default loops run exactly
one turn; engine.submit_message called exactly once despite max_turns=10
- TestExplicitContinuationBehaviour (2): turn 0 = original, turn N = continuation
verbatim; max_turns still respected
- TestCLIContinuationFlag (2): CLI default emits only '## Turn 1';
--continuation-prompt wires through to multi-turn behaviour
Full suite: 67/67 passing.
Closes ROADMAP #163. Files #164.
2026-04-22 17:37:22 +09:00
|
|
|
"""timeout_seconds is total wall-clock across all turns, not per-turn.
|
|
|
|
|
|
|
|
|
|
#163 interaction: multi-turn behaviour now requires an explicit
|
|
|
|
|
``continuation_prompt``; otherwise the loop stops after turn 0 and
|
|
|
|
|
the cumulative-budget contract is trivially satisfied. We supply one
|
|
|
|
|
here so the test actually exercises the cross-turn deadline.
|
|
|
|
|
"""
|
fix: #161 — wall-clock timeout for run_turn_loop; stalled turns now abort with stop_reason='timeout'
Previously, run_turn_loop was bounded only by max_turns (turn count). If
engine.submit_message stalled — slow provider, hung network, infinite
stream — the loop blocked indefinitely with no cancellation path. Claws
calling run_turn_loop in CI or orchestration had no reliable way to
enforce a deadline; the loop would hang until OS kill or human intervention.
Fix:
- Add timeout_seconds parameter to run_turn_loop (default None = legacy unbounded).
- When set, each submit_message call runs inside a ThreadPoolExecutor and is
bounded by the remaining wall-clock budget (total across all turns, not per-turn).
- On timeout, synthesize a TurnResult with stop_reason='timeout' carrying the
turn's prompt and routed matches so transcripts preserve orchestration context.
- Exhausted/negative budget short-circuits before calling submit_message.
- Legacy path (timeout_seconds=None) bypasses the executor entirely — zero
overhead for callers that don't opt in.
CLI:
- Added --timeout-seconds flag to 'turn-loop' command.
- Exit code 2 when the loop terminated on timeout (vs 0 for completed),
so shell scripts can distinguish 'done' from 'budget exhausted'.
Tests (tests/test_run_turn_loop_timeout.py, 6 tests):
- Legacy unbounded path unchanged (timeout_seconds=None never emits 'timeout')
- Hung submit_message aborted within budget (0.3s budget, 5s mock hang → exit <1.5s)
- Budget is cumulative across turns (0.6s budget, 0.4s per turn, not per-turn)
- timeout_seconds=0 short-circuits first turn without calling submit_message
- Negative timeout treated as exhausted (guard against caller bugs)
- Timeout TurnResult carries correct prompt, matches, UsageSummary shape
Full suite: 49/49 passing, zero regression.
Blocker: none. Closes ROADMAP #161.
2026-04-22 17:23:43 +09:00
|
|
|
runtime = PortRuntime()
|
|
|
|
|
call_count = {'n': 0}
|
|
|
|
|
|
fix: #164 Stage A — cooperative cancellation via cancel_event in submit_message
Closes the #161 follow-up gap identified in review: wall-clock timeout
bounded caller-facing wait but did not cancel the underlying provider
thread, which could silently mutate mutable_messages / transcript_store /
permission_denials / total_usage after the caller had already observed
stop_reason='timeout'. A ghost turn committed post-deadline would poison
any session that got persisted afterwards.
Stage A scope (this commit): runtime + engine layer cooperative cancel.
Engine layer (src/query_engine.py):
- submit_message now accepts cancel_event: threading.Event | None = None
- Two safe checkpoints:
1. Entry (before max_turns / budget projection) — earliest possible return
2. Post-budget (after output synthesis, before mutation) — catches cancel
that arrives while output was being computed
- Both checkpoints return stop_reason='cancelled' with state UNCHANGED
(mutable_messages, transcript_store, permission_denials, total_usage
all preserved exactly as on entry)
- cancel_event=None preserves legacy behaviour with zero overhead (no
checkpoint checks at all)
Runtime layer (src/runtime.py):
- run_turn_loop creates one cancel_event per invocation when a deadline
is in play (and None otherwise, preserving legacy fast path)
- Passes the same event to every submit_message call across turns, so a
late cancel on turn N-1 affects turn N
- On timeout (either pre-call or mid-call), runtime explicitly calls
cancel_event.set() before future.cancel() + synthesizing the timeout
TurnResult. This upgrades #161's best-effort future.cancel() (which
only cancels not-yet-started futures) to cooperative mid-flight cancel.
Stop reason taxonomy after Stage A:
'completed' — turn committed, state mutated exactly once
'max_budget_reached' — overflow, state unchanged (#162)
'max_turns_reached' — capacity exceeded, state unchanged
'cancelled' — cancel_event observed, state unchanged (#164 Stage A)
'timeout' — synthesised by runtime, not engine (#161)
The 'cancelled' vs 'timeout' split matters:
- 'timeout' is the runtime's best-effort signal to the caller: deadline hit
- 'cancelled' is the engine's confirmation: cancel was observed + honoured
If the provider call wedges entirely (never reaches a checkpoint), the
caller still sees 'timeout' and the thread is leaked — but any NEXT
submit_message call on the same engine observes the event at entry and
returns 'cancelled' immediately, preventing ghost-turn accumulation.
This is the honest cooperative limit in Python threading land; true
preemption requires async-native provider IO (future work, not Stage A).
Tests (29 new tests, tests/test_submit_message_cancellation.py + tests/
test_run_turn_loop_cancellation.py):
Engine-layer (12 tests):
- TestCancellationBeforeCall (5): pre-set event returns 'cancelled' immediately;
mutable_messages, transcript_store, usage, permission_denials all preserved
- TestCancellationAfterBudgetCheck (1): cancel set mid-call (after projection,
before commit) still honoured; output synthesised but state untouched
- TestCancellationAfterCommit (2): post-commit cancel not observable (honest
limit) BUT next call on same engine observes it + returns 'cancelled'
- TestLegacyCallersUnchanged (3): cancel_event=None preserves #162 atomicity
+ max_turns contract with zero behaviour change
- TestCancellationVsOtherStopReasons (2): cancel precedes max_turns check;
cancel does not retroactively override a completed turn
Runtime-layer (5 tests):
- TestTimeoutPropagatesCancelEvent (3): submit_message receives a real Event
object when deadline is set; None in legacy mode; timeout actually calls
event.set() so in-flight threads observe at their next checkpoint
- TestCancelEventSharedAcrossTurns (1): same event object passed to every
turn (object identity check) — late cancel on turn N-1 must affect turn N
Regression: 3 existing timeout test mocks updated to accept cancel_event
kwarg (mocks that previously had signature (prompt, commands, tools, denials)
now have (prompt, commands, tools, denials, cancel_event=None) since runtime
passes cancel_event positionally on the timeout path).
Full suite: 97 → 114 passing, zero regression.
Closes ROADMAP #164 Stage A.
What's explicitly NOT in Stage A:
- Preemptive cancellation of wedged provider IO (requires asyncio-native
provider path; larger refactor)
- Timeout on the legacy unbounded run_turn_loop path (by design: legacy
callers opt out of cancellation entirely)
- CLI exposure of 'cancelled' as a distinct exit code (currently 'cancelled'
maps to the same stop_reason != 'completed' break condition as others;
CLI surface for cancel is a separate pinpoint if warranted)
2026-04-22 18:14:14 +09:00
|
|
|
def _slow(prompt, commands, tools, denials, cancel_event=None):
|
fix: #161 — wall-clock timeout for run_turn_loop; stalled turns now abort with stop_reason='timeout'
Previously, run_turn_loop was bounded only by max_turns (turn count). If
engine.submit_message stalled — slow provider, hung network, infinite
stream — the loop blocked indefinitely with no cancellation path. Claws
calling run_turn_loop in CI or orchestration had no reliable way to
enforce a deadline; the loop would hang until OS kill or human intervention.
Fix:
- Add timeout_seconds parameter to run_turn_loop (default None = legacy unbounded).
- When set, each submit_message call runs inside a ThreadPoolExecutor and is
bounded by the remaining wall-clock budget (total across all turns, not per-turn).
- On timeout, synthesize a TurnResult with stop_reason='timeout' carrying the
turn's prompt and routed matches so transcripts preserve orchestration context.
- Exhausted/negative budget short-circuits before calling submit_message.
- Legacy path (timeout_seconds=None) bypasses the executor entirely — zero
overhead for callers that don't opt in.
CLI:
- Added --timeout-seconds flag to 'turn-loop' command.
- Exit code 2 when the loop terminated on timeout (vs 0 for completed),
so shell scripts can distinguish 'done' from 'budget exhausted'.
Tests (tests/test_run_turn_loop_timeout.py, 6 tests):
- Legacy unbounded path unchanged (timeout_seconds=None never emits 'timeout')
- Hung submit_message aborted within budget (0.3s budget, 5s mock hang → exit <1.5s)
- Budget is cumulative across turns (0.6s budget, 0.4s per turn, not per-turn)
- timeout_seconds=0 short-circuits first turn without calling submit_message
- Negative timeout treated as exhausted (guard against caller bugs)
- Timeout TurnResult carries correct prompt, matches, UsageSummary shape
Full suite: 49/49 passing, zero regression.
Blocker: none. Closes ROADMAP #161.
2026-04-22 17:23:43 +09:00
|
|
|
call_count['n'] += 1
|
|
|
|
|
time.sleep(0.4) # each turn burns 0.4s
|
|
|
|
|
return _completed_result(prompt)
|
|
|
|
|
|
|
|
|
|
with patch('src.runtime.QueryEnginePort.from_workspace') as mock_factory:
|
|
|
|
|
engine = mock_factory.return_value
|
|
|
|
|
engine.submit_message.side_effect = _slow
|
|
|
|
|
|
|
|
|
|
start = time.monotonic()
|
|
|
|
|
# 0.6s budget, 0.4s per turn. First turn completes (~0.4s),
|
|
|
|
|
# second turn times out before finishing.
|
|
|
|
|
results = runtime.run_turn_loop(
|
fix: #163 — remove [turn N] suffix pollution from run_turn_loop; file #164 timeout-cancellation followup
#163: run_turn_loop no longer injects f'{prompt} [turn N]' into follow-up
prompts. The suffix was never defined or interpreted anywhere — not by the
engine, not by the system prompt, not by any LLM. It looked like a real
user-typed annotation in the transcript and made replay/analysis fragile.
New behaviour:
- turn 0 submits the original prompt (unchanged)
- turn > 0 submits caller-supplied continuation_prompt if provided, else
the loop stops cleanly — no fabricated user turn
- added continuation_prompt: str | None = None parameter to run_turn_loop
- added --continuation-prompt CLI flag for claws scripting multi-turn loops
- zero '[turn' strings ever appear in mutable_messages or stdout now
Behaviour change for existing callers:
- Before: run_turn_loop(prompt, max_turns=3) submitted 3 turns
('prompt', 'prompt [turn 2]', 'prompt [turn 3]')
- After: run_turn_loop(prompt, max_turns=3) submits 1 turn ('prompt')
- To preserve old multi-turn behaviour, pass continuation_prompt='Continue.'
or any structured follow-up text
One existing timeout test (test_budget_is_cumulative_across_turns) updated
to pass continuation_prompt so the cumulative-budget contract is actually
exercised across turns instead of trivially satisfied by a one-turn loop.
#164 filed: addresses reviewer feedback on #161. The wall-clock timeout
bounds the caller-facing wait, but the underlying submit_message worker
thread keeps running and can mutate engine state after the timeout
TurnResult is returned. A cooperative cancel_event pattern is sketched in
the pinpoint; real asyncio.Task.cancel() support will come once provider
IO is async-native (larger refactor).
Tests (tests/test_run_turn_loop_continuation.py, 8 tests):
- TestNoTurnSuffixInjection (2): zero '[turn' strings in any submitted
prompt, both default and explicit-continuation paths
- TestContinuationDefaultStopsAfterTurnZero (2): default loops run exactly
one turn; engine.submit_message called exactly once despite max_turns=10
- TestExplicitContinuationBehaviour (2): turn 0 = original, turn N = continuation
verbatim; max_turns still respected
- TestCLIContinuationFlag (2): CLI default emits only '## Turn 1';
--continuation-prompt wires through to multi-turn behaviour
Full suite: 67/67 passing.
Closes ROADMAP #163. Files #164.
2026-04-22 17:37:22 +09:00
|
|
|
'review MCP tool',
|
|
|
|
|
max_turns=5,
|
|
|
|
|
timeout_seconds=0.6,
|
|
|
|
|
continuation_prompt='continue',
|
fix: #161 — wall-clock timeout for run_turn_loop; stalled turns now abort with stop_reason='timeout'
Previously, run_turn_loop was bounded only by max_turns (turn count). If
engine.submit_message stalled — slow provider, hung network, infinite
stream — the loop blocked indefinitely with no cancellation path. Claws
calling run_turn_loop in CI or orchestration had no reliable way to
enforce a deadline; the loop would hang until OS kill or human intervention.
Fix:
- Add timeout_seconds parameter to run_turn_loop (default None = legacy unbounded).
- When set, each submit_message call runs inside a ThreadPoolExecutor and is
bounded by the remaining wall-clock budget (total across all turns, not per-turn).
- On timeout, synthesize a TurnResult with stop_reason='timeout' carrying the
turn's prompt and routed matches so transcripts preserve orchestration context.
- Exhausted/negative budget short-circuits before calling submit_message.
- Legacy path (timeout_seconds=None) bypasses the executor entirely — zero
overhead for callers that don't opt in.
CLI:
- Added --timeout-seconds flag to 'turn-loop' command.
- Exit code 2 when the loop terminated on timeout (vs 0 for completed),
so shell scripts can distinguish 'done' from 'budget exhausted'.
Tests (tests/test_run_turn_loop_timeout.py, 6 tests):
- Legacy unbounded path unchanged (timeout_seconds=None never emits 'timeout')
- Hung submit_message aborted within budget (0.3s budget, 5s mock hang → exit <1.5s)
- Budget is cumulative across turns (0.6s budget, 0.4s per turn, not per-turn)
- timeout_seconds=0 short-circuits first turn without calling submit_message
- Negative timeout treated as exhausted (guard against caller bugs)
- Timeout TurnResult carries correct prompt, matches, UsageSummary shape
Full suite: 49/49 passing, zero regression.
Blocker: none. Closes ROADMAP #161.
2026-04-22 17:23:43 +09:00
|
|
|
)
|
|
|
|
|
elapsed = time.monotonic() - start
|
|
|
|
|
|
|
|
|
|
# Should exit at around 0.6s, not 2.0s (5 turns * 0.4s)
|
|
|
|
|
assert elapsed < 1.5, f'cumulative budget not honored: {elapsed:.2f}s'
|
|
|
|
|
# Last result should be the timeout
|
|
|
|
|
assert results[-1].stop_reason == 'timeout'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestExhaustedBudget:
|
|
|
|
|
def test_zero_timeout_short_circuits_first_turn(self) -> None:
|
|
|
|
|
"""timeout_seconds=0 emits timeout before the first submit_message call."""
|
|
|
|
|
runtime = PortRuntime()
|
|
|
|
|
|
|
|
|
|
with patch('src.runtime.QueryEnginePort.from_workspace') as mock_factory:
|
|
|
|
|
engine = mock_factory.return_value
|
|
|
|
|
# submit_message should never be called when budget is already 0
|
|
|
|
|
engine.submit_message.side_effect = AssertionError(
|
|
|
|
|
'submit_message should not run when budget is exhausted'
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
results = runtime.run_turn_loop(
|
|
|
|
|
'review MCP tool', max_turns=3, timeout_seconds=0.0
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
assert len(results) == 1
|
|
|
|
|
assert results[0].stop_reason == 'timeout'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestTimeoutResultShape:
|
|
|
|
|
def test_timeout_result_has_correct_prompt_and_matches(self) -> None:
|
|
|
|
|
"""Synthetic TurnResult on timeout must carry the turn's prompt + routed matches."""
|
|
|
|
|
runtime = PortRuntime()
|
|
|
|
|
|
fix: #164 Stage A — cooperative cancellation via cancel_event in submit_message
Closes the #161 follow-up gap identified in review: wall-clock timeout
bounded caller-facing wait but did not cancel the underlying provider
thread, which could silently mutate mutable_messages / transcript_store /
permission_denials / total_usage after the caller had already observed
stop_reason='timeout'. A ghost turn committed post-deadline would poison
any session that got persisted afterwards.
Stage A scope (this commit): runtime + engine layer cooperative cancel.
Engine layer (src/query_engine.py):
- submit_message now accepts cancel_event: threading.Event | None = None
- Two safe checkpoints:
1. Entry (before max_turns / budget projection) — earliest possible return
2. Post-budget (after output synthesis, before mutation) — catches cancel
that arrives while output was being computed
- Both checkpoints return stop_reason='cancelled' with state UNCHANGED
(mutable_messages, transcript_store, permission_denials, total_usage
all preserved exactly as on entry)
- cancel_event=None preserves legacy behaviour with zero overhead (no
checkpoint checks at all)
Runtime layer (src/runtime.py):
- run_turn_loop creates one cancel_event per invocation when a deadline
is in play (and None otherwise, preserving legacy fast path)
- Passes the same event to every submit_message call across turns, so a
late cancel on turn N-1 affects turn N
- On timeout (either pre-call or mid-call), runtime explicitly calls
cancel_event.set() before future.cancel() + synthesizing the timeout
TurnResult. This upgrades #161's best-effort future.cancel() (which
only cancels not-yet-started futures) to cooperative mid-flight cancel.
Stop reason taxonomy after Stage A:
'completed' — turn committed, state mutated exactly once
'max_budget_reached' — overflow, state unchanged (#162)
'max_turns_reached' — capacity exceeded, state unchanged
'cancelled' — cancel_event observed, state unchanged (#164 Stage A)
'timeout' — synthesised by runtime, not engine (#161)
The 'cancelled' vs 'timeout' split matters:
- 'timeout' is the runtime's best-effort signal to the caller: deadline hit
- 'cancelled' is the engine's confirmation: cancel was observed + honoured
If the provider call wedges entirely (never reaches a checkpoint), the
caller still sees 'timeout' and the thread is leaked — but any NEXT
submit_message call on the same engine observes the event at entry and
returns 'cancelled' immediately, preventing ghost-turn accumulation.
This is the honest cooperative limit in Python threading land; true
preemption requires async-native provider IO (future work, not Stage A).
Tests (29 new tests, tests/test_submit_message_cancellation.py + tests/
test_run_turn_loop_cancellation.py):
Engine-layer (12 tests):
- TestCancellationBeforeCall (5): pre-set event returns 'cancelled' immediately;
mutable_messages, transcript_store, usage, permission_denials all preserved
- TestCancellationAfterBudgetCheck (1): cancel set mid-call (after projection,
before commit) still honoured; output synthesised but state untouched
- TestCancellationAfterCommit (2): post-commit cancel not observable (honest
limit) BUT next call on same engine observes it + returns 'cancelled'
- TestLegacyCallersUnchanged (3): cancel_event=None preserves #162 atomicity
+ max_turns contract with zero behaviour change
- TestCancellationVsOtherStopReasons (2): cancel precedes max_turns check;
cancel does not retroactively override a completed turn
Runtime-layer (5 tests):
- TestTimeoutPropagatesCancelEvent (3): submit_message receives a real Event
object when deadline is set; None in legacy mode; timeout actually calls
event.set() so in-flight threads observe at their next checkpoint
- TestCancelEventSharedAcrossTurns (1): same event object passed to every
turn (object identity check) — late cancel on turn N-1 must affect turn N
Regression: 3 existing timeout test mocks updated to accept cancel_event
kwarg (mocks that previously had signature (prompt, commands, tools, denials)
now have (prompt, commands, tools, denials, cancel_event=None) since runtime
passes cancel_event positionally on the timeout path).
Full suite: 97 → 114 passing, zero regression.
Closes ROADMAP #164 Stage A.
What's explicitly NOT in Stage A:
- Preemptive cancellation of wedged provider IO (requires asyncio-native
provider path; larger refactor)
- Timeout on the legacy unbounded run_turn_loop path (by design: legacy
callers opt out of cancellation entirely)
- CLI exposure of 'cancelled' as a distinct exit code (currently 'cancelled'
maps to the same stop_reason != 'completed' break condition as others;
CLI surface for cancel is a separate pinpoint if warranted)
2026-04-22 18:14:14 +09:00
|
|
|
def _hang(prompt, commands, tools, denials, cancel_event=None):
|
fix: #161 — wall-clock timeout for run_turn_loop; stalled turns now abort with stop_reason='timeout'
Previously, run_turn_loop was bounded only by max_turns (turn count). If
engine.submit_message stalled — slow provider, hung network, infinite
stream — the loop blocked indefinitely with no cancellation path. Claws
calling run_turn_loop in CI or orchestration had no reliable way to
enforce a deadline; the loop would hang until OS kill or human intervention.
Fix:
- Add timeout_seconds parameter to run_turn_loop (default None = legacy unbounded).
- When set, each submit_message call runs inside a ThreadPoolExecutor and is
bounded by the remaining wall-clock budget (total across all turns, not per-turn).
- On timeout, synthesize a TurnResult with stop_reason='timeout' carrying the
turn's prompt and routed matches so transcripts preserve orchestration context.
- Exhausted/negative budget short-circuits before calling submit_message.
- Legacy path (timeout_seconds=None) bypasses the executor entirely — zero
overhead for callers that don't opt in.
CLI:
- Added --timeout-seconds flag to 'turn-loop' command.
- Exit code 2 when the loop terminated on timeout (vs 0 for completed),
so shell scripts can distinguish 'done' from 'budget exhausted'.
Tests (tests/test_run_turn_loop_timeout.py, 6 tests):
- Legacy unbounded path unchanged (timeout_seconds=None never emits 'timeout')
- Hung submit_message aborted within budget (0.3s budget, 5s mock hang → exit <1.5s)
- Budget is cumulative across turns (0.6s budget, 0.4s per turn, not per-turn)
- timeout_seconds=0 short-circuits first turn without calling submit_message
- Negative timeout treated as exhausted (guard against caller bugs)
- Timeout TurnResult carries correct prompt, matches, UsageSummary shape
Full suite: 49/49 passing, zero regression.
Blocker: none. Closes ROADMAP #161.
2026-04-22 17:23:43 +09:00
|
|
|
time.sleep(5.0)
|
|
|
|
|
return _completed_result(prompt)
|
|
|
|
|
|
|
|
|
|
with patch('src.runtime.QueryEnginePort.from_workspace') as mock_factory:
|
|
|
|
|
engine = mock_factory.return_value
|
|
|
|
|
engine.submit_message.side_effect = _hang
|
|
|
|
|
|
|
|
|
|
results = runtime.run_turn_loop(
|
|
|
|
|
'review MCP tool', max_turns=2, timeout_seconds=0.2
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
timeout_result = results[-1]
|
|
|
|
|
assert timeout_result.stop_reason == 'timeout'
|
|
|
|
|
assert timeout_result.prompt == 'review MCP tool'
|
|
|
|
|
# matched_commands / matched_tools should still be populated from routing,
|
|
|
|
|
# so downstream transcripts don't lose the routing context.
|
|
|
|
|
# These may be empty tuples depending on routing; they must be tuples.
|
|
|
|
|
assert isinstance(timeout_result.matched_commands, tuple)
|
|
|
|
|
assert isinstance(timeout_result.matched_tools, tuple)
|
|
|
|
|
assert isinstance(timeout_result.usage, UsageSummary)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestNegativeTimeoutTreatedAsExhausted:
|
|
|
|
|
def test_negative_timeout_short_circuits(self) -> None:
|
|
|
|
|
"""A negative budget should behave identically to exhausted."""
|
|
|
|
|
runtime = PortRuntime()
|
|
|
|
|
|
|
|
|
|
with patch('src.runtime.QueryEnginePort.from_workspace') as mock_factory:
|
|
|
|
|
engine = mock_factory.return_value
|
|
|
|
|
engine.submit_message.side_effect = AssertionError(
|
|
|
|
|
'submit_message should not run when budget is negative'
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
results = runtime.run_turn_loop(
|
|
|
|
|
'review MCP tool', max_turns=3, timeout_seconds=-1.0
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
assert len(results) == 1
|
|
|
|
|
assert results[0].stop_reason == 'timeout'
|