Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor of error handling #4575

Draft
wants to merge 23 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 23 additions & 34 deletions openhands/controller/agent_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import traceback
from typing import Type

import litellm

from openhands.controller.agent import Agent
from openhands.controller.state.state import State, TrafficControlState
from openhands.controller.stuck import StuckDetector
Expand Down Expand Up @@ -140,17 +138,15 @@ async def update_state_after_step(self):
# update metrics especially for cost. Use deepcopy to avoid it being modified by agent.reset()
self.state.local_metrics = copy.deepcopy(self.agent.llm.metrics)

async def _react_to_error(
async def _react_to_exception(
self,
message: str,
new_state: AgentState | None = None,
e: Exception,
new_state: AgentState = AgentState.ERROR,
):
if new_state is not None:
# it's important to set the state before adding the error event, so that metrics sync properly
await self.set_agent_state_to(new_state)
await self.event_stream.async_add_event(
ErrorObservation(message), EventSource.AGENT
)
raise e
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is all that's left of this method... it seems we don't really need it, maybe we can set the agent_state in the catch block 🤔
Is this state ever different than ERROR?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is one (one...) PAUSED


async def start_step_loop(self):
"""The main loop for the agent's step-by-step execution."""
Expand All @@ -165,16 +161,7 @@ async def start_step_loop(self):
except Exception as e:
traceback.print_exc()
self.log('error', f'Error while running the agent: {e}')
self.log('error', traceback.format_exc())
detail = str(e)
if isinstance(e, litellm.AuthenticationError):
detail += (
'\nPlease check your credentials. Is your API key correct?'
)
await self._react_to_error(
detail,
new_state=AgentState.ERROR,
)
await self._react_to_exception(e)
break

await asyncio.sleep(0.1)
Expand Down Expand Up @@ -246,8 +233,6 @@ async def _handle_observation(self, observation: Observation):
if isinstance(observation, AgentDelegateObservation):
self.state.history.on_event(observation)
elif isinstance(observation, ErrorObservation):
if observation.fatal:
await self.set_agent_state_to(AgentState.ERROR)
if self.state.agent_state == AgentState.ERROR:
self.state.metrics.merge(self.state.local_metrics)

Expand Down Expand Up @@ -439,9 +424,12 @@ async def _step(self) -> None:
if action is None:
raise LLMNoActionError('No action was returned')
except (LLMMalformedActionError, LLMNoActionError, LLMResponseError) as e:
await self._react_to_error(
str(e), new_state=None
) # don't change state, LLM can correct itself
await self.event_stream.async_add_event(
ErrorObservation(
content=str(e),
),
EventSource.AGENT,
)
return

if action.runnable:
Expand All @@ -467,9 +455,7 @@ async def _step(self) -> None:
self.log('debug', str(action), extra={'msg_type': 'ACTION'})

if self._is_stuck():
await self._react_to_error(
'Agent got stuck in a loop', new_state=AgentState.ERROR
)
await self._react_to_exception(RuntimeError('Agent got stuck in a loop'))

async def _delegate_step(self):
"""Executes a single step of the delegate agent."""
Expand All @@ -488,8 +474,9 @@ async def _delegate_step(self):
self.delegate = None
self.delegateAction = None

await self._react_to_error(
'Delegator agent encountered an error', new_state=None
await self.event_stream.async_add_event(
ErrorObservation('Delegate agent encountered an error'),
EventSource.AGENT,
)
elif delegate_state in (AgentState.FINISHED, AgentState.REJECTED):
self.log('debug', 'Delegate agent has finished execution')
Expand Down Expand Up @@ -539,16 +526,18 @@ async def _handle_traffic_control(
else:
self.state.traffic_control_state = TrafficControlState.THROTTLING
if self.headless_mode:
await self._react_to_error(
f'Agent reached maximum {limit_type} in headless mode, task stopped. '
f'Current {limit_type}: {current_value:.2f}, max {limit_type}: {max_value:.2f}',
new_state=AgentState.ERROR,
e = RuntimeError(
f'Agent reached maximum {limit_type} in headless mode. '
f'Current {limit_type}: {current_value:.2f}, max {limit_type}: {max_value:.2f}'
)
await self._react_to_exception(e)
else:
await self._react_to_error(
e = RuntimeError(
f'Agent reached maximum {limit_type}, task paused. '
f'Current {limit_type}: {current_value:.2f}, max {limit_type}: {max_value:.2f}. '
f'{TRAFFIC_CONTROL_REMINDER}',
)
await self._react_to_exception(
e,
new_state=AgentState.PAUSED,
)
stop_step = True
Expand Down
7 changes: 1 addition & 6 deletions openhands/controller/state/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,12 +157,7 @@ def __setstate__(self, state):
def get_last_error(self) -> str:
for event in self.history.get_events(reverse=True):
if isinstance(event, ErrorObservation):
return (
event.content
if not event.fatal
else 'There was a fatal error during agent execution: '
+ event.content
)
return event.content
return ''

def get_current_user_intent(self):
Expand Down
1 change: 0 additions & 1 deletion openhands/events/observation/error.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ class ErrorObservation(Observation):
E.g., Linter error after editing a file.
"""

fatal: bool = False
observation: str = ObservationType.ERROR

@property
Expand Down
8 changes: 3 additions & 5 deletions openhands/runtime/impl/eventstream/eventstream_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,12 +466,11 @@ def run_action(self, action: Action) -> Observation:
action_type = action.action # type: ignore[attr-defined]
if action_type not in ACTION_TYPE_TO_CLASS:
return ErrorObservation(
f'Action {action_type} does not exist.', fatal=True
f'Action {action_type} does not exist.',
)
if not hasattr(self, action_type):
return ErrorObservation(
f'Action {action_type} is not supported in the current runtime.',
fatal=True,
)
if (
getattr(action, 'confirmation_state', None)
Expand Down Expand Up @@ -503,17 +502,16 @@ def run_action(self, action: Action) -> Observation:
error_message = response.text
self.log('error', f'Error from server: {error_message}')
obs = ErrorObservation(
f'Action execution failed: {error_message}', fatal=True
f'Action execution failed: {error_message}',
)
except requests.Timeout:
self.log('error', 'No response received within the timeout period.')
obs = ErrorObservation(
f'Action execution timed out after {action.timeout} seconds.',
fatal=True,
)
except Exception as e:
self.log('error', f'Error during action execution: {e}')
obs = ErrorObservation(f'Action execution failed: {str(e)}', fatal=True)
obs = ErrorObservation(f'Action execution failed: {str(e)}')
self._refresh_logs()
return obs

Expand Down
8 changes: 1 addition & 7 deletions openhands/runtime/impl/remote/remote_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,12 +363,10 @@ def run_action(self, action: Action) -> Observation:
if action_type not in ACTION_TYPE_TO_CLASS:
return ErrorObservation(
f'[Runtime (ID={self.runtime_id})] Action {action_type} does not exist.',
fatal=True,
)
if not hasattr(self, action_type):
return ErrorObservation(
f'[Runtime (ID={self.runtime_id})] Action {action_type} is not supported in the current runtime.',
fatal=True,
)

assert action.timeout is not None
Expand All @@ -391,20 +389,16 @@ def run_action(self, action: Action) -> Observation:
else:
error_message = response.text
self.log('error', f'Error from server: {error_message}')
obs = ErrorObservation(
f'Action execution failed: {error_message}', fatal=True
)
obs = ErrorObservation(f'Action execution failed: {error_message}')
except Timeout:
self.log('error', 'No response received within the timeout period.')
obs = ErrorObservation(
f'[Runtime (ID={self.runtime_id})] Action execution timed out',
fatal=True,
)
except Exception as e:
self.log('error', f'Error during action execution: {e}')
obs = ErrorObservation(
f'[Runtime (ID={self.runtime_id})] Action execution failed: {str(e)}',
fatal=True,
)
return obs

Expand Down
1 change: 0 additions & 1 deletion openhands/runtime/utils/bash.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,5 +331,4 @@ def run(self, action: CmdRunAction) -> CmdOutputObservation | ErrorObservation:
except UnicodeDecodeError as e:
return ErrorObservation(
f'Runtime bash execution failed: Command output could not be decoded as utf-8. {str(e)}',
fatal=True,
)
6 changes: 2 additions & 4 deletions openhands/runtime/utils/edit.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,7 @@ def edit(self, action: FileEditAction) -> Observation:
return obs
if not isinstance(obs, FileWriteObservation):
return ErrorObservation(

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: Use consistent naming for error handling to improve readability.

Existing Code:

return ErrorObservation(
    f'Fatal Runtime in editing: Expected FileWriteObservation, got {type(obs)}: {str(obs)}',
)

Improved Code:

return ErrorObservation(
    f'Error in editing: Expected FileWriteObservation, got {type(obs)}: {str(obs)}',
)

Details:
Consider using a consistent naming convention for error handling to improve readability and maintainability.

f'Fatal Runtime in editing: Expected FileWriteObservation, got {type(obs)}: {str(obs)}',
fatal=True,
f'Expected FileWriteObservation, got {type(obs)}: {str(obs)}',
)
return FileEditObservation(
content=get_diff('', action.content, action.path),
Expand All @@ -226,8 +225,7 @@ def edit(self, action: FileEditAction) -> Observation:
)
if not isinstance(obs, FileReadObservation):
return ErrorObservation(
f'Fatal Runtime in editing: Expected FileReadObservation, got {type(obs)}: {str(obs)}',
fatal=True,
f'Expected FileReadObservation, got {type(obs)}: {str(obs)}',
)

original_file_content = obs.content
Expand Down
Loading