# Error Handling Best Practices

Comprehensive guide to robust error handling, graceful degradation, and recovery strategies for Agent Patterns.

## Overview

Robust error handling is critical for production agent systems. This guide covers:

- Exception handling patterns
- Graceful degradation strategies
- Retry logic and resilience
- Error logging and monitoring
- Recovery mechanisms

## Common Error Types

### 1. LLM API Errors

```python
from openai import APIError, APITimeoutError, RateLimitError
from anthropic import AnthropicError

try:
    result = agent.run(task)
except RateLimitError as e:
    # Rate limit exceeded
    print(f"Rate limit error: {e}")
    # Implement backoff and retry
except APITimeoutError as e:
    # Request timeout. APITimeoutError is a subclass of APIError, so it
    # must be caught before the generic APIError handler below.
    print(f"Timeout: {e}")
    # Retry with longer timeout
except APIError as e:
    # API service error
    print(f"API error: {e}")
    # Log and potentially retry
except AnthropicError as e:
    # Error raised by the Anthropic SDK
    print(f"Anthropic error: {e}")
    # Log and potentially retry
except Exception as e:
    # Catch-all
    print(f"Unexpected error: {e}")
```

### 2. Configuration Errors

```python
from agent_patterns.patterns import ReActAgent

try:
    agent = ReActAgent(
        llm_configs={
            "thinking": {
                "provider": "invalid_provider",  # Error!
                "model": "gpt-4"
            }
        },
        tools=tools
    )
except ValueError as e:
    print(f"Configuration error: {e}")
    # Use fallback configuration
    agent = ReActAgent(
        llm_configs=get_default_config(),
        tools=tools
    )
```

### 3. Tool Execution Errors

```python
import functools


def safe_tool_wrapper(tool_func):
    """Wrap a tool so that failures are returned as text instead of raised.

    Args:
        tool_func: The tool callable to protect.

    Returns:
        A callable with the same signature as ``tool_func``. It returns the
        tool's result on success, or a ``"Tool error: ..."`` string
        describing the exception on failure.
    """
    # functools.wraps keeps the tool's __name__ and __doc__ on the wrapper.
    @functools.wraps(tool_func)
    def wrapper(*args, **kwargs):
        try:
            return tool_func(*args, **kwargs)
        except Exception as e:
            return f"Tool error: {type(e).__name__}: {e}"
    return wrapper

# Use wrapped tools
tools = {
    "search": safe_tool_wrapper(search_function),
    "calculate": safe_tool_wrapper(calculate_function)
}
```

## Error Handling Patterns

### Try-Catch with Fallback

```python
def run_with_fallback(agent, task, fallback_response="Unable to process request"):
    """Run agent with fallback on error.

    Returns the result of ``agent.run(task)``, or ``fallback_response`` if
    the agent raises any exception.
    """
    try:
        return agent.run(task)
    except Exception as e:
        print(f"Error: {e}")
        return fallback_response

result = run_with_fallback(agent, task)
```

### Retry with Exponential Backoff

```python
import time

from openai import APITimeoutError, RateLimitError


def run_with_retry(agent, task, max_retries=3, base_delay=1):
    """Run agent with exponential backoff retry.

    Args:
        agent: Object exposing ``run(task)``.
        task: Input passed to ``agent.run``.
        max_retries: Total number of attempts; must be at least 1.
        base_delay: Delay in seconds before the first retry. The delay
            doubles after every failed attempt.

    Returns:
        The result of the first successful ``agent.run(task)`` call.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        RateLimitError, APITimeoutError: If the final attempt still fails.
    """
    if max_retries < 1:
        # Without this check the loop body never runs and None is returned.
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        try:
            return agent.run(task)
        except (RateLimitError, APITimeoutError):
            if attempt == max_retries - 1:
                print("Max retries reached")
                raise
            delay = base_delay * (2 ** attempt)
            print(f"Attempt {attempt + 1} failed, retrying in {delay}s...")
            time.sleep(delay)
        # Any other exception is non-retryable and propagates immediately.

result = run_with_retry(agent, task)
```

### Circuit Breaker Pattern

```python
import time


class CircuitBreakerOpenError(Exception):
    """Raised when a call is rejected because the circuit is open."""


class CircuitBreaker:
    """Circuit breaker for agent execution.

    The breaker opens after ``failure_threshold`` consecutive failures and
    rejects calls for ``timeout`` seconds. After that, one trial call is
    allowed (half-open): success closes the circuit, failure re-opens it.
    """

    def __init__(self, failure_threshold=5, timeout=60):
        self.failure_threshold = failure_threshold
        self.timeout = timeout
        self.failures = 0
        self.last_failure_time = None
        self.state = "closed"  # closed, open, half-open

    def call(self, func, *args, **kwargs):
        """Invoke ``func(*args, **kwargs)`` through the breaker.

        Raises:
            CircuitBreakerOpenError: If the circuit is open and the
                cool-down period has not elapsed yet.
            Exception: Whatever ``func`` raises is re-raised unchanged.
        """
        if self.state == "open":
            # Monotonic time is immune to system clock adjustments.
            if time.monotonic() - self.last_failure_time > self.timeout:
                self.state = "half-open"
            else:
                raise CircuitBreakerOpenError("Circuit breaker is OPEN")

        try:
            result = func(*args, **kwargs)
        except Exception:
            self.failures += 1
            self.last_failure_time = time.monotonic()
            if self.failures >= self.failure_threshold:
                self.state = "open"
            raise

        # Any success resets the count, so only consecutive failures trip
        # the breaker; a successful half-open trial also closes the circuit.
        self.failures = 0
        self.state = "closed"
        return result

# Usage
breaker = CircuitBreaker()
result = breaker.call(agent.run, task)
```

### Graceful Degradation

```python
def run_with_degradation(task):
    """Try progressively simpler approaches on failure.

    Order of attempts: SelfDiscoveryAgent, then ReflectionAgent, then a
    plain chat model. Returns the first successful result, or a fixed
    "unavailable" message if every tier fails.
    """
    # Try full agent first
    try:
        complex_agent = SelfDiscoveryAgent(
            llm_configs=high_quality_config,
            max_selected_modules=5
        )
        return complex_agent.run(task)
    except Exception as e:
        print(f"Complex agent failed: {e}")

    # Fallback to simpler agent
    try:
        simple_agent = ReflectionAgent(
            llm_configs=standard_config,
            max_reflection_cycles=1
        )
        return simple_agent.run(task)
    except Exception as e:
        print(f"Simple agent failed: {e}")

    # Final fallback to basic LLM
    try:
        from langchain_openai import ChatOpenAI
        llm = ChatOpenAI(model="gpt-3.5-turbo")
        return llm.invoke(task).content
    except Exception as e:
        # Report the failure like the other tiers instead of hiding it.
        print(f"Basic LLM failed: {e}")
        return "Service temporarily unavailable"
```

## Logging and Monitoring

### Structured Logging

```python
import logging
import json

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class LoggingAgent(ReActAgent):
    """Agent with comprehensive logging."""

    def on_start(self, input_data):
        """Log when agent starts."""
        logger.info(json.dumps({
            "event": "agent_start",
            "pattern": self.__class__.__name__,
            "input_length": len(str(input_data))
        }))

    def on_finish(self, result):
        """Log when agent finishes."""
        logger.info(json.dumps({
            "event": "agent_finish",
            "pattern": self.__class__.__name__,
            "output_length": len(str(result))
        }))

    def on_error(self, error):
        """Log errors."""
        # Pass the exception itself: exc_info=True only captures a traceback
        # when called from inside an active ``except`` block.
        logger.error(json.dumps({
            "event": "agent_error",
            "pattern": self.__class__.__name__,
            "error_type": type(error).__name__,
            "error_message": str(error)
        }), exc_info=error)
```

### Metrics Collection

```python
import time
from dataclasses import dataclass
from typing import Optional

@dataclass
class AgentMetrics:
    """Metrics for agent execution."""
    start_time: float
    end_time: Optional[float] = None
    success: bool = False
    error: Optional[str] = None
    iterations: int = 0
    llm_calls: int = 0

class MetricsCollectingAgent(ReActAgent):
    """Agent that collects execution metrics."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.metrics = AgentMetrics(start_time=time.time())

    def run(self, input_data):
        """Run the agent and record timing and outcome for this run."""
        # Start from a clean record so a previous run's success/error
        # values cannot leak into this run's metrics.
        self.metrics = AgentMetrics(start_time=time.time())
        try:
            result = super().run(input_data)
        except Exception as e:
            self.metrics.error = str(e)
            raise
        else:
            self.metrics.success = True
            return result
        finally:
            self.metrics.end_time = time.time()
            self._log_metrics()

    def _log_metrics(self):
        """Emit the current metrics as one JSON log line."""
        duration = self.metrics.end_time - self.metrics.start_time
        logger.info(json.dumps({
            "duration_seconds": duration,
            "success": self.metrics.success,
            "error": self.metrics.error,
            "llm_calls": self.metrics.llm_calls
        }))
```

## Validation and Safety

### Input Validation

```python
def validate_input(task: str, max_length: int = 10000) -> str:
    """Validate and sanitize input."""
    if not task or not task.strip():
        raise ValueError("Task cannot be empty")

    if len(task) > max_length:
        raise ValueError(f"Task too long (max {max_length} characters)")

    # Remove potentially harmful content
    forbidden_patterns = ["