autonomous-agent-patterns

🕹️ Autonomous Agent Patterns

> Design patterns for building autonomous coding agents, inspired by Cline and OpenAI Codex.

When to Use This Skill

Use this skill when:

Building autonomous AI agents

Designing tool/function calling APIs

Implementing permission and approval systems

Creating browser automation for agents

Designing human-in-the-loop workflows

1. Core Agent Architecture

1.1 Agent Loop

┌─────────────────────────────────────────────────────────────┐
│                     AGENT LOOP                               │
│                                                              │
│  ┌──────────┐    ┌──────────┐    ┌──────────┐              │
│  │  Think   │───▶│  Decide  │───▶│   Act    │              │
│  │ (Reason) │    │ (Plan)   │    │ (Execute)│              │
│  └──────────┘    └──────────┘    └──────────┘              │
│       ▲                               │                     │
│       │         ┌──────────┐          │                     │
│       └─────────│ Observe  │◀─────────┘                     │
│                 │ (Result) │                                │
│                 └──────────┘                                │
└─────────────────────────────────────────────────────────────┘

class AgentLoop:
    def __init__(self, llm, tools, max_iterations=50):
        self.llm = llm
        self.tools = {t.name: t for t in tools}
        self.max_iterations = max_iterations
        self.history = []
    def run(self, task: str) -> str:
        self.history.append({"role": "user", "content": task})
        for i in range(self.max_iterations):
            # Think: Get LLM response with tool options
            response = self.llm.chat(
                messages=self.history,
                tools=self._format_tools(),
                tool_choice="auto"
            )
            # Decide: Check if agent wants to use a tool
            if response.tool_calls:
                for tool_call in response.tool_calls:
                    # Act: Execute the tool
                    result = self._execute_tool(tool_call)
                    # Observe: Add result to history
                    self.history.append({
                        "role": "tool",
                        "tool_call_id": tool_call.id,
                        "content": str(result)
                    })
            else:
                # No more tool calls = task complete
                return response.content
        return "Max iterations reached"    def _execute_tool(self, tool_call) -> Any:
        tool = self.tools[tool_call.name]
        args = json.loads(tool_call.arguments)
        return tool.execute(args)

1.2 Multi-Model Architecture

class MultiModelAgent:
    """
    Use different models for different purposes:
    - Fast model for planning
    - Powerful model for complex reasoning
    - Specialized model for code generation
    """
    def __init__(self):
        self.models = {
            "fast": "gpt-3.5-turbo",      # Quick decisions
            "smart": "gpt-4-turbo",        # Complex reasoning
            "code": "claude-3-sonnet",     # Code generation
        }    def select_model(self, task_type: str) -> str:
        if task_type == "planning":
            return self.models["fast"]
        elif task_type == "analysis":
            return self.models["smart"]
        elif task_type == "code":
            return self.models["code"]
        return self.models["smart"]

2. Tool Design Patterns

2.1 Tool Schema

class Tool:
    """Base class for agent tools"""
    @property
    def schema(self) -> dict:
        """JSON Schema for the tool"""
        return {
            "name": self.name,
            "description": self.description,
            "parameters": {
                "type": "object",
                "properties": self._get_parameters(),
                "required": self._get_required()
            }
        }    def execute(self, kwargs) -> ToolResult:
        """Execute the tool and return result"""
        raise NotImplementedError
class ReadFileTool(Tool):
    name = "read_file"
    description = "Read the contents of a file from the filesystem"
    def _get_parameters(self):
        return {
            "path": {
                "type": "string",
                "description": "Absolute path to the file"
            },
            "start_line": {
                "type": "integer",
                "description": "Line to start reading from (1-indexed)"
            },
            "end_line": {
                "type": "integer",
                "description": "Line to stop reading at (inclusive)"
            }
        }
    def _get_required(self):
        return ["path"]
    def execute(self, path: str, start_line: int = None, end_line: int = None) -> ToolResult:
        try:
            with open(path, 'r') as f:
                lines = f.readlines()
            if start_line and end_line:
                lines = lines[start_line-1:end_line]            return ToolResult(
                success=True,
                output="".join(lines)
            )
        except FileNotFoundError:
            return ToolResult(
                success=False,
                error=f"File not found: {path}"
            )

2.2 Essential Agent Tools

CODING_AGENT_TOOLS = {
    # File operations
    "read_file": "Read file contents",
    "write_file": "Create or overwrite a file",
    "edit_file": "Make targeted edits to a file",
    "list_directory": "List files and folders",
    "search_files": "Search for files by pattern",
    # Code understanding
    "search_code": "Search for code patterns (grep)",
    "get_definition": "Find function/class definition",
    "get_references": "Find all references to a symbol",
    # Terminal
    "run_command": "Execute a shell command",
    "read_output": "Read command output",
    "send_input": "Send input to running command",
    # Browser (optional)
    "open_browser": "Open URL in browser",
    "click_element": "Click on page element",
    "type_text": "Type text into input",
    "screenshot": "Capture screenshot",    # Context
    "ask_user": "Ask the user a question",
    "search_web": "Search the web for information"
}

2.3 Edit Tool Design

class EditFileTool(Tool):
    """
    Precise file editing with conflict detection.
    Uses search/replace pattern for reliable edits.
    """
    name = "edit_file"
    description = "Edit a file by replacing specific content"
    def execute(
        self,
        path: str,
        search: str,
        replace: str,
        expected_occurrences: int = 1
    ) -> ToolResult:
        """
        Args:
            path: File to edit
            search: Exact text to find (must match exactly, including whitespace)
            replace: Text to replace with
            expected_occurrences: How many times search should appear (validation)
        """
        with open(path, 'r') as f:
            content = f.read()
        # Validate
        actual_occurrences = content.count(search)
        if actual_occurrences != expected_occurrences:
            return ToolResult(
                success=False,
                error=f"Expected {expected_occurrences} occurrences, found {actual_occurrences}"
            )
        if actual_occurrences == 0:
            return ToolResult(
                success=False,
                error="Search text not found in file"
            )
        # Apply edit
        new_content = content.replace(search, replace)
        with open(path, 'w') as f:
            f.write(new_content)        return ToolResult(
            success=True,
            output=f"Replaced {actual_occurrences} occurrence(s)"
        )

3. Permission & Safety Patterns

3.1 Permission Levels

class PermissionLevel(Enum):
    # Fully automatic - no user approval needed
    AUTO = "auto"
    # Ask once per session
    ASK_ONCE = "ask_once"
    # Ask every time
    ASK_EACH = "ask_each"
    # Never allow
    NEVER = "never"
PERMISSION_CONFIG = {
    # Low risk - can auto-approve
    "read_file": PermissionLevel.AUTO,
    "list_directory": PermissionLevel.AUTO,
    "search_code": PermissionLevel.AUTO,
    # Medium risk - ask once
    "write_file": PermissionLevel.ASK_ONCE,
    "edit_file": PermissionLevel.ASK_ONCE,
    # High risk - ask each time
    "run_command": PermissionLevel.ASK_EACH,
    "delete_file": PermissionLevel.ASK_EACH,    # Dangerous - never auto-approve
    "sudo_command": PermissionLevel.NEVER,
    "format_disk": PermissionLevel.NEVER
}

3.2 Approval UI Pattern

class ApprovalManager:
    def __init__(self, ui, config):
        self.ui = ui
        self.config = config
        self.session_approvals = {}
    def request_approval(self, tool_name: str, args: dict) -> bool:
        level = self.config.get(tool_name, PermissionLevel.ASK_EACH)
        if level == PermissionLevel.AUTO:
            return True
        if level == PermissionLevel.NEVER:
            self.ui.show_error(f"Tool '{tool_name}' is not allowed")
            return False
        if level == PermissionLevel.ASK_ONCE:
            if tool_name in self.session_approvals:
                return self.session_approvals[tool_name]
        # Show approval dialog
        approved = self.ui.show_approval_dialog(
            tool=tool_name,
            args=args,
            risk_level=self._assess_risk(tool_name, args)
        )
        if level == PermissionLevel.ASK_ONCE:
            self.session_approvals[tool_name] = approved
        return approved    def _assess_risk(self, tool_name: str, args: dict) -> str:
        """Analyze specific call for risk level"""
        if tool_name == "run_command":
            cmd = args.get("command", "")
            if any(danger in cmd for danger in ["rm -rf", "sudo", "chmod"]):
                return "HIGH"
        return "MEDIUM"

3.3 Sandboxing

class SandboxedExecution:
    """
    Execute code/commands in isolated environment
    """
    def __init__(self, workspace_dir: str):
        self.workspace = workspace_dir
        self.allowed_commands = ["npm", "python", "node", "git", "ls", "cat"]
        self.blocked_paths = ["/etc", "/usr", "/bin", os.path.expanduser("~")]
    def validate_path(self, path: str) -> bool:
        """Ensure path is within workspace"""
        real_path = os.path.realpath(path)
        workspace_real = os.path.realpath(self.workspace)
        return real_path.startswith(workspace_real)
    def validate_command(self, command: str) -> bool:
        """Check if command is allowed"""
        cmd_parts = shlex.split(command)
        if not cmd_parts:
            return False
        base_cmd = cmd_parts[0]
        return base_cmd in self.allowed_commands
    def execute_sandboxed(self, command: str) -> ToolResult:
        if not self.validate_command(command):
            return ToolResult(
                success=False,
                error=f"Command not allowed: {command}"
            )
        # Execute in isolated environment
        result = subprocess.run(
            command,
            shell=True,
            cwd=self.workspace,
            capture_output=True,
            timeout=30,
            env={
                **os.environ,
                "HOME": self.workspace,  # Isolate home directory
            }
        )        return ToolResult(
            success=result.returncode == 0,
            output=result.stdout.decode(),
            error=result.stderr.decode() if result.returncode != 0 else None
        )

4. Browser Automation

4.1 Browser Tool Pattern

class BrowserTool:
    """
    Browser automation for agents using Playwright/Puppeteer.
    Enables visual debugging and web testing.
    """
    def __init__(self, headless: bool = True):
        self.browser = None
        self.page = None
        self.headless = headless
    async def open_url(self, url: str) -> ToolResult:
        """Navigate to URL and return page info"""
        if not self.browser:
            self.browser = await playwright.chromium.launch(headless=self.headless)
            self.page = await self.browser.new_page()
        await self.page.goto(url)
        # Capture state
        screenshot = await self.page.screenshot(type='png')
        title = await self.page.title()
        return ToolResult(
            success=True,
            output=f"Loaded: {title}",
            metadata={
                "screenshot": base64.b64encode(screenshot).decode(),
                "url": self.page.url
            }
        )
    async def click(self, selector: str) -> ToolResult:
        """Click on an element"""
        try:
            await self.page.click(selector, timeout=5000)
            await self.page.wait_for_load_state("networkidle")
            screenshot = await self.page.screenshot()
            return ToolResult(
                success=True,
                output=f"Clicked: {selector}",
                metadata={"screenshot": base64.b64encode(screenshot).decode()}
            )
        except TimeoutError:
            return ToolResult(
                success=False,
                error=f"Element not found: {selector}"
            )
    async def type_text(self, selector: str, text: str) -> ToolResult:
        """Type text into an input"""
        await self.page.fill(selector, text)
        return ToolResult(success=True, output=f"Typed into {selector}")
    async def get_page_content(self) -> ToolResult:
        """Get accessible text content of the page"""
        content = await self.page.evaluate("""
            () => {
                // Get visible text
                const walker = document.createTreeWalker(
                    document.body,
                    NodeFilter.SHOW_TEXT,
                    null,
                    false
                );                let text = '';
                while (walker.nextNode()) {
                    const node = walker.currentNode;
                    if (node.textContent.trim()) {
                        text += node.textContent.trim() + '\\n';
                    }
                }
                return text;
            }
        """)
        return ToolResult(success=True, output=content)

4.2 Visual Agent Pattern

class VisualAgent:
    """
    Agent that uses screenshots to understand web pages.
    Can identify elements visually without selectors.
    """
    def __init__(self, llm, browser):
        self.llm = llm
        self.browser = browser
    async def describe_page(self) -> str:
        """Use vision model to describe current page"""
        screenshot = await self.browser.screenshot()
        response = self.llm.chat([
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this webpage. List all interactive elements you see."},
                    {"type": "image", "data": screenshot}
                ]
            }
        ])
        return response.content
    async def find_and_click(self, description: str) -> ToolResult:
        """Find element by visual description and click it"""
        screenshot = await self.browser.screenshot()
        # Ask vision model to find element
        response = self.llm.chat([
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""
                        Find the element matching: "{description}"
                        Return the approximate coordinates as JSON: {{"x": number, "y": number}}
                        """
                    },
                    {"type": "image", "data": screenshot}
                ]
            }
        ])
        coords = json.loads(response.content)
        await self.browser.page.mouse.click(coords["x"], coords["y"])        return ToolResult(success=True, output=f"Clicked at ({coords['x']}, {coords['y']})")

5. Context Management

5.1 Context Injection Patterns

class ContextManager:
    """
    Manage context provided to the agent.
    Inspired by Cline's @-mention patterns.
    """
    def __init__(self, workspace: str):
        self.workspace = workspace
        self.context = []
    def add_file(self, path: str) -> None:
        """@file - Add file contents to context"""
        with open(path, 'r') as f:
            content = f.read()
        self.context.append({
            "type": "file",
            "path": path,
            "content": content
        })
    def add_folder(self, path: str, max_files: int = 20) -> None:
        """@folder - Add all files in folder"""
        for root, dirs, files in os.walk(path):
            for file in files[:max_files]:
                file_path = os.path.join(root, file)
                self.add_file(file_path)
    def add_url(self, url: str) -> None:
        """@url - Fetch and add URL content"""
        response = requests.get(url)
        content = html_to_markdown(response.text)
        self.context.append({
            "type": "url",
            "url": url,
            "content": content
        })
    def add_problems(self, diagnostics: list) -> None:
        """@problems - Add IDE diagnostics"""
        self.context.append({
            "type": "diagnostics",
            "problems": diagnostics
        })    def format_for_prompt(self) -> str:
        """Format all context for LLM prompt"""
        parts = []
        for item in self.context:
            if item["type"] == "file":
                parts.append(f"## File: {item['path']}\n

\n{item['content']}\n`

")
            elif item["type"] == "url":
                parts.append(f"## URL: {item['url']}\n{item['content']}")
            elif item["type"] == "diagnostics":
                parts.append(f"## Problems:\n{json.dumps(item['problems'], indent=2)}")

return "\n\n".join(parts)

### 5.2 Checkpoint/Resume

python
class CheckpointManager:
"""
Save and restore agent state for long-running tasks.
"""

def __init__(self, storage_dir: str):
self.storage_dir = storage_dir
os.makedirs(storage_dir, exist_ok=True)

def save_checkpoint(self, session_id: str, state: dict) -> str:
"""Save current agent state"""
checkpoint = {
"timestamp": datetime.now().isoformat(),
"session_id": session_id,
"history": state["history"],
"context": state["context"],
"workspace_state": self._capture_workspace(state["workspace"]),
"metadata": state.get("metadata", {})
}

path = os.path.join(self.storage_dir, f"{session_id}.json")
with open(path, 'w') as f:
json.dump(checkpoint, f, indent=2)

return path

def restore_checkpoint(self, checkpoint_path: str) -> dict:
"""Restore agent state from checkpoint"""
with open(checkpoint_path, 'r') as f:
checkpoint = json.load(f)

return {
"history": checkpoint["history"],
"context": checkpoint["context"],
"workspace": self._restore_workspace(checkpoint["workspace_state"]),
"metadata": checkpoint["metadata"]
}

def _capture_workspace(self, workspace: str) -> dict:
"""Capture relevant workspace state"""
# Git status, file hashes, etc.
return {
"git_ref": subprocess.getoutput(f"cd {workspace} && git rev-parse HEAD"),
"git_dirty": subprocess.getoutput(f"cd {workspace} && git status --porcelain")
}

---
6. MCP (Model Context Protocol) Integration
6.1 MCP Server Pattern

python
from mcp import Server, Tool

class MCPAgent:
"""
Agent that can dynamically discover and use MCP tools.
'Add a tool that...' pattern from Cline.
"""

def __init__(self, llm):
self.llm = llm
self.mcp_servers = {}
self.available_tools = {}

def connect_server(self, name: str, config: dict) -> None:
"""Connect to an MCP server"""
server = Server(config)
self.mcp_servers[name] = server

# Discover tools
tools = server.list_tools()
for tool in tools:
self.available_tools[tool.name] = {
"server": name,
"schema": tool.schema
}

async def create_tool(self, description: str) -> str:
"""
Create a new MCP server based on user description.
'Add a tool that fetches Jira tickets'
"""
# Generate MCP server code
code = self.llm.generate(f"""
Create a Python MCP server with a tool that does:
{description}

Use the FastMCP framework. Include proper error handling.
Return only the Python code.
""")

# Save and install
server_name = self._extract_name(description)
path = f"./mcp_servers/{server_name}/server.py"

with open(path, 'w') as f:
f.write(code)

# Hot-reload
self.connect_server(server_name, {"path": path})

return f"Created tool: {server_name}"
```

Best Practices Checklist

Agent Design

[ ] Clear task decomposition

[ ] Appropriate tool granularity

[ ] Error handling at each step

[ ] Progress visibility to user

Safety

[ ] Permission system implemented

[ ] Dangerous operations blocked

[ ] Sandbox for untrusted code

[ ] Audit logging enabled

UX

[ ] Approval UI is clear

[ ] Progress updates provided

[ ] Undo/rollback available

[ ] Explanation of actions

Resources

Cline

OpenAI Codex

Model Context Protocol

Anthropic Tool Use