autonomous-agent-patterns

构建自主编码代理的设计模式。涵盖工具集成、权限系统、浏览器自动化及人机协同工作流。适用于开发AI代理、设计工具API、实施权限系统或创建自主编码助手场景。

查看详情
name: autonomous-agent-patterns
description: "Design patterns for building autonomous coding agents. Covers tool integration, permission systems, browser automation, and human-in-the-loop workflows. Use when building AI agents, designing tool APIs, implementing permission systems, or creating autonomous coding assistants."

🕹️ Autonomous Agent Patterns

> Design patterns for building autonomous coding agents, inspired by Cline and OpenAI Codex.

When to Use This Skill

Use this skill when:

  • Building autonomous AI agents

  • Designing tool/function calling APIs

  • Implementing permission and approval systems

  • Creating browser automation for agents

  • Designing human-in-the-loop workflows

    1. Core Agent Architecture

    1.1 Agent Loop

    ┌─────────────────────────────────────────────────────────────┐
    │                         AGENT LOOP                          │
    │                                                             │
    │   ┌──────────┐    ┌──────────┐    ┌──────────┐              │
    │   │  Think   │───▶│  Decide  │───▶│   Act    │              │
    │   │ (Reason) │    │  (Plan)  │    │ (Execute)│              │
    │   └──────────┘    └──────────┘    └──────────┘              │
    │        ▲                               │                    │
    │        │          ┌──────────┐         │                    │
    │        └──────────│ Observe  │◀────────┘                    │
    │                   │ (Result) │                              │
    │                   └──────────┘                              │
    └─────────────────────────────────────────────────────────────┘

    class AgentLoop:
        """Drive an LLM through a think -> decide -> act -> observe cycle.

        Attributes:
            llm: Chat client. It is called as
                ``chat(messages=..., tools=..., tool_choice="auto")`` and the
                reply is read through its ``content`` and ``tool_calls``
                attributes.
            tools: Mapping of tool name to tool object, built from the
                ``name`` attribute of each tool passed in.
            max_iterations: Upper bound on LLM round-trips per ``run`` call.
            history: Running list of chat messages; it persists across
                ``run`` calls on the same instance.
        """

        def __init__(self, llm, tools, max_iterations=50):
            self.llm = llm
            self.tools = {t.name: t for t in tools}
            self.max_iterations = max_iterations
            self.history = []

        def run(self, task: str) -> str:
            """Work on ``task`` until the model stops requesting tools.

            Args:
                task: The user request that starts the conversation.

            Returns:
                The model's final ``content``, or the literal string
                ``"Max iterations reached"`` when the budget is exhausted.
            """
            self.history.append({"role": "user", "content": task})

            for _ in range(self.max_iterations):
                # Think: Get LLM response with tool options
                response = self.llm.chat(
                    messages=self.history,
                    tools=self._format_tools(),
                    tool_choice="auto"
                )

                # Decide: no tool calls means the task is complete
                if not response.tool_calls:
                    return response.content

                # Record the assistant turn that requested the tools. Without
                # it the "tool" messages below refer to tool_call_ids that do
                # not exist anywhere in the history sent back to the model.
                self.history.append({
                    "role": "assistant",
                    "content": response.content,
                    "tool_calls": response.tool_calls
                })

                for tool_call in response.tool_calls:
                    # Act: Execute the tool
                    result = self._execute_tool(tool_call)

                    # Observe: Add result to history
                    self.history.append({
                        "role": "tool",
                        "tool_call_id": tool_call.id,
                        "content": str(result)
                    })

            return "Max iterations reached"

        def _format_tools(self) -> list:
            """Return the ``schema`` of every registered tool, in registration order."""
            return [tool.schema for tool in self.tools.values()]

        def _execute_tool(self, tool_call) -> Any:
            """Run one requested tool call and return its result.

            Problems with the request itself (unknown tool name, malformed
            JSON arguments) are returned as an error string instead of being
            raised, so the model sees the failure as an observation and the
            loop keeps running.
            """
            tool = self.tools.get(tool_call.name)
            if tool is None:
                return f"Error: unknown tool '{tool_call.name}'"

            try:
                args = json.loads(tool_call.arguments)
            except json.JSONDecodeError as exc:
                return f"Error: invalid JSON arguments for '{tool_call.name}': {exc}"
            if not isinstance(args, dict):
                return f"Error: arguments for '{tool_call.name}' must be a JSON object"

            # Tools declare named parameters (path=..., search=...), so the
            # decoded object is expanded into keyword arguments.
            return tool.execute(**args)

    1.2 Multi-Model Architecture

    class MultiModelAgent:
        """Pick a model name according to the kind of work being done.

        Three tiers are configured in ``self.models``:
        - "fast" for planning
        - "smart" for analysis, and as the fallback for any other task type
        - "code" for code generation
        """

        def __init__(self):
            self.models = dict(
                fast="gpt-3.5-turbo",  # Quick decisions
                smart="gpt-4-turbo",  # Complex reasoning
                code="claude-3-sonnet",  # Code generation
            )

        def select_model(self, task_type: str) -> str:
            """Return the model name configured for ``task_type``.

            Unrecognised task types fall back to the "smart" tier.
            """
            routing = (
                ("planning", "fast"),
                ("analysis", "smart"),
                ("code", "code"),
            )
            for known_task, tier in routing:
                if task_type == known_task:
                    return self.models[tier]
            return self.models["smart"]


    2. Tool Design Patterns

    2.1 Tool Schema

    class Tool:
        """Base class for agent tools.

        Subclasses set the ``name`` and ``description`` class attributes and
        override ``execute``. They override ``_get_parameters`` and
        ``_get_required`` when the tool takes arguments; the defaults describe
        a tool with no parameters, so ``schema`` works on every subclass.
        """

        name: str
        description: str

        @property
        def schema(self) -> dict:
            """JSON Schema for the tool"""
            return {
                "name": self.name,
                "description": self.description,
                "parameters": {
                    "type": "object",
                    "properties": self._get_parameters(),
                    "required": self._get_required()
                }
            }

        def _get_parameters(self) -> dict:
            """Return the JSON Schema ``properties`` mapping (empty by default)."""
            return {}

        def _get_required(self) -> list:
            """Return the names of required parameters (none by default)."""
            return []

        def execute(self, **kwargs) -> ToolResult:
            """Execute the tool and return result.

            Arguments arrive as keywords so that subclasses can declare them
            as ordinary named parameters.

            Raises:
                NotImplementedError: Always, in this base class.
            """
            raise NotImplementedError

    class ReadFileTool(Tool):
        """Tool that returns the text of a file, optionally limited to a line range."""

        name = "read_file"
        description = "Read the contents of a file from the filesystem"

        def _get_parameters(self):
            """Describe the ``path``, ``start_line`` and ``end_line`` arguments."""
            return {
                "path": {
                    "type": "string",
                    "description": "Absolute path to the file"
                },
                "start_line": {
                    "type": "integer",
                    "description": "Line to start reading from (1-indexed)"
                },
                "end_line": {
                    "type": "integer",
                    "description": "Line to stop reading at (inclusive)"
                }
            }

        def _get_required(self):
            """Only ``path`` is mandatory."""
            return ["path"]

        def execute(self, path: str, start_line: int = None, end_line: int = None) -> ToolResult:
            """Read ``path`` and return its text.

            Args:
                path: File to read.
                start_line: First line to include (1-indexed). Defaults to the
                    first line of the file.
                end_line: Last line to include (inclusive). Defaults to the
                    last line of the file.

            Returns:
                A successful ``ToolResult`` whose ``output`` is the selected
                text, or a failed one carrying an ``error`` message when the
                file cannot be read.
            """
            try:
                with open(path, 'r') as f:
                    lines = f.readlines()
            except FileNotFoundError:
                return ToolResult(
                    success=False,
                    error=f"File not found: {path}"
                )
            except (OSError, UnicodeDecodeError) as exc:
                # Directories, permission problems and undecodable files are
                # reported to the agent instead of crashing it.
                return ToolResult(
                    success=False,
                    error=f"Could not read {path}: {exc}"
                )

            # Each bound is honoured on its own, so passing only start_line
            # or only end_line is no longer silently ignored.
            if start_line or end_line:
                first = max((start_line or 1) - 1, 0)
                last = end_line if end_line else None
                lines = lines[first:last]

            return ToolResult(
                success=True,
                output="".join(lines)
            )

    2.2 Essential Agent Tools

    # Catalogue of tool names mapped to a one-line description of each,
    # grouped by the area of work the tool covers.
    CODING_AGENT_TOOLS = {
        # File operations
        "read_file": "Read file contents",
        "write_file": "Create or overwrite a file",
        "edit_file": "Make targeted edits to a file",
        "list_directory": "List files and folders",
        "search_files": "Search for files by pattern",

        # Code understanding
        "search_code": "Search for code patterns (grep)",
        "get_definition": "Find function/class definition",
        "get_references": "Find all references to a symbol",

        # Terminal
        "run_command": "Execute a shell command",
        "read_output": "Read command output",
        "send_input": "Send input to running command",

        # Browser (optional)
        "open_browser": "Open URL in browser",
        "click_element": "Click on page element",
        "type_text": "Type text into input",
        "screenshot": "Capture screenshot",

        # Context
        "ask_user": "Ask the user a question",
        "search_web": "Search the web for information"
    }

    2.3 Edit Tool Design

    class EditFileTool(Tool):
        """
        Precise file editing with conflict detection.
        Uses search/replace pattern for reliable edits.
        """

        name = "edit_file"
        description = "Edit a file by replacing specific content"

        def execute(
            self,
            path: str,
            search: str,
            replace: str,
            expected_occurrences: int = 1
        ) -> ToolResult:
            """Replace every occurrence of ``search`` in the file at ``path``.

            Args:
                path: File to edit
                search: Exact text to find (must match exactly, including whitespace)
                replace: Text to replace with
                expected_occurrences: How many times search should appear (validation)

            Returns:
                A successful ``ToolResult`` reporting the number of
                replacements, or a failed one when the file is missing, the
                search text is empty or absent, or the occurrence count does
                not match. The file is left untouched on failure.
            """
            # str.count("") reports len(content) + 1 matches and replacing ""
            # would insert text between every character, so reject it.
            if not search:
                return ToolResult(
                    success=False,
                    error="Search text must not be empty"
                )

            try:
                with open(path, 'r') as f:
                    content = f.read()
            except FileNotFoundError:
                return ToolResult(
                    success=False,
                    error=f"File not found: {path}"
                )

            # Validate. The "not found" case is tested first: checked after
            # the count comparison it could only be reached when the caller
            # expected zero occurrences.
            actual_occurrences = content.count(search)
            if actual_occurrences == 0:
                return ToolResult(
                    success=False,
                    error="Search text not found in file"
                )

            if actual_occurrences != expected_occurrences:
                return ToolResult(
                    success=False,
                    error=f"Expected {expected_occurrences} occurrences, found {actual_occurrences}"
                )

            # Apply edit
            new_content = content.replace(search, replace)

            with open(path, 'w') as f:
                f.write(new_content)

            return ToolResult(
                success=True,
                output=f"Replaced {actual_occurrences} occurrence(s)"
            )


    3. Permission & Safety Patterns

    3.1 Permission Levels

    class PermissionLevel(Enum):
        """How much user approval a tool call needs before it may run."""

        AUTO = "auto"          # runs with no user approval at all
        ASK_ONCE = "ask_once"  # user is asked once per session
        ASK_EACH = "ask_each"  # user is asked on every call
        NEVER = "never"        # the call is never allowed

    # Permission level assigned to each tool name, ordered from least to
    # most risky.
    PERMISSION_CONFIG = {
        # Low risk - can auto-approve
        "read_file": PermissionLevel.AUTO,
        "list_directory": PermissionLevel.AUTO,
        "search_code": PermissionLevel.AUTO,

        # Medium risk - ask once
        "write_file": PermissionLevel.ASK_ONCE,
        "edit_file": PermissionLevel.ASK_ONCE,

        # High risk - ask each time
        "run_command": PermissionLevel.ASK_EACH,
        "delete_file": PermissionLevel.ASK_EACH,

        # Dangerous - never allowed (NEVER means the call is refused, not
        # merely that it needs approval)
        "sudo_command": PermissionLevel.NEVER,
        "format_disk": PermissionLevel.NEVER
    }

    3.2 Approval UI Pattern

    class ApprovalManager:
        """Decide whether a tool call may run, asking the user when required.

        Attributes:
            ui: Object providing ``show_error`` and ``show_approval_dialog``.
            config: Mapping of tool name to ``PermissionLevel``.
            session_approvals: Remembered dialog answers for ASK_ONCE tools,
                keyed by tool name.
        """

        def __init__(self, ui, config):
            self.ui = ui
            self.config = config
            self.session_approvals = {}

        def request_approval(self, tool_name: str, args: dict) -> bool:
            """Return True when ``tool_name`` may be executed with ``args``.

            Tools missing from the config are treated as ASK_EACH. For
            ASK_ONCE tools the first dialog answer, whether approval or
            refusal, is stored and reused for later calls.
            """
            policy = self.config.get(tool_name, PermissionLevel.ASK_EACH)

            if policy == PermissionLevel.AUTO:
                return True

            if policy == PermissionLevel.NEVER:
                self.ui.show_error(f"Tool '{tool_name}' is not allowed")
                return False

            remember_answer = policy == PermissionLevel.ASK_ONCE
            if remember_answer and tool_name in self.session_approvals:
                return self.session_approvals[tool_name]

            # Show approval dialog
            risk = self._assess_risk(tool_name, args)
            decision = self.ui.show_approval_dialog(
                tool=tool_name,
                args=args,
                risk_level=risk
            )

            if remember_answer:
                self.session_approvals[tool_name] = decision

            return decision

        def _assess_risk(self, tool_name: str, args: dict) -> str:
            """Rate one specific call as "HIGH" or "MEDIUM".

            Only ``run_command`` calls can be rated "HIGH", and only when the
            command text contains one of a few dangerous substrings.
            """
            if tool_name != "run_command":
                return "MEDIUM"

            command_text = args.get("command", "")
            for marker in ["rm -rf", "sudo", "chmod"]:
                if marker in command_text:
                    return "HIGH"
            return "MEDIUM"

    3.3 Sandboxing

    class SandboxedExecution:
        """
        Execute allow-listed commands inside a workspace directory.

        NOTE: the isolation here is limited to the command allow-list, the
        working directory and the HOME variable. It is not an OS-level
        sandbox (no container, chroot or seccomp).
        """

        def __init__(self, workspace_dir: str):
            self.workspace = workspace_dir
            self.allowed_commands = ["npm", "python", "node", "git", "ls", "cat"]
            self.blocked_paths = ["/etc", "/usr", "/bin", os.path.expanduser("~")]

        def validate_path(self, path: str) -> bool:
            """Return True only if ``path`` resolves to the workspace or below it.

            Symlinks and ``..`` segments are resolved first. The comparison is
            made on whole path components: a plain string-prefix test would
            also accept siblings such as ``/work-evil`` for workspace ``/work``.
            """
            real_path = os.path.realpath(path)
            workspace_real = os.path.realpath(self.workspace)
            try:
                return os.path.commonpath([real_path, workspace_real]) == workspace_real
            except ValueError:
                # Raised e.g. for paths on different drives: not inside.
                return False

        def validate_command(self, command: str) -> bool:
            """Check if command is allowed.

            The command line is tokenised with ``shlex``; it is allowed when
            its first token is in ``allowed_commands``. Empty or unparsable
            command lines (for example unbalanced quotes) are rejected.
            """
            try:
                cmd_parts = shlex.split(command)
            except ValueError:
                return False
            if not cmd_parts:
                return False

            base_cmd = cmd_parts[0]
            return base_cmd in self.allowed_commands

        def execute_sandboxed(self, command: str) -> ToolResult:
            """Run an allow-listed command in the workspace and capture its output.

            Returns:
                A ``ToolResult`` whose ``success`` reflects a zero exit code.
                Disallowed commands, timeouts and commands that cannot be
                started produce a failed result instead of an exception.
            """
            if not self.validate_command(command):
                return ToolResult(
                    success=False,
                    error=f"Command not allowed: {command}"
                )

            # Run the argv list directly, without a shell. Only the first
            # token was validated; handing the raw string to a shell would let
            # "ls; rm -rf /" or "ls && curl ... | sh" run arbitrary commands.
            try:
                result = subprocess.run(
                    shlex.split(command),
                    shell=False,
                    cwd=self.workspace,
                    capture_output=True,
                    timeout=30,
                    env={
                        **os.environ,
                        "HOME": self.workspace,  # Isolate home directory
                    }
                )
            except subprocess.TimeoutExpired:
                return ToolResult(
                    success=False,
                    error=f"Command timed out after 30s: {command}"
                )
            except OSError as exc:
                # e.g. the executable is not installed
                return ToolResult(
                    success=False,
                    error=f"Failed to start command: {exc}"
                )

            return ToolResult(
                success=result.returncode == 0,
                output=result.stdout.decode(),
                error=result.stderr.decode() if result.returncode != 0 else None
            )


    4. Browser Automation

    4.1 Browser Tool Pattern

    class BrowserTool:
        """
        Browser automation for agents using Playwright/Puppeteer.
        Enables visual debugging and web testing.

        The browser is launched lazily on the first ``open_url`` call; the
        other methods report a failed result until a page has been opened.
        """

        def __init__(self, headless: bool = True):
            self.browser = None
            self.page = None
            self.headless = headless

        def _no_page_result(self) -> ToolResult:
            """Build the failure returned when a page action precedes ``open_url``."""
            return ToolResult(
                success=False,
                error="No page is open; call open_url first"
            )

        async def open_url(self, url: str) -> ToolResult:
            """Navigate to URL and return page info"""
            if not self.browser:
                # NOTE(review): `playwright` is presumably an already-started
                # Playwright instance (e.g. `await async_playwright().start()`)
                # provided by the surrounding module - confirm.
                self.browser = await playwright.chromium.launch(headless=self.headless)
                self.page = await self.browser.new_page()

            await self.page.goto(url)

            # Capture state
            screenshot = await self.page.screenshot(type='png')
            title = await self.page.title()

            return ToolResult(
                success=True,
                output=f"Loaded: {title}",
                metadata={
                    "screenshot": base64.b64encode(screenshot).decode(),
                    "url": self.page.url
                }
            )

        async def click(self, selector: str) -> ToolResult:
            """Click on an element"""
            # Playwright raises its own TimeoutError class, which is not the
            # builtin, so it has to be imported to be caught.
            from playwright.async_api import TimeoutError as PlaywrightTimeoutError

            if self.page is None:
                return self._no_page_result()

            try:
                await self.page.click(selector, timeout=5000)
                await self.page.wait_for_load_state("networkidle")

                screenshot = await self.page.screenshot()
                return ToolResult(
                    success=True,
                    output=f"Clicked: {selector}",
                    metadata={"screenshot": base64.b64encode(screenshot).decode()}
                )
            except (PlaywrightTimeoutError, TimeoutError):
                return ToolResult(
                    success=False,
                    error=f"Element not found: {selector}"
                )

        async def type_text(self, selector: str, text: str) -> ToolResult:
            """Type text into an input"""
            from playwright.async_api import TimeoutError as PlaywrightTimeoutError

            if self.page is None:
                return self._no_page_result()

            try:
                await self.page.fill(selector, text)
            except (PlaywrightTimeoutError, TimeoutError):
                # Same failure shape as click() when the selector never matches.
                return ToolResult(
                    success=False,
                    error=f"Element not found: {selector}"
                )
            return ToolResult(success=True, output=f"Typed into {selector}")

        async def get_page_content(self) -> ToolResult:
            """Return the page's text nodes, one trimmed node per line.

            The script walks every text node under ``document.body``. It does
            not test visibility, so text inside hidden elements or inline
            ``<script>``/``<style>`` tags is included as well.
            """
            if self.page is None:
                return self._no_page_result()

            # The JavaScript below is runtime text and is kept byte-for-byte.
            content = await self.page.evaluate("""
    () => {
    // Get visible text
    const walker = document.createTreeWalker(
    document.body,
    NodeFilter.SHOW_TEXT,
    null,
    false
    );

    let text = '';
    while (walker.nextNode()) {
    const node = walker.currentNode;
    if (node.textContent.trim()) {
    text += node.textContent.trim() + '\\n';
    }
    }
    return text;
    }
    """)
            return ToolResult(success=True, output=content)

    4.2 Visual Agent Pattern

    class VisualAgent:
        """
        Agent that uses screenshots to understand web pages.
        Can identify elements visually without selectors.

        ``browser`` must expose an open Playwright-style ``page`` attribute;
        both screenshots and mouse clicks go through ``browser.page``.
        """

        def __init__(self, llm, browser):
            self.llm = llm
            self.browser = browser

        async def describe_page(self) -> str:
            """Use vision model to describe current page"""
            # Go through browser.page, like find_and_click does for the mouse;
            # the browser tool itself defines no screenshot() method.
            screenshot = await self.browser.page.screenshot()

            # NOTE(review): the screenshot is passed as raw bytes - confirm
            # whether the LLM client expects base64 text instead.
            response = self.llm.chat([
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Describe this webpage. List all interactive elements you see."},
                        {"type": "image", "data": screenshot}
                    ]
                }
            ])

            return response.content

        async def find_and_click(self, description: str) -> ToolResult:
            """Find element by visual description and click it.

            Returns:
                A successful ``ToolResult`` naming the clicked coordinates, or
                a failed one when the model's reply is not a JSON object with
                ``x`` and ``y`` keys (no click is performed in that case).
            """
            screenshot = await self.browser.page.screenshot()

            # Ask vision model to find element (prompt text kept byte-for-byte)
            response = self.llm.chat([
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"""
    Find the element matching: "{description}"
    Return the approximate coordinates as JSON: {{"x": number, "y": number}}
    """
                        },
                        {"type": "image", "data": screenshot}
                    ]
                }
            ])

            # Model output is untrusted: it may contain prose or code fences
            # around the JSON, or omit a key.
            try:
                coords = json.loads(response.content)
                x, y = coords["x"], coords["y"]
            except (ValueError, KeyError, TypeError) as exc:
                return ToolResult(
                    success=False,
                    error=f"Could not parse coordinates from model response: {exc}"
                )

            await self.browser.page.mouse.click(x, y)

            return ToolResult(success=True, output=f"Clicked at ({x}, {y})")


    5. Context Management

    5.1 Context Injection Patterns

    class ContextManager:
        """
        Manage context provided to the agent.
        Inspired by Cline's @-mention patterns.

        Attributes:
            workspace: Workspace path given at construction.
            context: Ordered list of context items; each is a dict with a
                "type" key of "file", "url" or "diagnostics".
        """

        def __init__(self, workspace: str):
            self.workspace = workspace
            self.context = []

        def add_file(self, path: str) -> None:
            """@file - Add file contents to context"""
            with open(path, 'r') as f:
                content = f.read()

            self.context.append({
                "type": "file",
                "path": path,
                "content": content
            })

        def add_folder(self, path: str, max_files: int = 20) -> None:
            """@folder - Add files found under ``path``, at most ``max_files`` in total.

            The limit counts files across the whole tree, not per directory.
            Files that cannot be read as text are skipped. Names are visited
            in sorted order within each directory.
            """
            added = 0
            for root, _dirs, files in os.walk(path):
                for filename in sorted(files):
                    if added >= max_files:
                        return
                    try:
                        self.add_file(os.path.join(root, filename))
                    except (OSError, UnicodeDecodeError):
                        # Binary or unreadable file: leave it out of the context.
                        continue
                    added += 1

        def add_url(self, url: str) -> None:
            """@url - Fetch and add URL content"""
            # A timeout keeps an unresponsive host from hanging the agent.
            response = requests.get(url, timeout=30)
            content = html_to_markdown(response.text)

            self.context.append({
                "type": "url",
                "url": url,
                "content": content
            })

        def add_problems(self, diagnostics: list) -> None:
            """@problems - Add IDE diagnostics"""
            self.context.append({
                "type": "diagnostics",
                "problems": diagnostics
            })

        def format_for_prompt(self) -> str:
            """Format all context for LLM prompt.

            Items are rendered in insertion order and separated by blank
            lines. File contents are wrapped in a Markdown code fence.
            """
            parts = []
            for item in self.context:
                if item["type"] == "file":
                    parts.append(f"## File: {item['path']}\n```\n{item['content']}\n```")
                elif item["type"] == "url":
                    parts.append(f"## URL: {item['url']}\n{item['content']}")
                elif item["type"] == "diagnostics":
                    parts.append(f"## Problems:\n{json.dumps(item['problems'], indent=2)}")

            return "\n\n".join(parts)

    5.2 Checkpoint/Resume

    class CheckpointManager:
        """
        Save and restore agent state for long-running tasks.

        Checkpoints are JSON files named ``<session_id>.json`` inside
        ``storage_dir``, which is created if it does not exist.
        """

        def __init__(self, storage_dir: str):
            self.storage_dir = storage_dir
            os.makedirs(storage_dir, exist_ok=True)

        def save_checkpoint(self, session_id: str, state: dict) -> str:
            """Save current agent state.

            Args:
                session_id: Used as the checkpoint file name.
                state: Must contain "history", "context" and "workspace";
                    "metadata" is optional. All values must be JSON-serialisable.

            Returns:
                Path of the checkpoint file that was written.
            """
            checkpoint = {
                "timestamp": datetime.now().isoformat(),
                "session_id": session_id,
                "history": state["history"],
                "context": state["context"],
                "workspace_state": self._capture_workspace(state["workspace"]),
                "metadata": state.get("metadata", {})
            }

            path = os.path.join(self.storage_dir, f"{session_id}.json")
            with open(path, 'w') as f:
                json.dump(checkpoint, f, indent=2)

            return path

        def restore_checkpoint(self, checkpoint_path: str) -> dict:
            """Restore agent state from checkpoint.

            Returns:
                Dict with "history", "context", "workspace" and "metadata".
            """
            with open(checkpoint_path, 'r') as f:
                checkpoint = json.load(f)

            return {
                "history": checkpoint["history"],
                "context": checkpoint["context"],
                "workspace": self._restore_workspace(checkpoint["workspace_state"]),
                "metadata": checkpoint["metadata"]
            }

        def _capture_workspace(self, workspace: str) -> dict:
            """Capture relevant workspace state"""
            # Git status, file hashes, etc.
            return {
                "git_ref": self._git_output(workspace, "rev-parse", "HEAD"),
                "git_dirty": self._git_output(workspace, "status", "--porcelain")
            }

        def _restore_workspace(self, workspace_state: dict) -> dict:
            """Return the recorded workspace state.

            This default does not modify the working tree; it hands back what
            ``_capture_workspace`` stored so the caller can decide how to
            reconcile it (for example by checking out ``git_ref``).
            """
            return workspace_state

        @staticmethod
        def _git_output(workspace: str, *args: str) -> str:
            """Run ``git <args>`` in ``workspace`` and return its combined output.

            The command is given as an argument list with ``cwd`` set, so the
            workspace path is never interpreted by a shell. stderr is merged
            into stdout and one trailing newline is removed. If the process
            cannot be started the error text is returned instead.
            """
            try:
                proc = subprocess.run(
                    ["git", *args],
                    cwd=workspace,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    text=True
                )
            except OSError as exc:
                return str(exc)

            output = proc.stdout
            return output[:-1] if output.endswith("\n") else output

    ---

    6. MCP (Model Context Protocol) Integration

    6.1 MCP Server Pattern

    from mcp import Server, Tool

    class MCPAgent:
        """
        Agent that can dynamically discover and use MCP tools.
        'Add a tool that...' pattern from Cline.

        Attributes:
            llm: Client exposing ``generate(prompt)``, used to write server code.
            mcp_servers: Connected servers keyed by name.
            available_tools: Discovered tools keyed by tool name; each entry
                records the owning server name and the tool schema.
        """

        def __init__(self, llm):
            self.llm = llm
            self.mcp_servers = {}
            self.available_tools = {}

        def connect_server(self, name: str, config: dict) -> None:
            """Connect to an MCP server"""
            # NOTE(review): assumes Server(config) and server.list_tools() match
            # the installed mcp package - verify against its API.
            server = Server(config)
            self.mcp_servers[name] = server

            # Discover tools
            tools = server.list_tools()
            for tool in tools:
                self.available_tools[tool.name] = {
                    "server": name,
                    "schema": tool.schema
                }

        async def create_tool(self, description: str) -> str:
            """
            Create a new MCP server based on user description.
            'Add a tool that fetches Jira tickets'

            The generated code is written to
            ``./mcp_servers/<server_name>/server.py`` and the server is then
            connected. Returns a confirmation message naming the server.
            """
            import os

            # Generate MCP server code (prompt text kept byte-for-byte)
            code = self.llm.generate(f"""
    Create a Python MCP server with a tool that does:
    {description}

    Use the FastMCP framework. Include proper error handling.
    Return only the Python code.
    """)

            # Save and install
            server_name = self._extract_name(description)
            path = f"./mcp_servers/{server_name}/server.py"

            # The per-server directory does not exist yet for a new tool.
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, 'w') as f:
                f.write(code)

            # SECURITY: `code` is unreviewed LLM output. Connecting the server
            # may execute it; gate this step behind user approval and/or a
            # sandbox in any real deployment.
            # Hot-reload
            self.connect_server(server_name, {"path": path})

            return f"Created tool: {server_name}"

        def _extract_name(self, description: str) -> str:
            """Derive a filesystem-safe server name from a free-text description.

            Lower-cases the text, collapses every run of characters other than
            ``a-z0-9`` into one underscore and caps the length at 40
            characters. Falls back to "tool" when nothing usable remains.
            """
            import re

            slug = re.sub(r"[^a-z0-9]+", "_", description.lower()).strip("_")
            return slug[:40].rstrip("_") or "tool"


    Best Practices Checklist

    Agent Design

  • [ ] Clear task decomposition

  • [ ] Appropriate tool granularity

  • [ ] Error handling at each step

  • [ ] Progress visibility to user

    Safety

  • [ ] Permission system implemented

  • [ ] Dangerous operations blocked

  • [ ] Sandbox for untrusted code

  • [ ] Audit logging enabled

    UX

  • [ ] Approval UI is clear

  • [ ] Progress updates provided

  • [ ] Undo/rollback available

  • [ ] Explanation of actions

    Resources

  • Cline

  • OpenAI Codex

  • Model Context Protocol

  • Anthropic Tool Use