Мультимодальные AI-агенты: зрение + текст

Агенты, понимающие изображения. Скриншоты → код, vision RAG, Playwright.

📊 Продвинутый ⏱ 14 мин

# 1. АНАЛИЗ ИЗОБРАЖЕНИЙ ЧЕРЕЗ CLAUDE VISION

import anthropic
import base64
from pathlib import Path

# Module-level Anthropic client shared by analyze_image() below.
# NOTE(review): no arguments are passed, so the API key and other settings
# presumably come from the SDK defaults/environment — confirm for deployment.
client = anthropic.Anthropic()

def encode_image(path):
    """Return the full contents of the file at ``path`` as base64 text.

    The file is read in binary mode, base64-encoded, and the resulting
    bytes are decoded to ``str`` so they can be embedded in a JSON payload.
    """
    with open(path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode()

def analyze_image(image_path, prompt):
    """Send one local image plus a text prompt to Claude and return the reply.

    Args:
        image_path: Path to the image file. Its suffix (case-insensitive)
            selects the media type declared in the request.
        prompt: Text instruction placed after the image in the same user turn.

    Returns:
        The object returned by ``client.messages.create``. Callers in this
        file read the answer via ``.content[0].text``.
    """
    # The declared media type has to describe the actual file format, so map
    # every common suffix explicitly. Previously ".jpeg" and ".gif" files fell
    # through to the PNG default and were sent with the wrong label.
    media_types = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    suffix = Path(image_path).suffix.lower()
    # Unknown suffixes keep the original fallback of PNG.
    media_type = media_types.get(suffix, "image/png")

    return client.messages.create(
        model="claude-sonnet-4-20250514", max_tokens=1024,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image", "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": encode_image(image_path),
                }},
                {"type": "text", "text": prompt},
            ]
        }]
    )

# Demo: ask the model to inventory the UI elements visible in a screenshot.
result = analyze_image(
    image_path="screenshot.png",
    prompt="Describe every UI element: buttons, inputs, dropdowns. List their positions.",
)
print(result.content[0].text)

# 2. СКРИНШОТ → PLAYWRIGHT КОД (ЧЕРЕЗ CLAUDE VISION)

def screenshot_to_playwright(screenshot_path, goal):
    """Ask the vision model for Playwright Python code that achieves ``goal``.

    The screenshot at ``screenshot_path`` is sent through ``analyze_image``
    together with an instruction prompt; the text of the first content item
    of the reply is returned unchanged.
    """
    prompt_parts = [
        f"Goal: {goal}.\n",
        "Write Playwright Python code to achieve this.\n",
        "Use selectors based on visible text/aria-labels.\n",
        "Return ONLY Python code, no explanation.",
    ]
    reply = analyze_image(screenshot_path, "".join(prompt_parts))
    first_item = reply.content[0]
    return first_item.text

# Demo: generate Playwright code for a login flow from a page screenshot.
code = screenshot_to_playwright(
    screenshot_path="login_page.png",
    goal="Fill username 'admin', password 'secret', click Login button",
)
print("Generated Playwright code:\n", code)

# 3. МУЛЬТИМОДАЛЬНЫЙ RAG — ИЗОБРАЖЕНИЯ КАК КОНТЕКСТ

import openai

# Module-level OpenAI client used by multimodal_rag() below.
# NOTE(review): constructed without arguments, so credentials presumably come
# from the SDK defaults/environment — confirm for deployment.
oai = openai.OpenAI()

def multimodal_rag(question, images, text_context=""):
    """Answer ``question`` using text context plus one or more local images.

    Args:
        question: The question to ask the model.
        images: Iterable of image file paths; each is attached to the request
            as a base64 data URL with ``detail`` set to ``"high"``.
        text_context: Optional background text placed before the question.

    Returns:
        The object returned by ``oai.chat.completions.create``. Callers in
        this file read the answer via ``.choices[0].message.content``.
    """
    # Label each data URL with the image's real type instead of always
    # claiming PNG, matching the suffix-based detection in analyze_image().
    mime_by_suffix = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }

    content = [{"type": "text", "text":
        f"Context: {text_context}\nQuestion: {question}"}]

    for img_path in images:
        # Unknown suffixes keep the previous behaviour (PNG).
        mime = mime_by_suffix.get(Path(img_path).suffix.lower(), "image/png")
        b64 = encode_image(img_path)
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:{mime};base64,{b64}", "detail": "high"},
        })

    return oai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        max_tokens=500,
    )

# Demo: compare two quarterly chart images against their stated targets.
chart_images = ["q1_chart.png", "q2_chart.png"]
response = multimodal_rag(
    question="Compare Q1 and Q2 charts. Which quarter performed better?",
    images=chart_images,
    text_context="Q1 target was 100K, Q2 target 120K. Product: SaaS subscriptions.",
)
print(response.choices[0].message.content)

# 4. АГЕНТ ДЛЯ АВТОМАТИЗАЦИИ UI — ИНТЕГРАЦИЯ С PLAYWRIGHT

from playwright.sync_api import sync_playwright

class VisionAgent:
    """Drive a Chromium page step by step using advice from the vision model.

    Each step saves a screenshot, asks ``analyze_image`` for exactly one
    action in an ``ACTION:/SELECTOR:/VALUE:`` line format, and performs it
    with Playwright. The class can be used as a context manager so the
    browser is closed even when a step raises.
    """

    # Vertical wheel delta, in pixels, applied for one "scroll" action.
    SCROLL_STEP_PX = 600

    def __init__(self):
        """Start Playwright, launch Chromium and open a single page."""
        self.playwright = sync_playwright().start()
        try:
            self.browser = self.playwright.chromium.launch()
            self.page = self.browser.new_page()
        except Exception:
            # Do not leave the Playwright driver running when the browser or
            # page could not be created.
            self.playwright.stop()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False  # never suppress the caller's exception

    @staticmethod
    def _parse_field(reply, field):
        """Return the text after ``<field>:`` on its own line, or "" if absent.

        Matching is case-insensitive on the field name and ignores leading
        whitespace; the first matching line wins.
        """
        prefix = f"{field}:".lower()
        for line in reply.splitlines():
            stripped = line.strip()
            if stripped.lower().startswith(prefix):
                return stripped[len(prefix):].strip()
        return ""

    def act_on_page(self, url, instruction, max_steps=5):
        """Navigate to ``url`` and try to reach ``instruction`` in few steps.

        Args:
            url: Page to open before the first step.
            instruction: Natural-language goal included in every prompt.
            max_steps: Maximum number of screenshot/decide/act iterations.

        Returns:
            True if the model answered ``ACTION: done`` within ``max_steps``,
            otherwise False.
        """
        self.page.goto(url)
        for step in range(max_steps):
            screenshot = "current_state.png"
            self.page.screenshot(path=screenshot, full_page=True)

            # Analyze the screenshot and obtain the next action.
            result = analyze_image(screenshot,
                f"Goal: {instruction}. Step {step+1}.\n"
                "What ONE action should be taken now? Answer with:\n"
                "ACTION: click|type|scroll|done\n"
                "SELECTOR: css_or_text_selector\n"
                "VALUE: text_to_type (if type action)"
            )
            action_text = result.content[0].text
            print(f"Step {step+1}: {action_text}")

            action = self._parse_field(action_text, "ACTION").lower()
            if action.startswith("done"):
                print("Goal achieved!")
                return True
            if action.startswith("scroll"):
                # The prompt offers "scroll"; it used to be silently ignored.
                self.page.mouse.wheel(0, self.SCROLL_STEP_PX)
                continue
            if action.startswith(("click", "type")):
                selector = self._parse_field(action_text, "SELECTOR")
                if not selector:
                    # Without a selector there is nothing to act on; take a
                    # fresh screenshot and ask again on the next iteration.
                    print(f"Step {step+1}: reply has no SELECTOR, skipping")
                    continue
                if action.startswith("click"):
                    self.page.click(selector)
                else:
                    value = self._parse_field(action_text, "VALUE")
                    self.page.fill(selector, value)
        return False

    def close(self):
        """Close the browser and stop Playwright."""
        try:
            self.browser.close()
        finally:
            # Stop the driver even if closing the browser failed.
            self.playwright.stop()

# 5. ПРОДАКШЕН: МУЛЬТИМОДАЛЬНЫЙ RAG-ПАЙПЛАЙН

import json

class DocumentVisionPipeline:
    """Extract text and tables from document screenshots via the vision model.

    Each screenshot path is sent to ``analyze_image`` with a prompt asking
    for a JSON object, and the parsed result is collected per source path.
    """

    def __init__(self):
        # Kept for backward compatibility; the methods below go through the
        # module-level analyze_image() and do not read this attribute.
        self.client = anthropic.Anthropic()

    @staticmethod
    def _strip_code_fence(raw):
        """Return ``raw`` without surrounding whitespace or a ``` fence.

        Handles an optional language tag (e.g. ``json``) on the opening
        fence line. Text that is not fenced is only stripped of whitespace.
        """
        text = raw.strip()
        if len(text) >= 6 and text.startswith("```") and text.endswith("```"):
            inner = text[3:-3]
            first_line, newline, rest = inner.partition("\n")
            tag = first_line.strip()
            # Drop the opening fence line when it is empty or a language tag.
            if newline and (not tag or tag.isalnum()):
                inner = rest
            text = inner.strip()
        return text

    def extract_text_from_screenshot(self, path):
        """Return the model's structured extraction for one screenshot.

        Args:
            path: Path to the screenshot image.

        Returns:
            The value produced by ``json.loads`` on the model's reply.

        Raises:
            json.JSONDecodeError: If the reply is not valid JSON even after
                removing a surrounding markdown code fence.
        """
        # The format example uses real JSON (double quotes); the previous
        # single-quoted example could not be parsed by json.loads if the
        # model reproduced it literally.
        resp = analyze_image(path,
            "Extract ALL visible text. For tables, output as JSON array of rows.\n"
            "Respond with ONLY valid JSON (double-quoted strings, no markdown fences).\n"
            'Format: {"type": "text"|"table", "content": ...}'
        )
        raw = resp.content[0].text
        return json.loads(self._strip_code_fence(raw))

    def process_document(self, screenshots):
        """Extract every screenshot and return one chunk dict per path.

        Each chunk has the keys ``"source"`` (the path) and ``"data"`` (the
        parsed extraction), in the same order as ``screenshots``.
        """
        return [
            {"source": path, "data": self.extract_text_from_screenshot(path)}
            for path in screenshots
        ]

# Demo: run the pipeline over three page screenshots and report the count.
pipeline = DocumentVisionPipeline()
page_screenshots = ["page1.png", "page2.png", "table1.png"]
docs = pipeline.process_document(page_screenshots)
print(f"Extracted {len(docs)} document chunks from screenshots")

🔗 Полезные ссылки

📖 Claude Vision 📖 GPT-4V 🎭 Playwright