Агенты, понимающие изображения. Скриншоты → код, vision RAG, Playwright.
import base64
from pathlib import Path

import anthropic

client = anthropic.Anthropic()

# File extension -> MIME type sent with the image block. ".jpeg" and ".gif"
# are listed explicitly so those files are no longer mislabelled as PNG.
_MEDIA_TYPES = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".webp": "image/webp",
}


def encode_image(path):
    """Return the contents of the file at *path* as a base64 text string."""
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode()


def analyze_image(image_path, prompt):
    """Send one image together with a text prompt and return the response.

    The media type is chosen from the (case-insensitive) file extension of
    *image_path*; extensions that are not in ``_MEDIA_TYPES`` fall back to
    ``image/png``.

    Args:
        image_path: Path of the image file to upload.
        prompt: Text instruction placed after the image in the user message.

    Returns:
        The object returned by ``client.messages.create``.
    """
    ext = Path(image_path).suffix.lower()
    media_type = _MEDIA_TYPES.get(ext, "image/png")
    return client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": encode_image(image_path),
                    },
                },
                {"type": "text", "text": prompt},
            ],
        }],
    )


result = analyze_image(
    "screenshot.png",
    "Describe every UI element: buttons, inputs, dropdowns. "
    "List their positions.",
)
print(result.content[0].text)
def screenshot_to_playwright(screenshot_path, goal):
    """Ask the vision model for Playwright code that accomplishes *goal*.

    The screenshot at *screenshot_path* and an instruction prompt built from
    *goal* are passed to ``analyze_image``. The text of the first content
    block of the reply is returned as-is.
    """
    prompt = (
        f"Goal: {goal}.\n"
        "Write Playwright Python code to achieve this.\n"
        "Use selectors based on visible text/aria-labels.\n"
        "Return ONLY Python code, no explanation."
    )
    reply = analyze_image(screenshot_path, prompt)
    first_block = reply.content[0]
    return first_block.text


code = screenshot_to_playwright(
    "login_page.png",
    "Fill username 'admin', password 'secret', click Login button",
)
print("Generated Playwright code:\n", code)
import openai

oai = openai.OpenAI()

# File extension -> MIME type used in the data URL of each attached image.
_IMAGE_MIME_TYPES = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".webp": "image/webp",
}


def multimodal_rag(question, images, text_context=""):
    """Answer *question* using several images plus optional text context.

    A single user message is built: first a text part holding the context
    and the question, then one ``image_url`` part per entry of *images*,
    each embedded as a base64 data URL with ``detail`` set to ``"high"``.

    The MIME type in each data URL is taken from the file extension
    (case-insensitive) instead of always being ``image/png``; unknown
    extensions still fall back to ``image/png``.

    Args:
        question: The question to ask about the images.
        images: Iterable of image file paths.
        text_context: Extra text placed before the question.

    Returns:
        The object returned by ``oai.chat.completions.create``.
    """
    content = [{
        "type": "text",
        "text": f"Context: {text_context}\nQuestion: {question}",
    }]
    for img_path in images:
        b64 = encode_image(img_path)
        mime = _IMAGE_MIME_TYPES.get(Path(img_path).suffix.lower(), "image/png")
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:{mime};base64,{b64}", "detail": "high"},
        })
    return oai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        max_tokens=500,
    )


response = multimodal_rag(
    "Compare Q1 and Q2 charts. Which quarter performed better?",
    images=["q1_chart.png", "q2_chart.png"],
    text_context="Q1 target was 100K, Q2 target 120K. Product: SaaS subscriptions.",
)
print(response.choices[0].message.content)
from playwright.sync_api import sync_playwright


class VisionAgent:
    """Browser agent that chooses each action from a screenshot of the page.

    Creating an instance starts Playwright, launches Chromium and opens one
    page. Call ``close()`` when finished, or use the instance as a context
    manager so the browser is shut down automatically.
    """

    # Vertical distance, in pixels, scrolled for one "scroll" action.
    SCROLL_STEP_PX = 800

    def __init__(self):
        self.playwright = sync_playwright().start()
        try:
            self.browser = self.playwright.chromium.launch()
            self.page = self.browser.new_page()
        except Exception:
            # Do not leave the Playwright driver running when start-up fails.
            self.playwright.stop()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def act_on_page(self, url, instruction, max_steps=5):
        """Open *url* and let the vision model drive the page toward a goal.

        For at most *max_steps* iterations a full-page screenshot is written
        to ``current_state.png`` and passed to ``analyze_image`` with a
        prompt asking for exactly one action. The reply text is printed and
        matched against the markers ``ACTION: done`` (stop), ``ACTION:
        click``, ``ACTION: type`` and ``ACTION: scroll``. A reply containing
        none of them is ignored and the loop moves on to the next step.

        Args:
            url: Address to navigate to before the first step.
            instruction: Goal description inserted into every prompt.
            max_steps: Upper bound on the number of screenshot/action rounds.
        """
        self.page.goto(url)
        screenshot = "current_state.png"  # the same file is reused each step
        for step in range(max_steps):
            self.page.screenshot(path=screenshot, full_page=True)
            # Analyze the screenshot and get the next action.
            result = analyze_image(
                screenshot,
                f"Goal: {instruction}. Step {step+1}.\n"
                "What ONE action should be taken now? Answer with:\n"
                "ACTION: click|type|scroll|done\n"
                "SELECTOR: css_or_text_selector\n"
                "VALUE: text_to_type (if type action)",
            )
            action_text = result.content[0].text
            print(f"Step {step+1}: {action_text}")
            # NOTE(review): extract_selector / extract_value are not defined
            # in the visible part of this file - confirm they exist elsewhere.
            if "ACTION: done" in action_text:
                print("Goal achieved!")
                break
            elif "ACTION: click" in action_text:
                self.page.click(extract_selector(action_text))
            elif "ACTION: type" in action_text:
                selector = extract_selector(action_text)
                value = extract_value(action_text)
                self.page.fill(selector, value)
            elif "ACTION: scroll" in action_text:
                # The prompt offers "scroll", which used to be silently
                # dropped; scroll the viewport down by a fixed step.
                self.page.mouse.wheel(0, self.SCROLL_STEP_PX)

    def close(self):
        """Close the browser and stop Playwright."""
        try:
            self.browser.close()
        finally:
            # Stop the driver even when closing the browser raised.
            self.playwright.stop()
import ast
import json


class DocumentVisionPipeline:
    """Process PDF page screenshots: extract the text shown in the images."""

    def __init__(self):
        # Kept as a public attribute; the extraction itself goes through
        # analyze_image and does not read this client.
        self.client = anthropic.Anthropic()

    @staticmethod
    def _parse_reply(raw):
        """Turn the model's reply text into a Python object.

        The reply is parsed as JSON after removing a surrounding Markdown
        code fence, if present. Because the prompt shows a single-quoted
        format, a reply that is not valid JSON is retried as a Python
        literal. If that fails as well, the original ``json.JSONDecodeError``
        is raised.
        """
        text = raw.strip()
        if text.startswith("```"):
            # Drop the opening fence line (``` or ```json) ...
            text = text.partition("\n")[2].rstrip()
            # ... and the closing fence.
            if text.endswith("```"):
                text = text[:-3]
        try:
            return json.loads(text)
        except json.JSONDecodeError as err:
            try:
                # literal_eval only accepts literals; it never executes code.
                return ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError,
                    MemoryError, RecursionError):
                raise err

    def extract_text_from_screenshot(self, path):
        """Return the parsed extraction result for the screenshot at *path*.

        Raises:
            json.JSONDecodeError: If the reply cannot be parsed.
        """
        resp = analyze_image(
            path,
            "Extract ALL visible text. For tables, output as JSON array of rows.\n"
            "Format: {'type':'text'|'table', 'content': ...}",
        )
        return self._parse_reply(resp.content[0].text)

    def process_document(self, screenshots):
        """Extract every screenshot and return a list of chunks.

        Each chunk is a dict with the keys ``"source"`` (the path) and
        ``"data"`` (the parsed extraction result), in input order.
        """
        return [
            {"source": path, "data": self.extract_text_from_screenshot(path)}
            for path in screenshots
        ]


pipeline = DocumentVisionPipeline()
docs = pipeline.process_document(["page1.png", "page2.png", "table1.png"])
print(f"Extracted {len(docs)} document chunks from screenshots")