Practical Vision Tool-Calling with OpenAI Agents

Vision

Author

Branden Collingsworth

Published

July 14, 2025

Turning an LLM into a reliable image analyst: expose vision tools, and let the model decide when to call each one.

1. Minimal Image-Prep Agent Toolkit

Step	Why	Typical function signature*
Resize / Tile	Keeps token cost predictable and preserves fine detail when tiling	`resize_image(max_dim=2048)` / `tile_image(tile_px=512)`
Zoom / Crop	Cropping + original image boosts VQA accuracy by 1-3 pp on GUI & scene datasets (arXiv, OpenReview)	`crop_region(x1,y1,x2,y2)`
Rotate / Deskew	Skewed scans tank OCR line-segmentation quality (Tesseract OCR)	`deskew_image(angle="auto")`
Contrast / Threshold	CLAHE or Otsu binarisation cuts OCR errors 10-40 % (Stack Overflow, Docparser)	`enhance_contrast(method="clahe")` / `binarize(threshold="otsu")`

*Shown as OpenAI function-calling stubs.

2. Simple Vision Agent (OpenAI Agents SDK)

# vision_agent_sdk.py
# pip install openai openai-agents python-dotenv Pillow opencv-python numpy

import os
import uuid

import cv2
import dotenv
import numpy as np
from agents import Agent, Runner, function_tool, set_default_openai_key
from openai import OpenAI
from PIL import Image

# Read variables from a local .env file (if present) into the process
# environment, then hand the API key to the Agents SDK as its default.
dotenv.load_dotenv()
set_default_openai_key(os.environ.get("OPENAI_API_KEY"))

# Every tool writes its output image into this folder; create it up front.
PROC_DIR = "processed_images"
os.makedirs(PROC_DIR, exist_ok=True)
# ---------------------------------------------------------------------


def _ensure_path(src: str) -> str:
    """Resolve an image reference to a file path on local disk.

    The reference is interpreted as follows:

    * a path that already exists locally is returned unchanged (this check
      comes first so that a local file named ``file-….png`` is not mistaken
      for an OpenAI file ID);
    * an OpenAI ``file-…`` ID is downloaded into *processed_images*;
    * a ``data:`` URI is decoded and written into *processed_images* (the
      agent instructions allow the model to pass one as ``path``);
    * anything else is returned unchanged (assumed to already be a file path).

    Args:
        src: Local path, OpenAI file ID, or ``data:`` URI.

    Returns:
        A path on the local filesystem.

    Raises:
        ValueError: If a ``data:`` URI has no ``,`` between header and payload.
    """
    if os.path.exists(src):
        return src

    if src.startswith("file-"):
        # Handle OpenAI file IDs (e.g. "file-abc123").
        data = client.files.content(src).read()
        stem = "oaid"
    elif src.startswith("data:"):
        # Function-scope imports keep this helper self-contained.
        import base64
        from urllib.parse import unquote_to_bytes

        header, sep, payload = src.partition(",")
        if not sep:
            raise ValueError("Malformed data URI: missing ',' before the payload")
        if header.endswith(";base64"):
            data = base64.b64decode(payload)
        else:
            # Without ";base64" the payload is percent-encoded bytes.
            data = unquote_to_bytes(payload)
        stem = "datauri"
    else:
        return src

    tmp_path = _fname(stem, "src")
    with open(tmp_path, "wb") as f:
        f.write(data)

    return tmp_path


def _fname(src, suffix):
    """Build a fresh ``.png`` output path inside ``PROC_DIR``.

    The file name is ``<stem of src>_<suffix>_<6 random hex chars>.png``;
    the random tag keeps repeated calls for the same source from colliding.
    """
    stem = os.path.splitext(os.path.basename(src))[0]
    tag = uuid.uuid4().hex[:6]
    return os.path.join(PROC_DIR, f"{stem}_{suffix}_{tag}.png")


# ---------------------------------------------------------------------
# ——— basic vision tools ———


@function_tool
def resize_image(path: str, max_dim: int = 2048) -> str:
    """Down-samples the image so the longest edge ≤ max_dim px.

    The aspect ratio is preserved. The result is always written as a new PNG.

    Args:
        path: Image to process: an OpenAI file ID or a local file path.
        max_dim: Maximum length, in pixels, of the longest edge. Must be >= 1.

    Returns:
        Path of the resized PNG.

    Raises:
        ValueError: If max_dim is smaller than 1.
    """
    if max_dim < 1:
        raise ValueError(f"max_dim must be a positive integer, got {max_dim}")

    path = _ensure_path(path)
    out = _fname(path, "rs")
    # The context manager closes the source file handle once the copy is
    # saved; a bare Image.open() would leave it open.
    with Image.open(path) as img:
        img.thumbnail((max_dim, max_dim))
        img.save(out, "PNG")
    return out


@function_tool
def crop_region(path: str, x1: int, y1: int, x2: int, y2: int) -> str:
    """Returns a cropped sub-image defined by (x1, y1) → (x2, y2).

    Coordinates are pixel positions with (x1, y1) the top-left and (x2, y2)
    the bottom-right corner of the box.

    Args:
        path: Image to process: an OpenAI file ID or a local file path.
        x1: Left edge of the box.
        y1: Top edge of the box.
        x2: Right edge of the box; must be greater than x1.
        y2: Bottom edge of the box; must be greater than y1.

    Returns:
        Path of the cropped PNG.

    Raises:
        ValueError: If the box is empty or inverted.
    """
    if x2 <= x1 or y2 <= y1:
        raise ValueError(
            f"Empty or inverted crop box: ({x1}, {y1}) -> ({x2}, {y2}); "
            "need x2 > x1 and y2 > y1"
        )

    path = _ensure_path(path)
    out = _fname(path, "crop")
    # The context manager releases the source file handle after saving.
    with Image.open(path) as img:
        img.crop((x1, y1, x2, y2)).save(out, "PNG")
    return out


@function_tool
def deskew_image(path: str, angle: float | None = None) -> str:
    """Rotates the image by `angle` degrees.  If omitted, auto-detect skew.

    The image is read as grayscale and the result is a grayscale PNG with the
    same width and height as the input. Auto-detection treats pixels darker
    than 250 as content, so it expects dark content on a light background.

    Args:
        path: Image to process: an OpenAI file ID or a local file path.
        angle: Rotation in degrees, positive = counter-clockwise. Leave unset
            to estimate the skew from the image content.

    Returns:
        Path of the rotated PNG.

    Raises:
        ValueError: If the image cannot be read.
    """
    path = _ensure_path(path)
    gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise ValueError(f"Could not read image: {path}")

    if angle is None:
        # np.where yields (row, col) indices, whereas minAreaRect expects
        # (x, y) points, so the columns go first.
        rows, cols = np.where(gray < 250)
        if rows.size == 0:
            # Blank page: nothing to measure, leave the image unrotated.
            angle = 0.0
        else:
            pts = np.column_stack((cols, rows)).astype(np.float32)
            rect_angle = cv2.minAreaRect(pts)[-1]
            # The rectangle's orientation is only defined modulo 90 degrees
            # and its reported range differs between OpenCV releases
            # ([-90, 0) before 4.5.1, (0, 90] after).  Folding into
            # [-45, 45) gives the smallest correcting rotation either way.
            angle = (rect_angle + 45.0) % 90.0 - 45.0

    h, w = gray.shape
    M = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
    rot = cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    out = _fname(path, "deskew")
    cv2.imwrite(out, rot)
    return out


@function_tool
def enhance_contrast(path: str, method: str = "clahe") -> str:
    """CLAHE (default) or global histogram equalisation.

    The image is read as grayscale and the result is a grayscale PNG.

    Args:
        path: Image to process: an OpenAI file ID or a local file path.
        method: "clahe" (case-insensitive) for CLAHE; any other value
            selects global histogram equalisation.

    Returns:
        Path of the contrast-enhanced PNG.

    Raises:
        ValueError: If the image cannot be read.
    """
    path = _ensure_path(path)
    g = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if g is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise ValueError(f"Could not read image: {path}")

    # Normalise so that "CLAHE" or " clahe" does not silently fall through to
    # global equalisation.
    if method.strip().lower() == "clahe":
        g = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(g)
    else:
        g = cv2.equalizeHist(g)

    out = _fname(path, "contrast")
    cv2.imwrite(out, g)
    return out


# ---------------------------------------------------------------------
# ——— agent definition ———
# Shared OpenAI client: _ensure_path() downloads file IDs through it and
# demo() uploads the test image through it.
client = OpenAI()

# The agent owns the four image-prep tools and decides at run time which of
# them, if any, to call before answering.
vision_agent = Agent(
    name="Vision-Prep",
    model="o4-mini-2025-04-16",  # swap for any vision-capable model
    tools=[resize_image, crop_region, deskew_image, enhance_contrast],
    instructions=(
        "You answer questions about images. "
        "When you call a tool, ALWAYS pass the actual file_id (or data-URI) "
        "of the image as the 'path' argument - never the placeholder 'input_image'. "
        "Decide whether resizing, cropping, deskewing, or contrast enhancement "
        "will help first; call those tool(s), then reply."
    ),
)


# ---------------------------------------------------------------------
# ——— quick demo ———
def demo(img_path: str = "production_use_corn_us.jpg"):
    """Quick test: upload image via OpenAI files API and reference file_id.

    Uploads the image, sends it to ``vision_agent`` together with a text
    prompt that repeats the file ID (so the model can pass it to the tools),
    then prints the final answer and the items produced during the run.

    Args:
        img_path: Local image to upload. Defaults to the sample chart used in
            the article.
    """
    # Upload once per run; purpose "vision" accepted for image inputs
    with open(img_path, "rb") as f:
        file_id = client.files.create(file=f, purpose="vision").id

    user_msg = {
        "role": "user",
        "content": [
            {"type": "input_image", "file_id": file_id, "detail": "high"},
            {
                "type": "input_text",
                "text": f"Here is the image (file_id={file_id}). "
                "Interpret the chart; call the tools to help test them.",
            },
        ],
    }

    run = Runner.run_sync(
        vision_agent,
        [user_msg],
    )

    print(run.final_output)
    print("New items:", run.new_items)


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    demo()

The LLM now chooses—at runtime—whether to resize first, crop in on a detail such as a chart total, or deskew a scan before reading it. Tool outputs are fed straight back into the model via the SDK. (OpenAI GitHub, OpenAI Platform)

3. Advanced Use Cases

OCR - Pre-process (deskew → denoise → contrast) before calling Tesseract, EasyOCR, or GPT-V’s native OCR for long documents. Maintain hOCR/TSV layouts for structured extraction. (Tesseract OCR, Docparser)
Segmentation / Masking - Feed Segment Anything (SAM) masks into follow-up prompts to isolate parts of complex scenes; zero-shot performance often matches fully-supervised baselines. (arXiv)
Smart Cropping - Frameworks like ViCrop and FOCUS auto-localise salient patches then combine “full + crop” views for sharper answers on fine-detail questions. (arXiv, arXiv)
Other models - Grounding DINO for phrase-conditioned boxes, LaVIN/LLava-Zoom for question-guided zoom, Stable SAM for generative inpainting.

4. Quality Control & Pitfalls

Keep context: always send the original and the crop; over-cropping hides clues.
PNG for text: JPEG artefacts degrade OCR.
Token budgeting: pre-resize yourself; don’t rely on server heuristics.
Audit: store both pre- and post-tool images; spot-check every failure and a random 5 % of successes.

With these image tools, AI agents can give more dependable answers about images than any “raw image in, hope for the best” approach.