1 Vision OpenAI

Guide: https://platform.openai.com/docs/guides/vision?lang=python

import os
import openai
from IPython.display import display_markdown
from pyhere import here
from openai import OpenAI

1.1 Online Image

# Ask gpt-4o-mini to describe an image that is referenced by a public URL.
# OpenAI() is called with no arguments, so credentials come from the SDK's
# defaults (presumably the OPENAI_API_KEY environment variable — confirm).
client = OpenAI()

image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

# A single user message carrying one text part and one image_url part.
user_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What’s in this image?"},
        {"type": "image_url", "image_url": {"url": image_url}},
    ],
}

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[user_message],
    max_tokens=300,
)

# Show the whole first Choice object, not just the reply text.
print(response.choices[0])

Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="The image depicts a serene landscape featuring a wooden boardwalk or pathway that extends through a lush green marsh or meadow. On either side of the path, there's an abundance of tall grass and vegetation. The sky above is mostly clear with scattered clouds, suggesting a pleasant day. This scene evokes a sense of tranquility and natural beauty.", refusal=None, role='assistant', function_call=None, tool_calls=None))

# Print only the assistant's text reply from the first choice.
first_choice = response.choices[0]
print(first_choice.message.content)

The image depicts a serene landscape featuring a wooden boardwalk or pathway that extends through a lush green marsh or meadow. On either side of the path, there's an abundance of tall grass and vegetation. The sky above is mostly clear with scattered clouds, suggesting a pleasant day. This scene evokes a sense of tranquility and natural beauty.

1.2 Local Image

1.2.1 Local Image (One)

import base64
import requests

# OpenAI API key, read from the environment (None when the variable is unset)
api_key = os.getenv("OPENAI_API_KEY")

# Function to encode the image
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is opened in binary mode, its bytes are base64-encoded, and the
    encoded bytes are decoded as UTF-8 so the result is a ``str``.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

# Send a local image to the Chat Completions endpoint with plain `requests`.
# The image is embedded in the request body as a base64 data URL.

# Getting the base64 string
base64_image = encode_image(here("img/ocr/slide-ex1.png"))

# Fail early with a clear message instead of sending "Bearer None".
if not api_key:
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}",
}

payload = {
    "model": "gpt-4o-mini",
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What’s in this image?",
                },
                {
                    "type": "image_url",
                    "image_url": {
                        # The encoded file is a PNG, so the data URL must
                        # declare image/png (it previously said image/jpeg).
                        "url": f"data:image/png;base64,{base64_image}",
                    },
                },
            ],
        }
    ],
    "max_tokens": 300,
}

# requests has no default timeout; set one so the cell cannot hang forever
# if the API is unreachable.
response = requests.post(
    "https://api.openai.com/v1/chat/completions",
    headers=headers,
    json=payload,
    timeout=60,
)

# Print the decoded JSON body of the HTTP response.
print(response.json())

{'id': 'chatcmpl-AGU7JT4iHfnFDBs2hPNmzIZYzblSM', 'object': 'chat.completion', 'created': 1728492381, 'model': 'gpt-4o-mini-2024-07-18', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'The image appears to be a slide related to medical imaging interpretation. It outlines two main categories: "Normal and variations" and "Abnormal." \n\nIn the left section, it discusses:\n\n- **Morphology**: Referring to anatomy and its relationships.\n- **Physiology**: Focused on function.\n\nIn the right section, it addresses aspects of abnormal findings, including:\n\n- **Detection**: Identifying issues.\n- **Characterization**: Examining both morphology and physiology.\n- **Diagnosis/DDx**: Likely referring to diagnosis and differential diagnosis (DDx).\n\nThe bottom part mentions "Imaging Modalities," suggesting that different imaging techniques are being discussed in relation to these concepts.', 'refusal': None}, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 36848, 'completion_tokens': 142, 'total_tokens': 36990, 'prompt_tokens_details': {'cached_tokens': 1024}, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'system_fingerprint': 'fp_74ba47b4ac'}

# Render the assistant's reply (first choice) as Markdown in the notebook.
reply_markdown = response.json()["choices"][0]["message"]["content"]
display_markdown(reply_markdown, raw=True)

The image appears to be a slide related to medical imaging interpretation. It outlines two main categories: “Normal and variations” and “Abnormal.”

In the left section, it discusses:

Morphology: Referring to anatomy and its relationships.
Physiology: Focused on function.

In the right section, it addresses aspects of abnormal findings, including:

Detection: Identifying issues.
Characterization: Examining both morphology and physiology.
Diagnosis/DDx: Likely referring to diagnosis and differential diagnosis (DDx).

The bottom part mentions “Imaging Modalities,” suggesting that different imaging techniques are being discussed in relation to these concepts.

1.2.2 Multiple Images (Online URLs)

from openai import OpenAI

# Send two image_url parts in one user message and ask the model to compare them.
client = OpenAI()

# Both image parts point at the same picture.
boardwalk_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

question_part = {
    "type": "text",
    "text": "What are in these images? Is there any difference between them?",
}
image_parts = [
    {"type": "image_url", "image_url": {"url": boardwalk_url}}
    for _ in range(2)
]

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": [question_part, *image_parts]}],
    max_tokens=300,
)

# Show the whole first Choice object, not just the reply text.
print(response.choices[0])

Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The images you provided appear to be identical, both depicting a scenic view of a grassy landscape with a wooden pathway leading into the distance under a blue sky with some clouds. Since they look exactly the same, there is no observable difference between them.', refusal=None, role='assistant', function_call=None, tool_calls=None))

# Print only the assistant's text comparison of the two images.
comparison_message = response.choices[0].message
print(comparison_message.content)

The images you provided appear to be identical, both depicting a scenic view of a grassy landscape with a wooden pathway leading into the distance under a blue sky with some clouds. Since they look exactly the same, there is no observable difference between them.