A tutorial on using OpenAI's GPT-4o ("omni" model) through the vision and text APIs.
My GPT-4o API Beginner Course
1. Image (multimodal)
2. Summary (Video + Audio) (this lesson)
3. QA (Video + Audio based chat)
This lesson is a hands-on exercise in summarizing video and audio.
GPT 4o API hands-on 1. Multi-modal - Video : Video summarization
- resource
- code
*Video(@AndyHertzfeld): https://www.youtube.com/watch?v=oTtQ0l0ukvQ
*Generate shorts(@ssemble) : https://www.ssemble.com/
- code
# Load environment variables (API key) from .env
from dotenv import load_dotenv
load_dotenv()
# Load required libraries
from openai import OpenAI
import os
import cv2
from moviepy.editor import VideoFileClip
import base64
# Model name and input video location used by this example
MODEL = "gpt-4o"
VIDEO_PATH = "resource/Macintosh_Team_Interview.mp4"
# Create the OpenAI API client (credentials come from the environment loaded above)
client = OpenAI()
# Define a function that processes the video
# Compute total frame count, then skip frames at the given interval,
# encode each kept frame as a base64 string,
# and extract the video audio into a separate file.
def process_video(video_path, seconds_per_frame=2):
    """Sample frames from a video and extract its audio track.

    Frames are taken at a fixed interval, JPEG-encoded and returned as
    base64 strings. The audio track is written next to the video, using the
    video's path with the extension replaced by ``.mp3``.

    Args:
        video_path: Path to the input video file.
        seconds_per_frame: Interval, in seconds of video, between two
            sampled frames.

    Returns:
        A tuple ``(base64Frames, audio_path)`` where ``base64Frames`` is a
        list of base64-encoded JPEG frames and ``audio_path`` is the path of
        the written audio file, or ``None`` when the video has no audio track.
    """
    base64Frames = []  # Sampled frames, each encoded as a base64 string
    base_video_path, _ = os.path.splitext(video_path)

    video = cv2.VideoCapture(video_path)
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = video.get(cv2.CAP_PROP_FPS)
        # Always advance by at least one frame: when fps is reported as 0 or
        # the interval is tiny/non-positive, a step of 0 would loop forever.
        frames_to_skip = max(1, int(fps * seconds_per_frame))
        curr_frame = 0

        # Jump through the video at the given interval and encode each frame
        while curr_frame < total_frames - 1:
            video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
            success, frame = video.read()
            if not success:
                break
            _, buffer = cv2.imencode(".jpg", frame)
            base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
            curr_frame += frames_to_skip
    finally:
        # Release the capture even if reading or encoding fails
        video.release()

    # Extract the audio from the video into a separate file.
    audio_path = f"{base_video_path}.mp3"
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            # The video has no audio track, so there is nothing to write
            audio_path = None
        else:
            clip.audio.write_audiofile(audio_path, bitrate="32k")
    finally:
        # Release the clip's resources even if writing the audio fails
        if clip.audio is not None:
            clip.audio.close()
        clip.close()

    print(f"Extracted {len(base64Frames)} frames")
    if audio_path is None:
        print("No audio track found in the video")
    else:
        print(f"Extracted audio to {audio_path}")
    return base64Frames, audio_path
# Process the video: sample one frame per second and extract the audio track
base64Frames, audio_path = process_video(VIDEO_PATH, seconds_per_frame=1)

# Wrap each sampled frame as an 'image_url' content part carrying a base64
# data URL. The frames are JPEG-encoded, so the MIME type is image/jpeg.
frame_parts = [
    {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{frame}", "detail": "low"},
    }
    for frame in base64Frames
]

# Request a video summary through the chat completions API;
# the call returns a response object
response = client.chat.completions.create(
    # The LLM model that generates the response
    model=MODEL,
    messages=[
        # System message: instructs the model to summarize the video
        {"role": "system", "content": "You are generating a video summary. Please provide a summary of the video. Respond in Markdown."},
        # User message: a typed text part followed by one image part per frame
        {"role": "user", "content": [
            {"type": "text", "text": "These are the frames from the video."},
            *frame_parts,
        ]},
    ],
    # Controls randomness (0~2); lower = more consistent, higher = more varied
    temperature=0,
)

# Print the generated summary
print(response.choices[0].message.content)
- result
GPT 4o API hands-on 2. Multi-modal - Audio : Audio summarization
- resource
- code
*The audio file (.mp3) extracted from the video (@AndyHertzfeld) by the code in hands-on 1 above.
- code
# Load environment variables (API key) from .env
from dotenv import load_dotenv
load_dotenv()
# Load required libraries
from openai import OpenAI
# Initialize the OpenAI client and model
client = OpenAI()
MODEL = "gpt-4o"
# Audio file to transcribe
audio_path = "resource/Macintosh_Team_Interview.mp3"
# Use the 'whisper-1' model to transcribe the audio file into text.
# The file is opened in a 'with' block so the handle is closed once the
# request has finished (or failed).
with open(audio_path, "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
    )
# Build the two messages for the summary request:
# - the system message tells the model what to do (summarize the transcript),
# - the user message carries the transcribed text to act on.
system_message = {
    "role": "system",
    "content": "You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown.",
}
user_message = {
    "role": "user",
    "content": [
        {"type": "text", "text": f"The audio transcription is: {transcription.text}"},
    ],
}

# Ask the model for a summary of the transcription.
# temperature=0 keeps the output as consistent as possible between runs.
response = client.chat.completions.create(
    model=MODEL,
    messages=[system_message, user_message],
    temperature=0,
)

# Print the summary text of the first choice in the response.
print(response.choices[0].message.content)
- result
GPT 4o API hands-on 3. Multi-modal - Audio + Video : Combined summarization
- resource
*The Audio and Video files used in the hands-on examples above.
- code
# Load environment variables (API key) from .env
from dotenv import load_dotenv
load_dotenv()
# Load required libraries
from openai import OpenAI
from C04_Summary_Video import base64Frames
from C05_Summary_Audio import transcription
# Initialize the OpenAI client and model
client = OpenAI()
MODEL = "gpt-4o"

# Wrap each video frame as an 'image_url' content part carrying a base64
# data URL (image/jpeg is the standard MIME type for JPEG data).
frame_parts = [
    {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{frame}", "detail": "low"},
    }
    for frame in base64Frames
]

# client.chat.completions.create sends the query to the model and returns the result.
response = client.chat.completions.create(
    # Name of the model to use
    model=MODEL,
    messages=[
        # The system message tells the model what the task is
        {"role": "system", "content": """You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown"""},
        # The user message provides the input: an introductory text part,
        # one image part per frame, then the audio transcription as text
        {"role": "user", "content": [
            {"type": "text", "text": "These are the frames from the video."},
            *frame_parts,
            {"type": "text", "text": f"The audio transcription is: {transcription.text}"},
        ]},
    ],
    # Lower temperature makes the output more consistent and predictable
    temperature=0,
)

# Print the summary (the first choice in the response). The header uses real
# newline escapes so it is not printed as literal backslash-n characters.
print(f"\n\nAudio + Visual Summary:\n{response.choices[0].message.content}")
- result

