# Load environment variables (API key) from .env
from dotenv import load_dotenv
load_dotenv() 

# Load required libraries
from openai import OpenAI 
import os
import cv2
from moviepy.editor import VideoFileClip
import base64

# Chat model used for the summary request, and the API client that sends it.
# OpenAI() is created without arguments, so the API key is expected to be
# available from the environment loaded via load_dotenv() above.
MODEL = "gpt-4o"
client = OpenAI()

# Input video to summarize (path is relative to the working directory)
VIDEO_PATH = "resource/Macintosh_Team_Interview.mp4"

# Define a function that processes the video
# Compute total frame count, then skip frames at the given interval,
# encode each kept frame as a base64 string,
# and extract the video audio into a separate file.
def process_video(video_path, seconds_per_frame=2):
    """Sample frames from a video as base64 JPEGs and extract its audio to MP3.

    Frames are read at a fixed stride of ``fps * seconds_per_frame`` frames
    (never less than one frame), JPEG-encoded, and stored as base64 strings.
    The audio track is written next to the video, reusing the video path with
    an ``.mp3`` extension.

    Args:
        video_path: Path of the input video file.
        seconds_per_frame: Interval, in seconds of video, between two sampled
            frames.

    Returns:
        A tuple ``(base64_frames, audio_path)``: the list of base64-encoded
        JPEG strings and the path of the audio file that was written.

    Raises:
        ValueError: If the video has no audio track to extract.
    """
    base64_frames = []  # Sampled frames, each a base64-encoded JPEG string
    base_video_path, _ = os.path.splitext(video_path)

    video = cv2.VideoCapture(video_path)
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = video.get(cv2.CAP_PROP_FPS)
        # Clamp the stride to at least one frame. Without this, a reported fps
        # of 0 or a small seconds_per_frame yields a stride of 0 and the loop
        # would re-read the same frame forever.
        frames_to_skip = max(1, int(fps * seconds_per_frame))

        # Seek to every sampled position; the upper bound (total_frames - 1)
        # matches the original sampling range.
        for frame_index in range(0, total_frames - 1, frames_to_skip):
            video.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            success, frame = video.read()
            if not success:
                break
            encoded, buffer = cv2.imencode(".jpg", frame)
            if not encoded:
                # Skip frames that could not be JPEG-encoded rather than
                # storing an invalid payload.
                continue
            base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
    finally:
        # Release the capture even if reading or encoding raised.
        video.release()

    # Extract the audio from the video into a separate file.
    audio_path = f"{base_video_path}.mp3"
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError(f"No audio track found in {video_path}")
        clip.audio.write_audiofile(audio_path, bitrate="32k")
    finally:
        # Close the audio reader (when present) and the clip on every path.
        if clip.audio is not None:
            clip.audio.close()
        clip.close()

    print(f"Extracted {len(base64_frames)} frames")
    print(f"Extracted audio to {audio_path}")
    return base64_frames, audio_path

# Run the function defined above: sample one frame per second of video and
# extract the audio track.
base64Frames, audio_path = process_video(VIDEO_PATH, seconds_per_frame=1)

# Ask the chat model for a Markdown summary of the sampled frames.
# The call returns a response object.
response = client.chat.completions.create(
    # Model that generates the response
    model=MODEL,
    messages=[
        # System message: instructs the model to summarize the video.
        {
            "role": "system",
            "content": "You are generating a video summary. Please provide a summary of the video. Respond in Markdown.",
        },
        # User message: a list of typed content parts. The introductory
        # sentence is sent as a "text" part (instead of a bare string) so every
        # entry has an explicit type; each frame is sent as an "image_url" part
        # carrying a base64 JPEG data URL.
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "These are the frames from the video."},
                *(
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{frame}",
                            "detail": "low",
                        },
                    }
                    for frame in base64Frames
                ),
            ],
        },
    ],
    # Sampling temperature in the range 0~2; lower values give more consistent
    # output, higher values give more random and creative output.
    temperature=0,
)

# Print the generated summary (first choice of the response)
print(response.choices[0].message.content)

- result

GPT-4o API hands-on 2. Multi-modal - Audio: Audio summarization

- resource

Macintosh_Team_Interview.mp3

0.10MB

- code

*The audio file extracted from the video (@AndyHertzfeld) by the code in hands-on 1 above.

- code

# Load environment variables (API key) from .env
from dotenv import load_dotenv
load_dotenv() 

# Load required libraries
from openai import OpenAI 

# Create the API client and choose the chat model used for the summary.
client = OpenAI()
MODEL = "gpt-4o"

audio_path = "resource/Macintosh_Team_Interview.mp3"

# Use the 'whisper-1' model to transcribe the audio file into text.
# The file is opened in a `with` block so the handle is closed once the
# request has completed (or failed) instead of being left open.
with open(audio_path, "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
    )

# Generate a summary based on the transcribed text.
# The system message tells the model what to do (summarize the transcription);
# the user message supplies the text to act on.
response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {
            "role": "system",
            "content": """You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown.""",
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": f"The audio transcription is: {transcription.text}"}
            ],
        },
    ],
    temperature=0,
)

# Print the summary generated by the model.
# temperature=0 makes the output as consistent as possible between runs; it is
# not a strict guarantee of identical output.
print(response.choices[0].message.content)

- result

GPT-4o API hands-on 3. Multi-modal - Audio + Video: Combined summarization

- resource

*The Audio and Video files used in the hands-on examples above.

- code

# Load environment variables (API key) from .env
from dotenv import load_dotenv
load_dotenv() 

# Load required libraries
from openai import OpenAI 
from C04_Summary_Video import base64Frames
from C05_Summary_Audio import transcription

# Create the API client and choose the chat model used for the summary.
client = OpenAI()
MODEL = "gpt-4o"

# Ask the chat model for one summary that uses both the sampled video frames
# and the audio transcription.
response = client.chat.completions.create(
    # Name of the model to use
    model=MODEL,
    messages=[
        # System message: describes the task and instructs the model.
        {
            "role": "system",
            "content": """You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown""",
        },
        # User message: the input, as a list of typed content parts.
        {
            "role": "user",
            "content": [
                # Introductory sentence, sent as a "text" part (instead of a
                # bare string) so every entry has an explicit type.
                {"type": "text", "text": "These are the frames from the video."},
                # Each frame is delivered as an "image_url" part carrying a
                # base64 JPEG data URL.
                *(
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{frame}",
                            "detail": "low",
                        },
                    }
                    for frame in base64Frames
                ),
                # Finally, the audio transcription is delivered as a "text" part.
                {"type": "text", "text": f"The audio transcription is: {transcription.text}"},
            ],
        },
    ],
    # Lower temperature values make the output more consistent and predictable.
    temperature=0,
)
# Print the summary (the first choice in the response). Real newline escapes
# are used here; the doubled backslashes previously printed a literal "\n".
print("\n\nAudio + Visual Summary:\n" + response.choices[0].message.content)

- result