import time
from pathlib import Path
def upload_and_wait(client, file_path: str) -> str:
"""Upload a file via File API and return the URI after processing."""
path = Path(file_path)
print(f"Uploading {path.name} ({path.stat().st_size / 1e6:.1f} MB)...")
uploaded = client.files.upload(path=str(path))
# Wait for server-side processing (typically 5-30 seconds)
while uploaded.state.name == "PROCESSING":
time.sleep(2)
uploaded = client.files.get(name=uploaded.name)
if uploaded.state.name == "FAILED":
raise RuntimeError(f"Upload failed: {path.name}")
print(f" -> Done. URI: {uploaded.uri[:50]}...")
return uploaded.uri2. Embedding - Audio, Video, and Multimodal
Understanding the File API
Audio and video files must be uploaded to Google’s servers via the File API before they can be referenced in an embedding request.
- Inline bytes (`from_bytes`): works for files under 20MB.
- File API upload: required for files over 20MB. YouTube Shorts MP4s range from 5-50MB, so the File API is the safe default.
Files uploaded via the File API are automatically deleted after 48 hours. For large-scale batch processing, upload just before embedding and save results immediately.
Upload helper function
Text embedding: Transcript vs. title
Before embedding audio or video, we need a text baseline. There are two options, and the choice matters more than expected.
Title + description (metadata)
result = client.models.embed_content(
model="gemini-embedding-exp-03-07",
contents=f"{title}\n{description}",
config=types.EmbedContentConfig(
task_type="SEMANTIC_SIMILARITY"
)
)
text_title_vec = np.array(result.embeddings[0].values) # (3072,)Full transcript (Whisper)
We use local Whisper transcription (whisper-medium-komixv2-mlx, Korean fine-tuned, MLX GPU-accelerated) to generate speech transcripts for all Shorts. This runs on Apple Silicon at ~2.3 sec/video with zero API cost.
result = client.models.embed_content(
model="gemini-embedding-exp-03-07",
contents=transcript_text,
config=types.EmbedContentConfig(
task_type="SEMANTIC_SIMILARITY"
)
)
text_transcript_vec = np.array(result.embeddings[0].values) # (3072,)Our pilot (Section 5) revealed that title-based embeddings produce an artifactual party signal (+0.124 same-party similarity gap) driven by politician names and party hashtags in YouTube titles. Transcript-based embeddings show no such artifact (-0.018 gap). Use transcripts as the text baseline for any substantive analysis.
Transcript source unification
For the full corpus (51,197 Shorts), we use MLX Whisper transcripts exclusively rather than mixing YouTube auto-captions (available for only ~6-8% of Shorts). This eliminates transcription source as a confound.
| Whisper output | Share | Treatment |
|---|---|---|
| Usable text (50+ characters) | ~74% | Embed as-is |
| Minimal text (<50 characters) | ~18% | Flag; embed with caution |
| No speech detected | ~8% | Exclude from text strategy |
Audio embedding
Extracting audio from MP4
ffmpeg -i input_video.mp4 -vn -acodec libmp3lame -q:a 4 output_audio.mp3

| Flag | Meaning |
|---|---|
| `-vn` | Strip video stream |
| `-acodec libmp3lame` | Encode as MP3 |
| `-q:a 4` | Quality (0 = best, 9 = worst; 4 is sufficient) |
Calling the embedding API
audio_uri = upload_and_wait(client, "output_audio.mp3")
result = client.models.embed_content(
model="gemini-embedding-exp-03-07",
contents=types.Content(
parts=[
types.Part.from_uri(
file_uri=audio_uri,
mime_type="audio/mpeg"
)
]
)
)
audio_vec = np.array(result.embeddings[0].values)
print(f"Audio embedding shape: {audio_vec.shape}") # (3072,)Multimodal embedding (video + audio + text)
Place multiple Part objects inside a single Content to produce one unified embedding.
import subprocess
from pathlib import Path

def embed_short_multimodal(
    client, mp4_path: str, title: str, description: str = ""
) -> np.ndarray:
    """Embed a single YouTube Short using video + audio + text.

    Extracts the audio track to an MP3 next to the MP4 (cached: skipped if it
    already exists), uploads both files via the File API, then requests one
    unified embedding over a text Part, an audio Part, and a video Part.

    Args:
        client: Gemini API client.
        mp4_path: Path to the Short's MP4 file.
        title: Video title (combined with description into the text Part).
        description: Optional video description.

    Returns:
        A (3072,) numpy array with the multimodal embedding.

    Raises:
        subprocess.CalledProcessError: If ffmpeg fails to extract the audio.
        RuntimeError: If a File API upload fails (via upload_and_wait).
    """
    mp4 = Path(mp4_path)
    audio = mp4.with_suffix(".mp3")
    # Extract audio track (quiet, overwrite, MP3 quality level 4)
    if not audio.exists():
        subprocess.run([
            "ffmpeg", "-i", str(mp4),
            "-vn", "-acodec", "libmp3lame", "-q:a", "4",
            str(audio), "-y", "-loglevel", "error"
        ], check=True)
    # Upload both files
    video_uri = upload_and_wait(client, str(mp4))
    audio_uri = upload_and_wait(client, str(audio))
    # Request multimodal embedding: all Parts inside ONE Content
    # so the API returns a single unified vector.
    result = client.models.embed_content(
        model="gemini-embedding-exp-03-07",
        contents=types.Content(
            parts=[
                types.Part(text=f"{title}\n{description}".strip()),
                types.Part.from_uri(file_uri=audio_uri, mime_type="audio/mpeg"),
                types.Part.from_uri(file_uri=video_uri, mime_type="video/mp4"),
            ]
        )
    )
    return np.array(result.embeddings[0].values)  # (3072,)

Example
vec = embed_short_multimodal(
client=client,
mp4_path="./data/pilot_videos/EhkWgomdVSg.mp4",
title="The person to break the establishment! Kim Wooyoung!",
description="#DemocraticParty"
)
print(f"Multimodal embedding: {vec.shape}, norm={np.linalg.norm(vec):.4f}")When an MP4 file is sent as video/mp4, Google processes the video frames. Whether the audio track within the MP4 is also embedded is not documented. The pricing page lists audio and video as separate line items, suggesting they are processed independently.
To ensure speech content is captured, we extract and send the audio track as a separate Part. The pilot study (Section 3) compares video-only, audio-only, and multimodal embeddings to validate this choice empirically.
Video-only embedding
For computing the Visual-Verbal Gap (VVG), we need a pure video embedding with audio stripped. This isolates what the viewer sees from what the speaker says.
# Strip audio track, copy video stream without re-encoding
ffmpeg -i input.mp4 -an -c:v copy video_only.mp4

video_only_uri = upload_and_wait(client, "video_only.mp4")
result = client.models.embed_content(
model="gemini-embedding-exp-03-07",
contents=types.Content(
parts=[
types.Part.from_uri(
file_uri=video_only_uri,
mime_type="video/mp4"
)
]
)
)
video_only_vec = np.array(result.embeddings[0].values) # (3072,)Strategy comparison
| Strategy | Input | What it captures | Cost per Short (45s) |
|---|---|---|---|
| Text (transcript) | Whisper transcript | Verbal content (speech) | ~$0.00001 |
| Text (title) | YouTube title + description | Self-labeling, metadata | ~$0.00001 |
| Audio | MP3 | Speech + tone + music | ~$0.007 |
| Video-only | Muted MP4 | Visual framing, editing, setting | ~$0.043 |
| Multimodal | MP4 + MP3 + title | All channels combined | ~$0.043 |
For speech-heavy Shorts (~70-80% of legislative content), audio and transcript embeddings are nearly redundant (cosine distance 0.11-0.20). They diverge only for meme/music content (0.40-0.43). See Section 5 for details.
Cost estimates: Tiered strategy
Rather than embedding the full corpus with every strategy, we use a tiered approach informed by pilot results.
| Tier | Scope | Strategies | Cost | Purpose |
|---|---|---|---|---|
| 1 | 51,197 Shorts | text_transcript | ~$0.50 | Full-corpus baseline, clustering, regression DV |
| 2 | 2,500 stratified subsample | video_only + multimodal + audio | ~$234 | VVG computation, hypothesis testing |
| 3 | Optional expansion | Depends on Tier 2 results | Variable | Full-corpus video if signal is strong |
This tiered design yields the same analytical power as full-corpus multimodal embedding at roughly 5% of the cost.
Saving embeddings
Embedding generation costs money and takes time. Always save results.
# Save: persist the embedding matrix and its row-aligned metadata together
# so row i of the .npy always matches row i of the CSV.
np.save("embeddings.npy", vectors) # (N, 3072)
df_meta.to_csv("metadata.csv", index=False)
# Load: restores the same (N, 3072) array and its metadata table.
vectors = np.load("embeddings.npy")
df_meta = pd.read_csv("metadata.csv")