Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions tools/migrations/26-05-26-a--dedupe-and-unique-user-video.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- Fix: user_video never had a real UNIQUE(user_id, video_id) constraint. The model declared it
-- as a bare `db.UniqueConstraint(...)` expression (a no-op outside __table_args__), so no DDL was
-- ever emitted. As a result, concurrent first-open requests (/user_video + /video_opened fire
-- together on reader load) both INSERTed a row, producing duplicates -- after which
-- UserVideo.find / find_or_create (which used .one()) 500'd with MultipleResultsFound.
--
-- Dedupe existing rows (keep the lowest id per user+video), then add the missing unique key so
-- the race never recurs (the find_or_create rollback-and-requery handler now actually fires).
--
-- Note on cost: with no index yet on (user_id, video_id), the DELETE...JOIN below does a full
-- scan and GROUP BY on user_video, and the ALTER TABLE that follows rewrites the table to add
-- the unique key. For large tables (>1M rows) this will lock the table for a noticeable window;
-- prefer to run during a maintenance window or off-peak.

-- Step 1: delete every duplicate row except the one with the lowest id in its
-- (user_id, video_id) group.
DELETE uv FROM user_video uv
JOIN (
-- d: one row per (user_id, video_id) pair that currently has more than one row,
-- carrying the id of the row to keep.
SELECT user_id, video_id, MIN(id) AS keep_id
FROM user_video
GROUP BY user_id, video_id
HAVING COUNT(*) > 1
) d ON uv.user_id = d.user_id
AND uv.video_id = d.video_id
AND uv.id <> d.keep_id;

-- Step 2: add the unique key that was never emitted.
-- NOTE(review): this statement fails if a new duplicate pair is inserted between
-- step 1 and step 2 -- re-run step 1 in that case.
ALTER TABLE user_video
ADD UNIQUE KEY uq_user_video_user_video (user_id, video_id);
1 change: 1 addition & 0 deletions zeeguu/api/endpoints/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from .listening_sessions import *
from . import user_video
from . import user_watching_session
from . import video_upload
from . import audio_lessons
from . import article_simplification
from . import generated_examples
Expand Down
107 changes: 107 additions & 0 deletions zeeguu/api/endpoints/video_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""Endpoint for sharing a single video to Zeeguu for interactive viewing.

Mirrors article_upload, but for video. The client (browser extension / iOS WKWebView)
extracts the captions from YouTube's authorized player and hands them to us, sidestepping
the server-side caption fetch that YouTube blocks from datacenter IPs. We fetch only
metadata (via the Data API key, which is key-authenticated and not IP-blocked) and create
the Video + Caption rows, then the client opens the existing /user_video reader.
"""
import flask
from flask import request
from sqlalchemy.orm.exc import NoResultFound

from zeeguu.core.model import User, Language
from zeeguu.core.model.video import Video
from zeeguu.core.youtube_api.youtube_api import (
extract_youtube_video_id,
normalize_caption_list,
NO_CAPTIONS_AVAILABLE,
NOT_IN_EXPECTED_LANGUAGE,
DUBBED_AUDIO,
CAPTIONS_TOO_SHORT,
VIDEO_IS_MISSING_DURATION,
)
from zeeguu.api.utils.json_result import json_result
from zeeguu.api.utils.route_wrappers import cross_domain, requires_session

from . import api, db_session


# User-facing explanations for the `broken` codes a Video can carry. Looked up by
# video_upload_create to build the message of its 422 response; codes missing from
# this mapping fall back to a generic message there.
BROKEN_CODE_MESSAGES = {
NO_CAPTIONS_AVAILABLE: "This video has no subtitles yet",
NOT_IN_EXPECTED_LANGUAGE: "Video is not in the requested language",
DUBBED_AUDIO: "Video audio is dubbed; original captions unavailable",
CAPTIONS_TOO_SHORT: "Video captions are too sparse for interactive reading",
VIDEO_IS_MISSING_DURATION: "Video duration is unavailable",
}


def _payload():
    """Return the parsed JSON body and a scalar-field accessor for the current request.

    Returns:
        A ``(data, field)`` tuple.

        ``data`` is the JSON object sent by the client, or an empty dict when the
        body is missing, unparseable, or is valid JSON that is not an object
        (list, string, number, ...).

        ``field(name)`` returns ``data[name]`` when it is present and not ``None``,
        otherwise the form value of the same name (``None`` if absent from both).
    """
    data = request.get_json(silent=True)
    # A syntactically valid body can still be a list/string/number; only a JSON
    # object supports the ``.get`` lookups done below and by the caller, so
    # anything else is treated like an absent body instead of raising.
    if not isinstance(data, dict):
        data = {}

    def field(name):
        value = data.get(name)
        if value is None:
            value = request.form.get(name)
        return value

    return data, field


@api.route("/video_upload/create", methods=["POST"])
@cross_domain
@requires_session
def video_upload_create():
    """Register a user-shared YouTube video, optionally with client-extracted captions.

    Request fields (JSON body, with form fallback for the scalar ones):
        video_unique_key: the YouTube video id; optional when ``url`` is given.
        url: a YouTube URL from which the id is extracted when no key is sent.
        language: language code; must be accepted by ``Language.find``.
        captions: optional list of ``{time_start, time_end, text}`` segments
            (times in ms). Read from the JSON body only.

    Responses:
        200: ``{"video_id": ..., "video_unique_key": ...}``.
        400: missing video id / language, or captions that are not a non-empty
            list of well-formed segments with usable text.
        406: ``Language.find`` raised ``NoResultFound`` for the language code.
        422: ``Video.find_or_create`` returned ``None`` or a video marked broken.
    """
    # NOTE(review): `user` is not used below; the lookup is kept as in the
    # original in case it is relied on for validation -- confirm and drop if not.
    user = User.find_by_id(flask.g.user_id)

    data, field = _payload()

    def text_field(name):
        # JSON scalars may arrive as numbers/booleans/objects. Only strings are
        # meaningful here; anything else is treated as missing rather than
        # crashing on ``.strip()``.
        value = field(name)
        return value.strip() if isinstance(value, str) else ""

    video_unique_key = text_field("video_unique_key")
    if not video_unique_key:
        video_unique_key = extract_youtube_video_id(text_field("url"))
    if not video_unique_key:
        flask.abort(400, "A YouTube url or video_unique_key is required")

    lang_code = text_field("language")
    if not lang_code:
        flask.abort(400, "language required")
    try:
        Language.find(lang_code)
    except NoResultFound:
        flask.abort(406, "Language not supported")

    # Client-extracted caption segments: list of {time_start, time_end, text} (times in ms).
    # Optional -- if absent we fall through to broken=NO_CAPTIONS_AVAILABLE below.
    raw_captions = data.get("captions")
    if raw_captions is not None:
        if not isinstance(raw_captions, list):
            flask.abort(400, "captions must be a list")
        if len(raw_captions) == 0:
            flask.abort(400, "captions list is empty")

    try:
        provided_captions = normalize_caption_list(raw_captions)
    except (AttributeError, KeyError, TypeError, ValueError, OverflowError):
        # Segments come from the client: a non-object entry, a missing time
        # field or a non-numeric time is a bad request, not a server error.
        flask.abort(
            400, "captions must be a list of {time_start, time_end, text} objects"
        )
    if raw_captions is not None and provided_captions is None:
        flask.abort(400, "captions contained no usable text")

    video = Video.find_or_create(
        db_session,
        video_unique_key,
        lang_code,
        upload_index=False,  # share-flow: don't block the redirect on Elasticsearch
        captions=provided_captions,
        enforce_language=False,
        enforce_caption_length=False,
    )

    if video is None:
        flask.abort(422, "Could not fetch video info from YouTube")
    if video.broken:
        message = BROKEN_CODE_MESSAGES.get(
            video.broken, f"Video not usable (broken={video.broken})"
        )
        flask.abort(422, message)

    return json_result(
        {"video_id": video.id, "video_unique_key": video.video_unique_key}
    )
62 changes: 35 additions & 27 deletions zeeguu/core/model/user_video.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging
from datetime import datetime

from sqlalchemy import or_
from sqlalchemy.exc import IntegrityError
from zeeguu.core.model.db import db
from zeeguu.core.model.user import User
from zeeguu.core.model.video import Video
Expand All @@ -9,19 +11,22 @@
from zeeguu.core.model.video_caption_context import VideoCaptionContext
from zeeguu.core.util.encoding import datetime_to_json

logger = logging.getLogger(__name__)


class UserVideo(db.Model):
__tablename__ = "user_video"
table_args = {"mysql_collate": "utf8_bin"}
__table_args__ = (
db.UniqueConstraint("user_id", "video_id"),
{"mysql_collate": "utf8_bin"},
)

id = db.Column(db.Integer, primary_key=True)
user_id = db.Column(db.Integer, db.ForeignKey("user.id"))
user = db.relationship("User")
video_id = db.Column(db.Integer, db.ForeignKey("video.id"))
video = db.relationship("Video")

db.UniqueConstraint("user_id", "video_id")

opened = db.Column(db.DateTime)

liked = db.Column(db.Boolean)
Expand Down Expand Up @@ -69,10 +74,9 @@ def find_by_video(cls, video: Video):

@classmethod
def find(cls, user: User, video: Video):
try:
return cls.query.filter_by(user=user, video=video).one()
except NoResultFound:
return None
# .first() (not .one()) so that pre-existing duplicate rows never raise
# MultipleResultsFound; the unique constraint prevents new dups going forward.
return cls.query.filter_by(user=user, video=video).first()

@classmethod
def find_or_create(
Expand All @@ -84,27 +88,31 @@ def find_or_create(
liked=None,
playback_position=None,
):
existing = cls.query.filter_by(user=user, video=video).first()
if existing:
return existing
try:
return cls.query.filter_by(user=user, video=video).one()
except NoResultFound:
try:
new = cls(
user,
video,
opened=opened,
liked=liked,
playback_position=playback_position,
)
session.add(new)
session.commit()
return new
except Exception as e:
from sentry_sdk import capture_exception

capture_exception(e)
print("Seems we avoided a race condition")
session.rollback()
return cls.query.filter_by(user=user, video=video).one()
new = cls(
user,
video,
opened=opened,
liked=liked,
playback_position=playback_position,
)
session.add(new)
session.commit()
return new
except IntegrityError:
# Concurrent insert hit the UNIQUE(user_id, video_id) constraint -- not a bug.
session.rollback()
logger.info("UserVideo race avoided: returning row inserted by concurrent request")
return cls.query.filter_by(user=user, video=video).first()
except Exception as e:
from sentry_sdk import capture_exception

capture_exception(e)
session.rollback()
return cls.query.filter_by(user=user, video=video).first()

@classmethod
def all_liked_videos_of_user(cls, user):
Expand Down
16 changes: 15 additions & 1 deletion zeeguu/core/model/video.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,15 @@ def find_or_create(
video_unique_key,
language,
upload_index=True,
captions=None,
enforce_language=True,
enforce_caption_length=True,
):
"""captions: already-normalized {text, captions} dict from a client (browser
extension / iOS WKWebView), used in place of the server-side fetch. For
user-shared videos, callers pass enforce_language=False and
enforce_caption_length=False (the user chose the video, so we don't reject
it on language detection or caption length)."""
from zeeguu.core.elastic.indexing import index_video

# Import here to avoid circular dependency:
Expand All @@ -111,7 +119,13 @@ def find_or_create(
return video

try:
video_info = fetch_video_info(video_unique_key, language)
video_info = fetch_video_info(
video_unique_key,
language,
provided_captions=captions,
enforce_language=enforce_language,
enforce_caption_length=enforce_caption_length,
)
except ValueError as e:
print(f"Error fetching video info for {video_unique_key}: {e}")
return None
Expand Down
71 changes: 66 additions & 5 deletions zeeguu/core/youtube_api/youtube_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,66 @@ def get_video_unique_keys(lang, category_id=None, topic_id=None, max_results=50)
return video_ids


def fetch_video_info(video_unique_key, lang):
# Character class and length of a YouTube video id (11 URL-safe base64 characters).
YOUTUBE_ID_RE = r"[0-9A-Za-z_-]{11}"


def extract_youtube_video_id(url):
    """Parse the 11-char video id from common YouTube URL shapes or a bare id.

    Recognised shapes: ``watch?v=``, ``youtu.be/``, ``/shorts/``, ``/embed/``,
    ``/live/``, or a string that is exactly an 11-character id.

    Args:
        url: the URL or bare id; surrounding whitespace is ignored.

    Returns:
        The video id, or ``None`` when ``url`` is empty, not a string, or holds
        no token of exactly 11 id characters after one of the known markers.
    """
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    # The negative lookahead rejects tokens longer than 11 characters; without
    # it an over-long token would be silently truncated to a wrong id.
    m = re.search(
        rf"(?:v=|/shorts/|/embed/|/live/|youtu\.be/)({YOUTUBE_ID_RE})(?![0-9A-Za-z_-])",
        url,
    )
    if m:
        return m.group(1)
    if re.fullmatch(YOUTUBE_ID_RE, url):
        return url
    return None


def normalize_caption_list(caption_list):
    """Convert client-supplied caption segments into the pipeline's caption shape.

    The segments are extracted client-side (browser extension / iOS WKWebView)
    from YouTube's authorized player, sidestepping the server-side fetch that
    YouTube blocks. Because they are untrusted input, malformed segments are
    skipped instead of raising.

    Args:
        caption_list: list of ``{time_start, time_end, text}`` dicts, times in
            milliseconds.

    Returns:
        ``{"text": <cleaned texts joined by newlines>, "captions": [...]}`` where
        each caption has integer ``time_start`` / ``time_end`` and the cleaned
        ``text``; or ``None`` when the input is empty or no segment is usable.
    """
    if not caption_list:
        return None
    normalized = []
    full_text = []
    for c in caption_list:
        # Anything that is not a JSON object cannot carry the expected fields.
        if not isinstance(c, dict):
            continue
        raw_text = c.get("text", "")
        if not isinstance(raw_text, str):
            continue
        clean = text_cleaner(raw_text)
        if not clean.strip():
            continue
        try:
            time_start = int(c["time_start"])
            time_end = int(c["time_end"])
        except (KeyError, TypeError, ValueError, OverflowError):
            # Missing, null, non-numeric or non-finite timestamps: drop the segment.
            continue
        normalized.append(
            {
                "time_start": time_start,
                "time_end": time_end,
                "text": clean,
            }
        )
        full_text.append(clean)
    if not normalized:
        return None
    return {"text": "\n".join(full_text), "captions": normalized}


def fetch_video_info(
video_unique_key,
lang,
provided_captions=None,
enforce_language=True,
enforce_caption_length=True,
):
"""
video_unique_key is the video id, e.g. "8-GrLwHK8SQ"


provided_captions: already-normalized {text, captions} dict (see
normalize_caption_list); when given, used in place of the server-side fetch.
enforce_language / enforce_caption_length default True for crawling, but are
relaxed for user-shared videos (the user already chose the video, so we don't
reject on language/length).
"""

def _get_thumbnail(item):
Expand Down Expand Up @@ -155,18 +210,24 @@ def _get_thumbnail(item):
video_info["broken"] = VIDEO_IS_MISSING_DURATION
return video_info

if not is_video_language_correct(
if enforce_language and not is_video_language_correct(
video_info["title"], video_info["description"], lang
):
print(f"Video {video_unique_key} is not in the expected language {lang}.")
video_info["broken"] = NOT_IN_EXPECTED_LANGUAGE
return video_info

captions = get_captions_from_json(video_unique_key, lang)
if provided_captions is not None:
captions = provided_captions
else:
captions = get_captions_from_json(video_unique_key, lang)

if captions is None:
print(f"Could not fetch captions for video {video_unique_key} in {lang}")
video_info["broken"] = NO_CAPTIONS_AVAILABLE
elif is_captions_too_short(captions["text"], video_info["duration"]):
elif enforce_caption_length and is_captions_too_short(
captions["text"], video_info["duration"]
):
video_info["broken"] = CAPTIONS_TOO_SHORT
else:
video_info["text"] = captions["text"]
Expand Down
Loading