From e2d179c76cc515cc1c99438dbccff9cca83d1f5d Mon Sep 17 00:00:00 2001 From: Mircea Lungu Date: Tue, 26 May 2026 17:34:21 +0200 Subject: [PATCH 1/3] fix(video): dedupe and enforce UNIQUE(user_id, video_id) on user_video UserVideo declared its unique constraint as a bare db.UniqueConstraint(...) expression outside __table_args__, so no DDL was ever emitted. Concurrent first-open requests (/user_video + /video_opened fire together on reader load) both INSERTed, producing duplicate rows; UserVideo.find/find_or_create then 500'd with MultipleResultsFound and the video became permanently un-openable. - Move the constraint into __table_args__ (also fixes the table_args -> __table_args__ typo that silently dropped the collation). - find/find_or_create use .first() not .one() so pre-existing dups never raise; the rollback-and-requery race handler now actually fires once the unique index exists. - Migration dedupes existing rows (keep lowest id) then adds the unique key. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...-05-26-a--dedupe-and-unique-user-video.sql | 21 ++++++++ zeeguu/core/model/user_video.py | 54 +++++++++---------- 2 files changed, 48 insertions(+), 27 deletions(-) create mode 100644 tools/migrations/26-05-26-a--dedupe-and-unique-user-video.sql diff --git a/tools/migrations/26-05-26-a--dedupe-and-unique-user-video.sql b/tools/migrations/26-05-26-a--dedupe-and-unique-user-video.sql new file mode 100644 index 000000000..d70d841e8 --- /dev/null +++ b/tools/migrations/26-05-26-a--dedupe-and-unique-user-video.sql @@ -0,0 +1,21 @@ +-- Fix: user_video never had a real UNIQUE(user_id, video_id) constraint. The model declared it +-- as a bare `db.UniqueConstraint(...)` expression (a no-op outside __table_args__), so no DDL was +-- ever emitted. As a result, concurrent first-open requests (/user_video + /video_opened fire +-- together on reader load) both INSERTed a row, producing duplicates -- after which +-- UserVideo.find / find_or_create (which used .one()) 500'd with MultipleResultsFound. +-- +-- Dedupe existing rows (keep the lowest id per user+video), then add the missing unique key so +-- the race never recurs (the find_or_create rollback-and-requery handler now actually fires). + +DELETE uv FROM user_video uv +JOIN ( + SELECT user_id, video_id, MIN(id) AS keep_id + FROM user_video + GROUP BY user_id, video_id + HAVING COUNT(*) > 1 +) d ON uv.user_id = d.user_id + AND uv.video_id = d.video_id + AND uv.id <> d.keep_id; + +ALTER TABLE user_video + ADD UNIQUE KEY uq_user_video_user_video (user_id, video_id); diff --git a/zeeguu/core/model/user_video.py b/zeeguu/core/model/user_video.py index e534813c4..e075e90de 100644 --- a/zeeguu/core/model/user_video.py +++ b/zeeguu/core/model/user_video.py @@ -12,7 +12,10 @@ class UserVideo(db.Model): __tablename__ = "user_video" - table_args = {"mysql_collate": "utf8_bin"} + __table_args__ = ( + db.UniqueConstraint("user_id", "video_id"), + {"mysql_collate": "utf8_bin"}, + ) id = db.Column(db.Integer, primary_key=True) user_id = db.Column(db.Integer, db.ForeignKey("user.id")) @@ -20,8 +23,6 @@ class UserVideo(db.Model): video_id = db.Column(db.Integer, db.ForeignKey("video.id")) video = db.relationship("Video") - db.UniqueConstraint("user_id", "video_id") - opened = db.Column(db.DateTime) liked = db.Column(db.Boolean) @@ -69,10 +70,9 @@ def find_by_video(cls, video: Video): @classmethod def find(cls, user: User, video: Video): - try: - return cls.query.filter_by(user=user, video=video).one() - except NoResultFound: - return None + # .first() (not .one()) so that pre-existing duplicate rows never raise + # MultipleResultsFound; the unique constraint prevents new dups going forward. + return cls.query.filter_by(user=user, video=video).first() @classmethod def find_or_create( @@ -84,27 +84,27 @@ def find_or_create( liked=None, playback_position=None, ): + existing = cls.query.filter_by(user=user, video=video).first() + if existing: + return existing try: - return cls.query.filter_by(user=user, video=video).one() - except NoResultFound: - try: - new = cls( - user, - video, - opened=opened, - liked=liked, - playback_position=playback_position, - ) - session.add(new) - session.commit() - return new - except Exception as e: - from sentry_sdk import capture_exception - - capture_exception(e) - print("Seems we avoided a race condition") - session.rollback() - return cls.query.filter_by(user=user, video=video).one() + new = cls( + user, + video, + opened=opened, + liked=liked, + playback_position=playback_position, + ) + session.add(new) + session.commit() + return new + except Exception as e: + from sentry_sdk import capture_exception + + capture_exception(e) + print("Seems we avoided a race condition") + session.rollback() + return cls.query.filter_by(user=user, video=video).first() @classmethod def all_liked_videos_of_user(cls, user): From 1786819dbcd3ce2b7504760af2a8a43818790e89 Mon Sep 17 00:00:00 2001 From: Mircea Lungu Date: Tue, 26 May 2026 17:34:28 +0200 Subject: [PATCH 2/3] feat(video): accept client-extracted captions via /video_upload/create Enables "share a video to Zeeguu": the client (browser extension / iOS WKWebView) extracts captions from YouTube's authorized player and POSTs them, sidestepping the server-side caption fetch YouTube blocks from datacenter IPs (and which now requires a PO token). The server fetches only metadata (Data API key, key-authenticated, not IP-blocked) and creates the Video + Caption rows; the existing /user_video reader serves them unchanged. - Video.find_or_create(..., captions=, enforce_language=False, enforce_caption_length=False): ingest supplied captions and skip the crawler-era reject filters for user-shared videos (the user chose the video). Defaults are unchanged, so the crawler path behaves identically. - youtube_api: extract_youtube_video_id (watch?v= / youtu.be / shorts / embed / bare id) and normalize_caption_list (unescape entities, drop empty segments); fetch_video_info gains provided_captions + enforce_* flags. - New POST /video_upload/create endpoint, registered in the blueprint. Co-Authored-By: Claude Opus 4.7 (1M context) --- zeeguu/api/endpoints/__init__.py | 1 + zeeguu/api/endpoints/video_upload.py | 79 ++++++++++++++++++++++++++ zeeguu/core/model/video.py | 15 ++++- zeeguu/core/youtube_api/youtube_api.py | 70 +++++++++++++++++++++-- 4 files changed, 159 insertions(+), 6 deletions(-) create mode 100644 zeeguu/api/endpoints/video_upload.py diff --git a/zeeguu/api/endpoints/__init__.py b/zeeguu/api/endpoints/__init__.py index 4d3b2777d..111eb57c5 100644 --- a/zeeguu/api/endpoints/__init__.py +++ b/zeeguu/api/endpoints/__init__.py @@ -41,6 +41,7 @@ from .listening_sessions import * from . import user_video from . import user_watching_session +from . import video_upload from . import audio_lessons from . import article_simplification from . import generated_examples diff --git a/zeeguu/api/endpoints/video_upload.py b/zeeguu/api/endpoints/video_upload.py new file mode 100644 index 000000000..e43f9b97d --- /dev/null +++ b/zeeguu/api/endpoints/video_upload.py @@ -0,0 +1,79 @@ +"""Endpoint for sharing a single video to Zeeguu for interactive viewing. + +Mirrors article_upload, but for video. The client (browser extension / iOS WKWebView) +extracts the captions from YouTube's authorized player and hands them to us, sidestepping +the server-side caption fetch that YouTube blocks from datacenter IPs. We fetch only +metadata (via the Data API key, which is key-authenticated and not IP-blocked) and create +the Video + Caption rows, then the client opens the existing /user_video reader. +""" +import flask +from flask import request +from sqlalchemy.orm.exc import NoResultFound + +from zeeguu.core.model import User, Language +from zeeguu.core.model.video import Video +from zeeguu.core.youtube_api.youtube_api import extract_youtube_video_id +from zeeguu.api.utils.json_result import json_result +from zeeguu.api.utils.route_wrappers import cross_domain, requires_session + +from . import api, db_session + + +def _payload(): + """Read fields from a JSON body, falling back to form for scalars.""" + data = request.get_json(silent=True) or {} + + def field(name): + value = data.get(name) + if value is None: + value = request.form.get(name) + return value + + return data, field + + +@api.route("/video_upload/create", methods=["POST"]) +@cross_domain +@requires_session +def video_upload_create(): + user = User.find_by_id(flask.g.user_id) + + data, field = _payload() + + video_unique_key = (field("video_unique_key") or "").strip() + if not video_unique_key: + video_unique_key = extract_youtube_video_id(field("url") or "") + if not video_unique_key: + flask.abort(400, "A YouTube url or video_unique_key is required") + + lang_code = (field("language") or "").strip() + if not lang_code: + flask.abort(400, "language required") + try: + Language.find(lang_code) + except NoResultFound: + flask.abort(406, "Language not supported") + + # Captions extracted client-side: list of {time_start, time_end, text} (times in ms). + # Optional — if absent we fall through to broken=NO_CAPTIONS and report it below. + captions = data.get("captions") + + video = Video.find_or_create( + db_session, + video_unique_key, + lang_code, + captions=captions, + enforce_language=False, + enforce_caption_length=False, + ) + + if video is None: + flask.abort(422, "Could not fetch video info from YouTube") + if video.broken != 0: + # 1=no captions, 5=missing duration. The client should surface a friendly + # "this video has no subtitles yet" for the no-captions case. + flask.abort(422, f"Video not usable for interactive viewing (code {video.broken})") + + return json_result( + {"video_id": video.id, "video_unique_key": video.video_unique_key} + ) diff --git a/zeeguu/core/model/video.py b/zeeguu/core/model/video.py index d391d8552..4fd5a2cf3 100644 --- a/zeeguu/core/model/video.py +++ b/zeeguu/core/model/video.py @@ -92,7 +92,14 @@ def find_or_create( video_unique_key, language, upload_index=True, + captions=None, + enforce_language=True, + enforce_caption_length=True, ): + """captions: client-extracted segments (browser extension / iOS WKWebView) used + instead of the server-side caption fetch. For user-shared videos, callers pass + enforce_language=False and enforce_caption_length=False (the user chose the + video, so we don't reject it on language detection or caption length).""" from zeeguu.core.elastic.indexing import index_video # Import here to avoid circular dependency: @@ -111,7 +118,13 @@ def find_or_create( return video try: - video_info = fetch_video_info(video_unique_key, language) + video_info = fetch_video_info( + video_unique_key, + language, + provided_captions=captions, + enforce_language=enforce_language, + enforce_caption_length=enforce_caption_length, + ) except ValueError as e: print(f"Error fetching video info for {video_unique_key}: {e}") return None diff --git a/zeeguu/core/youtube_api/youtube_api.py b/zeeguu/core/youtube_api/youtube_api.py index 5250fb47e..a66338d9d 100644 --- a/zeeguu/core/youtube_api/youtube_api.py +++ b/zeeguu/core/youtube_api/youtube_api.py @@ -92,11 +92,65 @@ def get_video_unique_keys(lang, category_id=None, topic_id=None, max_results=50) return video_ids -def fetch_video_info(video_unique_key, lang): +YOUTUBE_ID_RE = r"[0-9A-Za-z_-]{11}" + + +def extract_youtube_video_id(url): + """Parse the 11-char video id from common YouTube URL shapes + (watch?v=, youtu.be/, /shorts/, /embed/, /live/) or a bare id. None if not found.""" + if not url: + return None + url = url.strip() + m = re.search( + rf"(?:v=|/shorts/|/embed/|/live/|youtu\.be/)({YOUTUBE_ID_RE})", url + ) + if m: + return m.group(1) + if re.fullmatch(YOUTUBE_ID_RE, url): + return url + return None + + +def normalize_caption_list(caption_list): + """Turn a client-supplied list of {time_start, time_end, text} segments (times in + milliseconds) into the {text, captions} shape the rest of the pipeline expects. + Used for captions extracted client-side (browser extension / iOS WKWebView) from + YouTube's authorized player, sidestepping the server-side fetch YouTube blocks.""" + if not caption_list: + return None + normalized = [] + full_text = [] + for c in caption_list: + clean = text_cleaner(c.get("text", "")) + if not clean.strip(): + continue + normalized.append( + { + "time_start": int(c["time_start"]), + "time_end": int(c["time_end"]), + "text": clean, + } + ) + full_text.append(clean) + if not normalized: + return None + return {"text": "\n".join(full_text), "captions": normalized} + + +def fetch_video_info( + video_unique_key, + lang, + provided_captions=None, + enforce_language=True, + enforce_caption_length=True, +): """ video_unique_key is the video id, e.g. "8-GrLwHK8SQ" - + provided_captions: client-extracted caption segments; when given, they are used + directly instead of the server-side caption fetch. enforce_language / + enforce_caption_length default True for crawling, but are relaxed for user-shared + videos (the user already chose the video, so we don't reject on language/length). """ def _get_thumbnail(item): @@ -155,18 +209,24 @@ def _get_thumbnail(item): video_info["broken"] = VIDEO_IS_MISSING_DURATION return video_info - if not is_video_language_correct( + if enforce_language and not is_video_language_correct( video_info["title"], video_info["description"], lang ): print(f"Video {video_unique_key} is not in the expected language {lang}.") video_info["broken"] = NOT_IN_EXPECTED_LANGUAGE return video_info - captions = get_captions_from_json(video_unique_key, lang) + if provided_captions is not None: + captions = normalize_caption_list(provided_captions) + else: + captions = get_captions_from_json(video_unique_key, lang) + if captions is None: print(f"Could not fetch captions for video {video_unique_key} in {lang}") video_info["broken"] = NO_CAPTIONS_AVAILABLE - elif is_captions_too_short(captions["text"], video_info["duration"]): + elif enforce_caption_length and is_captions_too_short( + captions["text"], video_info["duration"] + ): video_info["broken"] = CAPTIONS_TOO_SHORT else: video_info["text"] = captions["text"] From 3e7fa10915f0606fc9268ff6cae3970563bdc7f6 Mon Sep 17 00:00:00 2001 From: Mircea Lungu Date: Mon, 1 Jun 2026 16:00:09 +0200 Subject: [PATCH 3/3] review polish: video_upload endpoint, race handler, migration note Address review feedback on the share-to-video ingestion PR: - UserVideo.find_or_create: catch IntegrityError separately from the generic Exception arm. The race against UNIQUE(user_id, video_id) is expected and is now logged via the logger (info), not captured to Sentry. The generic Exception arm still captures real bugs. - video_upload endpoint: * Replace stringified `broken` integer in the 422 message with a per-code BROKEN_CODE_MESSAGES lookup using the named constants from youtube_api (NO_CAPTIONS_AVAILABLE, NOT_IN_EXPECTED_LANGUAGE, DUBBED_AUDIO, CAPTIONS_TOO_SHORT, VIDEO_IS_MISSING_DURATION). * Validate the client-supplied `captions` payload at the endpoint (must be a list, must be non-empty, must contain usable text after normalization) instead of silently ingesting empty/malformed shares. * Lift the normalize_caption_list call out of fetch_video_info up to the endpoint, so fetch_video_info receives an already-normalized {text, captions} dict and the layer boundary is clearer. * Pass upload_index=False to Video.find_or_create so the user's redirect isn't blocked on a synchronous Elasticsearch POST. - Migration: header note about the table-scan/rewrite cost of the dedupe + ALTER for large user_video tables (run during a maintenance window if >1M rows). Co-Authored-By: Claude Opus 4.7 (1M context) --- ...-05-26-a--dedupe-and-unique-user-video.sql | 5 ++ zeeguu/api/endpoints/video_upload.py | 46 +++++++++++++++---- zeeguu/core/model/user_video.py | 10 +++- zeeguu/core/model/video.py | 9 ++-- zeeguu/core/youtube_api/youtube_api.py | 11 +++-- 5 files changed, 62 insertions(+), 19 deletions(-) diff --git a/tools/migrations/26-05-26-a--dedupe-and-unique-user-video.sql b/tools/migrations/26-05-26-a--dedupe-and-unique-user-video.sql index d70d841e8..cb4bccf00 100644 --- a/tools/migrations/26-05-26-a--dedupe-and-unique-user-video.sql +++ b/tools/migrations/26-05-26-a--dedupe-and-unique-user-video.sql @@ -6,6 +6,11 @@ -- -- Dedupe existing rows (keep the lowest id per user+video), then add the missing unique key so -- the race never recurs (the find_or_create rollback-and-requery handler now actually fires). +-- +-- Note on cost: with no index yet on (user_id, video_id), the DELETE...JOIN below does a full +-- scan and GROUP BY on user_video, and the ALTER TABLE that follows rewrites the table to add +-- the unique key. For large tables (>1M rows) this will lock the table for a noticeable window +-- -- prefer to run during a maintenance window or off-peak. DELETE uv FROM user_video uv JOIN ( diff --git a/zeeguu/api/endpoints/video_upload.py b/zeeguu/api/endpoints/video_upload.py index e43f9b97d..934274d80 100644 --- a/zeeguu/api/endpoints/video_upload.py +++ b/zeeguu/api/endpoints/video_upload.py @@ -12,13 +12,30 @@ from zeeguu.core.model import User, Language from zeeguu.core.model.video import Video -from zeeguu.core.youtube_api.youtube_api import extract_youtube_video_id +from zeeguu.core.youtube_api.youtube_api import ( + extract_youtube_video_id, + normalize_caption_list, + NO_CAPTIONS_AVAILABLE, + NOT_IN_EXPECTED_LANGUAGE, + DUBBED_AUDIO, + CAPTIONS_TOO_SHORT, + VIDEO_IS_MISSING_DURATION, +) from zeeguu.api.utils.json_result import json_result from zeeguu.api.utils.route_wrappers import cross_domain, requires_session from . import api, db_session +BROKEN_CODE_MESSAGES = { + NO_CAPTIONS_AVAILABLE: "This video has no subtitles yet", + NOT_IN_EXPECTED_LANGUAGE: "Video is not in the requested language", + DUBBED_AUDIO: "Video audio is dubbed; original captions unavailable", + CAPTIONS_TOO_SHORT: "Video captions are too sparse for interactive reading", + VIDEO_IS_MISSING_DURATION: "Video duration is unavailable", +} + + def _payload(): """Read fields from a JSON body, falling back to form for scalars.""" data = request.get_json(silent=True) or {} @@ -54,25 +71,36 @@ def video_upload_create(): except NoResultFound: flask.abort(406, "Language not supported") - # Captions extracted client-side: list of {time_start, time_end, text} (times in ms). - # Optional — if absent we fall through to broken=NO_CAPTIONS and report it below. - captions = data.get("captions") + # Client-extracted caption segments: list of {time_start, time_end, text} (times in ms). + # Optional -- if absent we fall through to broken=NO_CAPTIONS_AVAILABLE below. + raw_captions = data.get("captions") + if raw_captions is not None: + if not isinstance(raw_captions, list): + flask.abort(400, "captions must be a list") + if len(raw_captions) == 0: + flask.abort(400, "captions list is empty") + + provided_captions = normalize_caption_list(raw_captions) + if raw_captions is not None and provided_captions is None: + flask.abort(400, "captions contained no usable text") video = Video.find_or_create( db_session, video_unique_key, lang_code, - captions=captions, + upload_index=False, # share-flow: don't block the redirect on Elasticsearch + captions=provided_captions, enforce_language=False, enforce_caption_length=False, ) if video is None: flask.abort(422, "Could not fetch video info from YouTube") - if video.broken != 0: - # 1=no captions, 5=missing duration. The client should surface a friendly - # "this video has no subtitles yet" for the no-captions case. - flask.abort(422, f"Video not usable for interactive viewing (code {video.broken})") + if video.broken: + message = BROKEN_CODE_MESSAGES.get( + video.broken, f"Video not usable (broken={video.broken})" + ) + flask.abort(422, message) return json_result( {"video_id": video.id, "video_unique_key": video.video_unique_key} diff --git a/zeeguu/core/model/user_video.py b/zeeguu/core/model/user_video.py index e075e90de..1e15b9fd2 100644 --- a/zeeguu/core/model/user_video.py +++ b/zeeguu/core/model/user_video.py @@ -1,6 +1,8 @@ +import logging from datetime import datetime from sqlalchemy import or_ +from sqlalchemy.exc import IntegrityError from zeeguu.core.model.db import db from zeeguu.core.model.user import User from zeeguu.core.model.video import Video @@ -9,6 +11,8 @@ from zeeguu.core.model.video_caption_context import VideoCaptionContext from zeeguu.core.util.encoding import datetime_to_json +logger = logging.getLogger(__name__) + class UserVideo(db.Model): __tablename__ = "user_video" @@ -98,11 +102,15 @@ def find_or_create( session.add(new) session.commit() return new + except IntegrityError: + # Concurrent insert hit the UNIQUE(user_id, video_id) constraint -- not a bug. + session.rollback() + logger.info("UserVideo race avoided: returning row inserted by concurrent request") + return cls.query.filter_by(user=user, video=video).first() except Exception as e: from sentry_sdk import capture_exception capture_exception(e) - print("Seems we avoided a race condition") session.rollback() return cls.query.filter_by(user=user, video=video).first() diff --git a/zeeguu/core/model/video.py b/zeeguu/core/model/video.py index 4fd5a2cf3..fc6f73669 100644 --- a/zeeguu/core/model/video.py +++ b/zeeguu/core/model/video.py @@ -96,10 +96,11 @@ def find_or_create( enforce_language=True, enforce_caption_length=True, ): - """captions: client-extracted segments (browser extension / iOS WKWebView) used - instead of the server-side caption fetch. For user-shared videos, callers pass - enforce_language=False and enforce_caption_length=False (the user chose the - video, so we don't reject it on language detection or caption length).""" + """captions: already-normalized {text, captions} dict from a client (browser + extension / iOS WKWebView), used in place of the server-side fetch. For + user-shared videos, callers pass enforce_language=False and + enforce_caption_length=False (the user chose the video, so we don't reject + it on language detection or caption length).""" from zeeguu.core.elastic.indexing import index_video # Import here to avoid circular dependency: diff --git a/zeeguu/core/youtube_api/youtube_api.py b/zeeguu/core/youtube_api/youtube_api.py index a66338d9d..deae905e7 100644 --- a/zeeguu/core/youtube_api/youtube_api.py +++ b/zeeguu/core/youtube_api/youtube_api.py @@ -147,10 +147,11 @@ def fetch_video_info( """ video_unique_key is the video id, e.g. "8-GrLwHK8SQ" - provided_captions: client-extracted caption segments; when given, they are used - directly instead of the server-side caption fetch. enforce_language / - enforce_caption_length default True for crawling, but are relaxed for user-shared - videos (the user already chose the video, so we don't reject on language/length). + provided_captions: already-normalized {text, captions} dict (see + normalize_caption_list); when given, used in place of the server-side fetch. + enforce_language / enforce_caption_length default True for crawling, but are + relaxed for user-shared videos (the user already chose the video, so we don't + reject on language/length). """ def _get_thumbnail(item): @@ -217,7 +218,7 @@ def _get_thumbnail(item): return video_info if provided_captions is not None: - captions = normalize_caption_list(provided_captions) + captions = provided_captions else: captions = get_captions_from_json(video_unique_key, lang)