diff --git a/tools/migrations/26-05-26-a--dedupe-and-unique-user-video.sql b/tools/migrations/26-05-26-a--dedupe-and-unique-user-video.sql new file mode 100644 index 000000000..cb4bccf00 --- /dev/null +++ b/tools/migrations/26-05-26-a--dedupe-and-unique-user-video.sql @@ -0,0 +1,26 @@ +-- Fix: user_video never had a real UNIQUE(user_id, video_id) constraint. The model declared it +-- as a bare `db.UniqueConstraint(...)` expression (a no-op outside __table_args__), so no DDL was +-- ever emitted. As a result, concurrent first-open requests (/user_video + /video_opened fire +-- together on reader load) both INSERTed a row, producing duplicates -- after which +-- UserVideo.find / find_or_create (which used .one()) 500'd with MultipleResultsFound. +-- +-- Dedupe existing rows (keep the lowest id per user+video), then add the missing unique key so +-- the race never recurs (the find_or_create rollback-and-requery handler now actually fires). +-- +-- Note on cost: with no index yet on (user_id, video_id), the DELETE...JOIN below does a full +-- scan and GROUP BY on user_video, and the ALTER TABLE that follows rewrites the table to add +-- the unique key. For large tables (>1M rows) this will lock the table for a noticeable window +-- -- prefer to run during a maintenance window or off-peak. + +DELETE uv FROM user_video uv +JOIN ( + SELECT user_id, video_id, MIN(id) AS keep_id + FROM user_video + GROUP BY user_id, video_id + HAVING COUNT(*) > 1 +) d ON uv.user_id = d.user_id + AND uv.video_id = d.video_id + AND uv.id <> d.keep_id; + +ALTER TABLE user_video + ADD UNIQUE KEY uq_user_video_user_video (user_id, video_id); diff --git a/zeeguu/api/endpoints/__init__.py b/zeeguu/api/endpoints/__init__.py index 4d3b2777d..111eb57c5 100644 --- a/zeeguu/api/endpoints/__init__.py +++ b/zeeguu/api/endpoints/__init__.py @@ -41,6 +41,7 @@ from .listening_sessions import * from . import user_video from . import user_watching_session +from . import video_upload from . import audio_lessons from . import article_simplification from . import generated_examples diff --git a/zeeguu/api/endpoints/video_upload.py b/zeeguu/api/endpoints/video_upload.py new file mode 100644 index 000000000..934274d80 --- /dev/null +++ b/zeeguu/api/endpoints/video_upload.py @@ -0,0 +1,107 @@ +"""Endpoint for sharing a single video to Zeeguu for interactive viewing. + +Mirrors article_upload, but for video. The client (browser extension / iOS WKWebView) +extracts the captions from YouTube's authorized player and hands them to us, sidestepping +the server-side caption fetch that YouTube blocks from datacenter IPs. We fetch only +metadata (via the Data API key, which is key-authenticated and not IP-blocked) and create +the Video + Caption rows, then the client opens the existing /user_video reader. +""" +import flask +from flask import request +from sqlalchemy.orm.exc import NoResultFound + +from zeeguu.core.model import User, Language +from zeeguu.core.model.video import Video +from zeeguu.core.youtube_api.youtube_api import ( + extract_youtube_video_id, + normalize_caption_list, + NO_CAPTIONS_AVAILABLE, + NOT_IN_EXPECTED_LANGUAGE, + DUBBED_AUDIO, + CAPTIONS_TOO_SHORT, + VIDEO_IS_MISSING_DURATION, +) +from zeeguu.api.utils.json_result import json_result +from zeeguu.api.utils.route_wrappers import cross_domain, requires_session + +from . import api, db_session + + +BROKEN_CODE_MESSAGES = { + NO_CAPTIONS_AVAILABLE: "This video has no subtitles yet", + NOT_IN_EXPECTED_LANGUAGE: "Video is not in the requested language", + DUBBED_AUDIO: "Video audio is dubbed; original captions unavailable", + CAPTIONS_TOO_SHORT: "Video captions are too sparse for interactive reading", + VIDEO_IS_MISSING_DURATION: "Video duration is unavailable", +} + + +def _payload(): + """Read fields from a JSON body, falling back to form for scalars.""" + data = request.get_json(silent=True) or {} + + def field(name): + value = data.get(name) + if value is None: + value = request.form.get(name) + return value + + return data, field + + +@api.route("/video_upload/create", methods=["POST"]) +@cross_domain +@requires_session +def video_upload_create(): + user = User.find_by_id(flask.g.user_id) + + data, field = _payload() + + video_unique_key = (field("video_unique_key") or "").strip() + if not video_unique_key: + video_unique_key = extract_youtube_video_id(field("url") or "") + if not video_unique_key: + flask.abort(400, "A YouTube url or video_unique_key is required") + + lang_code = (field("language") or "").strip() + if not lang_code: + flask.abort(400, "language required") + try: + Language.find(lang_code) + except NoResultFound: + flask.abort(406, "Language not supported") + + # Client-extracted caption segments: list of {time_start, time_end, text} (times in ms). + # Optional -- if absent we fall through to broken=NO_CAPTIONS_AVAILABLE below. + raw_captions = data.get("captions") + if raw_captions is not None: + if not isinstance(raw_captions, list): + flask.abort(400, "captions must be a list") + if len(raw_captions) == 0: + flask.abort(400, "captions list is empty") + + provided_captions = normalize_caption_list(raw_captions) + if raw_captions is not None and provided_captions is None: + flask.abort(400, "captions contained no usable text") + + video = Video.find_or_create( + db_session, + video_unique_key, + lang_code, + upload_index=False, # share-flow: don't block the redirect on Elasticsearch + captions=provided_captions, + enforce_language=False, + enforce_caption_length=False, + ) + + if video is None: + flask.abort(422, "Could not fetch video info from YouTube") + if video.broken: + message = BROKEN_CODE_MESSAGES.get( + video.broken, f"Video not usable (broken={video.broken})" + ) + flask.abort(422, message) + + return json_result( + {"video_id": video.id, "video_unique_key": video.video_unique_key} + ) diff --git a/zeeguu/core/model/user_video.py b/zeeguu/core/model/user_video.py index e534813c4..1e15b9fd2 100644 --- a/zeeguu/core/model/user_video.py +++ b/zeeguu/core/model/user_video.py @@ -1,6 +1,8 @@ +import logging from datetime import datetime from sqlalchemy import or_ +from sqlalchemy.exc import IntegrityError from zeeguu.core.model.db import db from zeeguu.core.model.user import User from zeeguu.core.model.video import Video @@ -9,10 +11,15 @@ from zeeguu.core.model.video_caption_context import VideoCaptionContext from zeeguu.core.util.encoding import datetime_to_json +logger = logging.getLogger(__name__) + class UserVideo(db.Model): __tablename__ = "user_video" - table_args = {"mysql_collate": "utf8_bin"} + __table_args__ = ( + db.UniqueConstraint("user_id", "video_id"), + {"mysql_collate": "utf8_bin"}, + ) id = db.Column(db.Integer, primary_key=True) user_id = db.Column(db.Integer, db.ForeignKey("user.id")) @@ -20,8 +27,6 @@ class UserVideo(db.Model): video_id = db.Column(db.Integer, db.ForeignKey("video.id")) video = db.relationship("Video") - db.UniqueConstraint("user_id", "video_id") - opened = db.Column(db.DateTime) liked = db.Column(db.Boolean) @@ -69,10 +74,9 @@ def find_by_video(cls, video: Video): @classmethod def find(cls, user: User, video: Video): - try: - return cls.query.filter_by(user=user, video=video).one() - except NoResultFound: - return None + # .first() (not .one()) so that pre-existing duplicate rows never raise + # MultipleResultsFound; the unique constraint prevents new dups going forward. + return cls.query.filter_by(user=user, video=video).first() @classmethod def find_or_create( @@ -84,27 +88,31 @@ def find_or_create( liked=None, playback_position=None, ): + existing = cls.query.filter_by(user=user, video=video).first() + if existing: + return existing try: - return cls.query.filter_by(user=user, video=video).one() - except NoResultFound: - try: - new = cls( - user, - video, - opened=opened, - liked=liked, - playback_position=playback_position, - ) - session.add(new) - session.commit() - return new - except Exception as e: - from sentry_sdk import capture_exception - - capture_exception(e) - print("Seems we avoided a race condition") - session.rollback() - return cls.query.filter_by(user=user, video=video).one() + new = cls( + user, + video, + opened=opened, + liked=liked, + playback_position=playback_position, + ) + session.add(new) + session.commit() + return new + except IntegrityError: + # Concurrent insert hit the UNIQUE(user_id, video_id) constraint -- not a bug. + session.rollback() + logger.info("UserVideo race avoided: returning row inserted by concurrent request") + return cls.query.filter_by(user=user, video=video).first() + except Exception as e: + from sentry_sdk import capture_exception + + capture_exception(e) + session.rollback() + return cls.query.filter_by(user=user, video=video).first() @classmethod def all_liked_videos_of_user(cls, user): diff --git a/zeeguu/core/model/video.py b/zeeguu/core/model/video.py index d391d8552..fc6f73669 100644 --- a/zeeguu/core/model/video.py +++ b/zeeguu/core/model/video.py @@ -92,7 +92,15 @@ def find_or_create( video_unique_key, language, upload_index=True, + captions=None, + enforce_language=True, + enforce_caption_length=True, ): + """captions: already-normalized {text, captions} dict from a client (browser + extension / iOS WKWebView), used in place of the server-side fetch. For + user-shared videos, callers pass enforce_language=False and + enforce_caption_length=False (the user chose the video, so we don't reject + it on language detection or caption length).""" from zeeguu.core.elastic.indexing import index_video # Import here to avoid circular dependency: @@ -111,7 +119,13 @@ def find_or_create( return video try: - video_info = fetch_video_info(video_unique_key, language) + video_info = fetch_video_info( + video_unique_key, + language, + provided_captions=captions, + enforce_language=enforce_language, + enforce_caption_length=enforce_caption_length, + ) except ValueError as e: print(f"Error fetching video info for {video_unique_key}: {e}") return None diff --git a/zeeguu/core/youtube_api/youtube_api.py b/zeeguu/core/youtube_api/youtube_api.py index 5250fb47e..deae905e7 100644 --- a/zeeguu/core/youtube_api/youtube_api.py +++ b/zeeguu/core/youtube_api/youtube_api.py @@ -92,11 +92,66 @@ def get_video_unique_keys(lang, category_id=None, topic_id=None, max_results=50) return video_ids -def fetch_video_info(video_unique_key, lang): +YOUTUBE_ID_RE = r"[0-9A-Za-z_-]{11}" + + +def extract_youtube_video_id(url): + """Parse the 11-char video id from common YouTube URL shapes + (watch?v=, youtu.be/, /shorts/, /embed/, /live/) or a bare id. None if not found.""" + if not url: + return None + url = url.strip() + m = re.search( + rf"(?:v=|/shorts/|/embed/|/live/|youtu\.be/)({YOUTUBE_ID_RE})", url + ) + if m: + return m.group(1) + if re.fullmatch(YOUTUBE_ID_RE, url): + return url + return None + + +def normalize_caption_list(caption_list): + """Turn a client-supplied list of {time_start, time_end, text} segments (times in + milliseconds) into the {text, captions} shape the rest of the pipeline expects. + Used for captions extracted client-side (browser extension / iOS WKWebView) from + YouTube's authorized player, sidestepping the server-side fetch YouTube blocks.""" + if not caption_list: + return None + normalized = [] + full_text = [] + for c in caption_list: + clean = text_cleaner(c.get("text", "")) + if not clean.strip(): + continue + normalized.append( + { + "time_start": int(c["time_start"]), + "time_end": int(c["time_end"]), + "text": clean, + } + ) + full_text.append(clean) + if not normalized: + return None + return {"text": "\n".join(full_text), "captions": normalized} + + +def fetch_video_info( + video_unique_key, + lang, + provided_captions=None, + enforce_language=True, + enforce_caption_length=True, +): """ video_unique_key is the video id, e.g. "8-GrLwHK8SQ" - + provided_captions: already-normalized {text, captions} dict (see + normalize_caption_list); when given, used in place of the server-side fetch. + enforce_language / enforce_caption_length default True for crawling, but are + relaxed for user-shared videos (the user already chose the video, so we don't + reject on language/length). """ def _get_thumbnail(item): @@ -155,18 +210,24 @@ def _get_thumbnail(item): video_info["broken"] = VIDEO_IS_MISSING_DURATION return video_info - if not is_video_language_correct( + if enforce_language and not is_video_language_correct( video_info["title"], video_info["description"], lang ): print(f"Video {video_unique_key} is not in the expected language {lang}.") video_info["broken"] = NOT_IN_EXPECTED_LANGUAGE return video_info - captions = get_captions_from_json(video_unique_key, lang) + if provided_captions is not None: + captions = provided_captions + else: + captions = get_captions_from_json(video_unique_key, lang) + if captions is None: print(f"Could not fetch captions for video {video_unique_key} in {lang}") video_info["broken"] = NO_CAPTIONS_AVAILABLE - elif is_captions_too_short(captions["text"], video_info["duration"]): + elif enforce_caption_length and is_captions_too_short( + captions["text"], video_info["duration"] + ): video_info["broken"] = CAPTIONS_TOO_SHORT else: video_info["text"] = captions["text"]