From 1a1dab3daab5dfd3071855f5e7ca221df79891de Mon Sep 17 00:00:00 2001 From: Mircea Lungu Date: Sun, 31 May 2026 21:08:26 +0200 Subject: [PATCH 1/2] feat(video): caption_translation_set + caption_translation models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tables to hold per-(video, target_language, target_cefr) translated subtitles for a shared video. Per-segment translation preserves the original Caption.time_start/time_end so the reader's timing/sync logic is unchanged — only the rendered text is in the learner's language. - caption_translation_set: the bundle, with status (pending/translating/ready/error) for the async job, error_message, and a UNIQUE(video_id, target_language_id, cefr_level) so a second request for the same target deduplicates instead of re-translating. - caption_translation: one row per original Caption inside a set, pointing at a NewText row for the translated content. UNIQUE(set_id, caption_id) so retried jobs resume cleanly. Mirrors the DailyAudioLesson ↔ DailyAudioLessonSegment shape already in the codebase. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../26-05-31-a--add_caption_translation.sql | 37 ++++++ zeeguu/core/model/__init__.py | 2 + zeeguu/core/model/caption_translation.py | 48 ++++++++ zeeguu/core/model/caption_translation_set.py | 116 ++++++++++++++++++ 4 files changed, 203 insertions(+) create mode 100644 tools/migrations/26-05-31-a--add_caption_translation.sql create mode 100644 zeeguu/core/model/caption_translation.py create mode 100644 zeeguu/core/model/caption_translation_set.py diff --git a/tools/migrations/26-05-31-a--add_caption_translation.sql b/tools/migrations/26-05-31-a--add_caption_translation.sql new file mode 100644 index 00000000..64442394 --- /dev/null +++ b/tools/migrations/26-05-31-a--add_caption_translation.sql @@ -0,0 +1,37 @@ +-- v1.5: translated captions for a shared video. 
+-- A `caption_translation_set` is a per-(video, target_language, target_cefr) bundle that owns +-- per-original-caption translated text rows. Timing stays on the parent `caption` rows so we +-- don't duplicate it (the player aligns by original time_start/time_end). Status drives the +-- async translation job (mirrors the daily-audio-lesson status pattern). + +CREATE TABLE `caption_translation_set` ( + `id` int NOT NULL AUTO_INCREMENT, + `video_id` int NOT NULL, + `target_language_id` int NOT NULL, + `cefr_level` enum('A1','A2','B1','B2','C1','C2') NOT NULL, + `status` enum('pending','translating','ready','error') NOT NULL DEFAULT 'pending', + `error_message` varchar(500) DEFAULT NULL, + `created_at` datetime NOT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `uq_caption_translation_set_video_lang_cefr` + (`video_id`, `target_language_id`, `cefr_level`), + CONSTRAINT `fk_caption_translation_set_video` + FOREIGN KEY (`video_id`) REFERENCES `video` (`id`), + CONSTRAINT `fk_caption_translation_set_target_language` + FOREIGN KEY (`target_language_id`) REFERENCES `language` (`id`) +); + +CREATE TABLE `caption_translation` ( + `id` int NOT NULL AUTO_INCREMENT, + `set_id` int NOT NULL, + `caption_id` int NOT NULL, + `text_id` int NOT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `uq_caption_translation_set_caption` (`set_id`, `caption_id`), + CONSTRAINT `fk_caption_translation_set` + FOREIGN KEY (`set_id`) REFERENCES `caption_translation_set` (`id`) ON DELETE CASCADE, + CONSTRAINT `fk_caption_translation_caption` + FOREIGN KEY (`caption_id`) REFERENCES `caption` (`id`), + CONSTRAINT `fk_caption_translation_text` + FOREIGN KEY (`text_id`) REFERENCES `new_text` (`id`) +); diff --git a/zeeguu/core/model/__init__.py b/zeeguu/core/model/__init__.py index e6533ae9..87fce1f3 100644 --- a/zeeguu/core/model/__init__.py +++ b/zeeguu/core/model/__init__.py @@ -88,6 +88,8 @@ from .yt_channel import YTChannel from .video import Video from .caption import Caption +from .caption_translation_set 
import CaptionTranslationSet +from .caption_translation import CaptionTranslation from .video_tag import VideoTag from .video_tag_map import VideoTagMap from .video_caption_context import VideoCaptionContext diff --git a/zeeguu/core/model/caption_translation.py b/zeeguu/core/model/caption_translation.py new file mode 100644 index 00000000..f1c4c53f --- /dev/null +++ b/zeeguu/core/model/caption_translation.py @@ -0,0 +1,48 @@ +"""A single translated caption — translated text for one original Caption inside a set. + +Timing (time_start / time_end) is read from the original Caption; we only store the new text. +""" +from zeeguu.core.model.db import db +from zeeguu.core.model.caption import Caption +from zeeguu.core.model.new_text import NewText + + +class CaptionTranslation(db.Model): + __tablename__ = "caption_translation" + __table_args__ = ( + db.UniqueConstraint("set_id", "caption_id", name="uq_caption_translation_set_caption"), + {"mysql_collate": "utf8_bin"}, + ) + + id = db.Column(db.Integer, primary_key=True) + + set_id = db.Column( + db.Integer, db.ForeignKey("caption_translation_set.id"), nullable=False + ) + translation_set = db.relationship( + "CaptionTranslationSet", back_populates="translations" + ) + + caption_id = db.Column(db.Integer, db.ForeignKey(Caption.id), nullable=False) + caption = db.relationship(Caption, foreign_keys="CaptionTranslation.caption_id") + + text_id = db.Column(db.Integer, db.ForeignKey(NewText.id), nullable=False) + text = db.relationship(NewText, foreign_keys="CaptionTranslation.text_id") + + def __init__(self, translation_set, caption, text): + self.translation_set = translation_set + self.caption = caption + self.text = text + + def __repr__(self): + return f"" + + def get_content(self): + return self.text.get_content() + + @classmethod + def create(cls, session, translation_set, caption, translated_text: str): + text_row = NewText.find_or_create(session, translated_text, False) + row = cls(translation_set=translation_set, 
caption=caption, text=text_row) + session.add(row) + return row diff --git a/zeeguu/core/model/caption_translation_set.py b/zeeguu/core/model/caption_translation_set.py new file mode 100644 index 00000000..fe7b3dd8 --- /dev/null +++ b/zeeguu/core/model/caption_translation_set.py @@ -0,0 +1,116 @@ +"""A per-(video, target_language, target_cefr) bundle of translated captions. + +Owns the async-job status so the reader can poll while translation runs in the background. +Timing is NOT stored here — it stays on the original Caption rows so we don't duplicate it. +""" +from datetime import datetime + +from sqlalchemy.orm.exc import NoResultFound + +from zeeguu.core.model.db import db +from zeeguu.core.model.language import Language +from zeeguu.core.model.video import Video + + +CEFR_LEVELS = ("A1", "A2", "B1", "B2", "C1", "C2") +STATUS_PENDING = "pending" +STATUS_TRANSLATING = "translating" +STATUS_READY = "ready" +STATUS_ERROR = "error" + + +class CaptionTranslationSet(db.Model): + __tablename__ = "caption_translation_set" + __table_args__ = ( + db.UniqueConstraint( + "video_id", + "target_language_id", + "cefr_level", + name="uq_caption_translation_set_video_lang_cefr", + ), + {"mysql_collate": "utf8_bin"}, + ) + + id = db.Column(db.Integer, primary_key=True) + + video_id = db.Column(db.Integer, db.ForeignKey(Video.id), nullable=False) + video = db.relationship(Video) + + target_language_id = db.Column(db.Integer, db.ForeignKey(Language.id), nullable=False) + target_language = db.relationship(Language) + + cefr_level = db.Column( + db.Enum(*CEFR_LEVELS, name="cefr_level_enum"), nullable=False + ) + + status = db.Column( + db.Enum( + STATUS_PENDING, STATUS_TRANSLATING, STATUS_READY, STATUS_ERROR, + name="caption_translation_set_status", + ), + nullable=False, + default=STATUS_PENDING, + ) + error_message = db.Column(db.String(500)) + created_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) + + translations = db.relationship( + "CaptionTranslation", 
back_populates="translation_set", cascade="all, delete-orphan" + ) + + def __init__(self, video, target_language, cefr_level): + self.video = video + self.target_language = target_language + self.cefr_level = cefr_level + self.status = STATUS_PENDING + self.created_at = datetime.utcnow() + + def __repr__(self): + return ( + f"" + ) + + def mark_translating(self): + self.status = STATUS_TRANSLATING + self.error_message = None + + def mark_ready(self): + self.status = STATUS_READY + self.error_message = None + + def mark_error(self, message: str): + self.status = STATUS_ERROR + self.error_message = (message or "")[:500] + + def as_dictionary(self): + return { + "id": self.id, + "video_id": self.video_id, + "target_language": self.target_language.code, + "cefr_level": self.cefr_level, + "status": self.status, + "error_message": self.error_message, + } + + @classmethod + def find_or_create(cls, session, video, target_language, cefr_level): + """Idempotent: a second request for the same (video, lang, cefr) returns the existing + set so callers can poll status without re-translating.""" + try: + return ( + cls.query.filter_by( + video_id=video.id, + target_language_id=target_language.id, + cefr_level=cefr_level, + ).one() + ) + except NoResultFound: + new_set = cls(video=video, target_language=target_language, cefr_level=cefr_level) + session.add(new_set) + session.commit() + return new_set + + @classmethod + def find_by_id(cls, set_id: int): + return cls.query.filter_by(id=set_id).first() From 2df32876b6831dec442e0b56f7714a8a067de887 Mon Sep 17 00:00:00 2001 From: Mircea Lungu Date: Sun, 31 May 2026 21:08:42 +0200 Subject: [PATCH 2/2] feat(video): translate a shared video's captions to the learner's language MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per the v1.5 plan: when a learner shares a YouTube video whose captions are in a different language, offer to translate the captions to the learner's language at their CEFR 
level, preserving the original per-segment timing so the existing interactive reader (tap-to-translate, bookmarks, time-synced highlight) keeps working unchanged. Audio is unaffected; only the reading surface changes. - New service core/llm_services/caption_translation_service.translate_set(set_id): batches ~30 captions per Haiku call with structured JSON output (numeric markers), falls back to per-caption translation when a batch's parsing or alignment fails so partial LLM failures degrade gracefully instead of zeroing the set. Reuses the existing haiku_client. - New endpoints in api/endpoints/caption_translation.py: - POST /video/{video_id}/translate_captions — find_or_create the set, kick off the background job via run_in_background, return 202 + set dict. Idempotent. - GET /video/{video_id}/translate_captions/status?set_id={set_id} — for the reader's polling loop. - Extended /user_video to accept optional caption_set_id; when the set is ready and belongs to the requested video, Video.video_info substitutes translated text + retokenises in the target language. context_identifier still references the original caption id so bookmark anchoring is stable across track switches. If the set isn't ready, we silently serve the original captions — the reader's separate status poll drives the eventual refetch. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- zeeguu/api/endpoints/__init__.py | 1 + zeeguu/api/endpoints/caption_translation.py | 91 +++++++++ zeeguu/api/endpoints/user_video.py | 25 ++- .../caption_translation_service.py | 183 ++++++++++++++++++ zeeguu/core/model/user_video.py | 7 +- zeeguu/core/model/video.py | 27 ++- 6 files changed, 328 insertions(+), 6 deletions(-) create mode 100644 zeeguu/api/endpoints/caption_translation.py create mode 100644 zeeguu/core/llm_services/caption_translation_service.py diff --git a/zeeguu/api/endpoints/__init__.py b/zeeguu/api/endpoints/__init__.py index 4d3b2777..72ec971f 100644 --- a/zeeguu/api/endpoints/__init__.py +++ b/zeeguu/api/endpoints/__init__.py @@ -41,6 +41,7 @@ from .listening_sessions import * from . import user_video from . import user_watching_session +from . import caption_translation from . import audio_lessons from . import article_simplification from . import generated_examples diff --git a/zeeguu/api/endpoints/caption_translation.py b/zeeguu/api/endpoints/caption_translation.py new file mode 100644 index 00000000..2bc3dad0 --- /dev/null +++ b/zeeguu/api/endpoints/caption_translation.py @@ -0,0 +1,91 @@ +"""Endpoints for the per-video translated-captions feature (v1.5 of share-to-video). + +POST kicks off (or returns the existing) per-(video, target_language, cefr) translation set +and runs the LLM job in a background thread; GET polls the set's status. Once `ready`, the +reader calls /user_video?caption_set_id=... to get the translated caption block. 
+""" +import flask +from flask import request +from sqlalchemy.orm.exc import NoResultFound + +from zeeguu.core.model import User, Language +from zeeguu.core.model.video import Video +from zeeguu.core.model.caption_translation_set import ( + CaptionTranslationSet, + CEFR_LEVELS, + STATUS_READY, +) +from zeeguu.core.llm_services.caption_translation_service import translate_set +from zeeguu.api.utils.background import run_in_background +from zeeguu.api.utils.json_result import json_result +from zeeguu.api.utils.route_wrappers import cross_domain, requires_session + +from . import api, db_session + + +def _resolve_video_or_404(video_id: int) -> Video: + video = Video.find_by_id(video_id) + if video is None: + flask.abort(404, "video not found") + return video + + +def _resolve_language_or_406(code: str) -> Language: + try: + return Language.find(code) + except NoResultFound: + flask.abort(406, "Language not supported") + + +def _read_body(): + data = request.get_json(silent=True) or {} + return { + "target_language": (data.get("target_language") or request.form.get("target_language") or "").strip(), + "target_cefr": (data.get("target_cefr") or request.form.get("target_cefr") or "").strip().upper(), + } + + +@api.route("/video//translate_captions", methods=["POST"]) +@cross_domain +@requires_session +def video_translate_captions(video_id): + User.find_by_id(flask.g.user_id) # validates session and existence + video = _resolve_video_or_404(video_id) + + body = _read_body() + if not body["target_language"]: + flask.abort(400, "target_language required") + if body["target_cefr"] not in CEFR_LEVELS: + flask.abort(400, f"target_cefr must be one of {CEFR_LEVELS}") + target_language = _resolve_language_or_406(body["target_language"]) + + if target_language.code == video.language.code: + flask.abort(400, "target_language matches the video's caption language") + + # Idempotent: the second request for the same (video, language, cefr) returns the existing + # set without 
re-translating. If already ready, no background job — caller polls and goes. + translation_set = CaptionTranslationSet.find_or_create( + db_session, video, target_language, body["target_cefr"] + ) + + if translation_set.status != STATUS_READY: + run_in_background(translate_set, translation_set.id) + + return json_result(translation_set.as_dictionary()), 202 + + +@api.route("/video//translate_captions/status", methods=["GET"]) +@cross_domain +@requires_session +def video_translate_captions_status(video_id): + User.find_by_id(flask.g.user_id) + video = _resolve_video_or_404(video_id) + + set_id = request.args.get("set_id") + if not set_id: + flask.abort(400, "set_id required") + translation_set = CaptionTranslationSet.find_by_id(int(set_id)) + if translation_set is None or translation_set.video_id != video.id: + flask.abort(404, "translation set not found for this video") + + return json_result(translation_set.as_dictionary()) diff --git a/zeeguu/api/endpoints/user_video.py b/zeeguu/api/endpoints/user_video.py index b4d92faa..32c3cd31 100644 --- a/zeeguu/api/endpoints/user_video.py +++ b/zeeguu/api/endpoints/user_video.py @@ -1,6 +1,10 @@ import flask from flask import request from zeeguu.core.model import User, UserVideo, Video +from zeeguu.core.model.caption_translation_set import ( + CaptionTranslationSet, + STATUS_READY, +) from zeeguu.api.utils.route_wrappers import cross_domain, requires_session from zeeguu.api.utils.json_result import json_result @@ -24,7 +28,26 @@ def get_user_video(): user = User.find_by_id(flask.g.user_id) new_user_video = UserVideo.find_or_create(db_session, user, video) - return json_result(new_user_video.user_video_info(user, video, with_content=True)) + # Optional translated-caption track. 
If the set isn't ready yet (still translating, errored, + # or doesn't belong to this video) we silently serve the original captions — the reader + # polls the dedicated status endpoint and re-fetches when ready, so the worst UX is a + # one-cycle delay rather than a 4xx during a known-async wait. + translation_set = None + caption_set_id = request.args.get("caption_set_id") + if caption_set_id: + candidate = CaptionTranslationSet.find_by_id(int(caption_set_id)) + if ( + candidate + and candidate.video_id == video.id + and candidate.status == STATUS_READY + ): + translation_set = candidate + + return json_result( + new_user_video.user_video_info( + user, video, with_content=True, translation_set=translation_set + ) + ) # --------------------------------------------------------------------------- diff --git a/zeeguu/core/llm_services/caption_translation_service.py b/zeeguu/core/llm_services/caption_translation_service.py new file mode 100644 index 00000000..679fbd2b --- /dev/null +++ b/zeeguu/core/llm_services/caption_translation_service.py @@ -0,0 +1,183 @@ +"""Translate a video's captions into the learner's target language at their CEFR level. + +Per-segment translation preserves the original `time_start`/`time_end` of each `Caption`, so +the player's timing logic is unchanged — only the rendered text and tokenization differ. + +LLM strategy: batches of ~30 captions per Haiku call (cheap and fast), structured JSON output +keyed by numeric marker; on parse / missing-key failure we fall back to a single-caption call +for the affected items so partial LLM failures degrade gracefully instead of zeroing the set. 
+""" +from __future__ import annotations + +import json +import re +from typing import Iterable, Optional + +from zeeguu.core.model.db import db +from zeeguu.core.model.caption import Caption +from zeeguu.core.model.caption_translation import CaptionTranslation +from zeeguu.core.model.caption_translation_set import CaptionTranslationSet +from zeeguu.core.llm_services.haiku_client import haiku_completion +from zeeguu.logging import log + + +BATCH_SIZE = 30 +BATCH_MAX_TOKENS = 2000 # generous; ~30 short captions translated easily fit +SINGLE_MAX_TOKENS = 200 + + +def _batched(items, n): + for i in range(0, len(items), n): + yield items[i : i + n] + + +def _strip_code_fence(text: str) -> str: + text = text.strip() + if text.startswith("```"): + text = re.sub(r"^```(?:json)?\s*", "", text) + text = re.sub(r"\s*```$", "", text) + return text.strip() + + +def _build_batch_prompt( + captions: list[Caption], source_language: str, target_language: str, cefr: str +) -> str: + numbered = "\n".join(f"[{i + 1}] {c.get_content()}" for i, c in enumerate(captions)) + return f"""Translate each of the following {source_language} subtitle lines into {target_language} at CEFR level {cefr}. + +Rules: +- Preserve meaning faithfully; favor natural, idiomatic {target_language}. +- Adapt vocabulary and grammar to CEFR {cefr} (simpler words for A1-A2, intermediate for B1-B2, advanced for C1-C2). +- One line per input line — do NOT merge or split lines. +- Output STRICTLY a single JSON object, nothing else (no markdown fences, no commentary): +{{"1": "translation of line 1", "2": "translation of line 2", ...}} + +Lines to translate: +{numbered} +""" + + +def _build_single_prompt( + text: str, source_language: str, target_language: str, cefr: str +) -> str: + return ( + f"Translate the following {source_language} subtitle into {target_language} " + f"at CEFR level {cefr}. 
Output ONLY the translation — no quotes, no commentary.\n\n" + f"{text}" + ) + + +def _translate_batch( + captions: list[Caption], source_language: str, target_language: str, cefr: str +) -> dict[int, str]: + """Returns {1-based index in `captions` -> translation}. Missing keys mean the LLM didn't + provide a translation for that line; callers should fall back per-caption for those.""" + if not captions: + return {} + prompt = _build_batch_prompt(captions, source_language, target_language, cefr) + raw = haiku_completion(prompt, max_tokens=BATCH_MAX_TOKENS, temperature=0.1) + if not raw: + return {} + try: + # `strict=False` because LLMs sometimes embed literal newlines in JSON string values + # (which `json.loads` strict mode rejects). Matches the simplification_service fix. + parsed = json.loads(_strip_code_fence(raw), strict=False) + except (json.JSONDecodeError, ValueError) as e: + log(f"[caption_translation] batch JSON parse failed: {e}") + return {} + if not isinstance(parsed, dict): + return {} + out: dict[int, str] = {} + for k, v in parsed.items(): + try: + idx = int(str(k).strip()) + except ValueError: + continue + if isinstance(v, str) and v.strip(): + out[idx] = v.strip() + return out + + +def _translate_one( + text: str, source_language: str, target_language: str, cefr: str +) -> Optional[str]: + raw = haiku_completion( + _build_single_prompt(text, source_language, target_language, cefr), + max_tokens=SINGLE_MAX_TOKENS, + temperature=0.1, + ) + if not raw: + return None + cleaned = raw.strip().strip('"').strip() + return cleaned or None + + +def translate_set(set_id: int) -> None: + """Background-job entry point. Translates every caption in the set's video and stores the + rows. 
Idempotent at the row level: existing CaptionTranslations for the set are skipped so + a retried run resumes instead of duplicating.""" + translation_set = CaptionTranslationSet.find_by_id(set_id) + if translation_set is None: + log(f"[caption_translation] no set with id {set_id}") + return + + try: + translation_set.mark_translating() + db.session.commit() + + video = translation_set.video + source_language = video.language.code + target_language = translation_set.target_language.code + cefr = translation_set.cefr_level + + captions = sorted(video.captions, key=lambda c: c.time_start) + if not captions: + translation_set.mark_error("Video has no captions to translate.") + db.session.commit() + return + + already_done = { + ct.caption_id + for ct in CaptionTranslation.query.filter_by(set_id=translation_set.id).all() + } + todo = [c for c in captions if c.id not in already_done] + log( + f"[caption_translation] set={translation_set.id} translating " + f"{len(todo)}/{len(captions)} captions ({source_language} -> {target_language}, {cefr})" + ) + + for batch in _batched(todo, BATCH_SIZE): + batch_translations = _translate_batch( + batch, source_language, target_language, cefr + ) + for i, caption in enumerate(batch, start=1): + text = batch_translations.get(i) + if not text: + # Per-caption fallback for items the batch call dropped or mis-keyed. + text = _translate_one( + caption.get_content(), source_language, target_language, cefr + ) + if not text: + # Last resort: skip this caption rather than fail the whole set; the + # reader will show the original text for un-translated lines. 
+ log( + f"[caption_translation] dropped caption {caption.id} " + f"(set={translation_set.id}) — LLM returned nothing" + ) + continue + CaptionTranslation.create( + db.session, translation_set, caption, text + ) + db.session.commit() + + translation_set.mark_ready() + db.session.commit() + log(f"[caption_translation] set={translation_set.id} ready") + except Exception as e: # noqa: BLE001 — background job; surface via status row + log(f"[caption_translation] set={set_id} error: {e}") + db.session.rollback() + # Reload after rollback to mark the set's error state cleanly. + translation_set = CaptionTranslationSet.find_by_id(set_id) + if translation_set: + translation_set.mark_error(str(e)) + db.session.commit() diff --git a/zeeguu/core/model/user_video.py b/zeeguu/core/model/user_video.py index e534813c..812a0d6c 100644 --- a/zeeguu/core/model/user_video.py +++ b/zeeguu/core/model/user_video.py @@ -128,13 +128,16 @@ def exists(cls, obj): @classmethod def user_video_info( - cls, user: User, video: Video, with_content=False, with_translations=True + cls, user: User, video: Video, with_content=False, with_translations=True, + translation_set=None, ): from zeeguu.core.model.bookmark import Bookmark from zeeguu.core.model.video_title_context import VideoTitleContext from zeeguu.core.model.user_activitiy_data import UserActivityData - returned_info = video.video_info(with_content=with_content) + returned_info = video.video_info( + with_content=with_content, translation_set=translation_set + ) user_video_info = UserVideo.find(user, video) # user_diff_feedback = VideoDifficultyFeedback.find(user, video) # user_topics_feedback = VideoTopicsFeedback.find_given_user_video(user, video) diff --git a/zeeguu/core/model/video.py b/zeeguu/core/model/video.py index d391d855..dd6eee9e 100644 --- a/zeeguu/core/model/video.py +++ b/zeeguu/core/model/video.py @@ -239,7 +239,12 @@ def topics_as_tuple(self): topics.append((topic.topic.title, topic.origin_type)) return topics - def 
video_info(self, with_content=False): + def video_info(self, with_content=False, translation_set=None): + """If `translation_set` is given, each caption's `text`/`tokenized_text` come from the + translated caption in the user's target language at the set's CEFR; timings and the + `context_identifier` (still keyed by the original caption id) are unchanged, so the + player's timing logic and the bookmark anchor are stable across original/translated + views. Captions missing a translation in the set fall back to the original text.""" text = self.get_content() summary = text[:MAX_CHAR_COUNT_IN_SUMMARY].replace("\n", " ") + "..." result_dict = dict( @@ -269,13 +274,29 @@ def video_info(self, with_content=False): if with_content: from zeeguu.core.mwe import tokenize_for_reading + translations_by_caption_id = {} + caption_language = self.language + if translation_set is not None: + translations_by_caption_id = { + ct.caption_id: ct.get_content() + for ct in translation_set.translations + } + caption_language = translation_set.target_language + result_dict["caption_set"] = { + "id": translation_set.id, + "target_language": translation_set.target_language.code, + "cefr_level": translation_set.cefr_level, + } + result_dict["captions"] = [ { "time_start": caption.time_start / 1000, # convert to seconds "time_end": caption.time_end / 1000, - "text": caption.get_content(), + "text": translations_by_caption_id.get(caption.id, caption.get_content()), "tokenized_text": tokenize_for_reading( - caption.get_content(), self.language, mode="stanza" + translations_by_caption_id.get(caption.id, caption.get_content()), + caption_language, + mode="stanza", ), "context_identifier": ContextIdentifier( ContextType.VIDEO_CAPTION, video_caption_id=caption.id