Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions tools/migrations/26-05-31-a--add_caption_translation.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- v1.5: translated captions for a shared video.
-- A `caption_translation_set` is a per-(video, target_language, target_cefr) bundle that owns
-- per-original-caption translated text rows. Timing stays on the parent `caption` rows so we
-- don't duplicate it (the player aligns by original time_start/time_end). Status drives the
-- async translation job (mirrors the daily-audio-lesson status pattern).

-- One row per (video, target language, CEFR level) translation bundle.
-- `status` records where the translation job for this bundle stands.
CREATE TABLE `caption_translation_set` (
`id` int NOT NULL AUTO_INCREMENT,
`video_id` int NOT NULL,
`target_language_id` int NOT NULL,
`cefr_level` enum('A1','A2','B1','B2','C1','C2') NOT NULL,
-- New rows start as 'pending'.
`status` enum('pending','translating','ready','error') NOT NULL DEFAULT 'pending',
-- Optional failure detail; limited to 500 characters by the column type.
`error_message` varchar(500) DEFAULT NULL,
-- No DEFAULT is declared, so the inserting code must supply the timestamp.
`created_at` datetime NOT NULL,
PRIMARY KEY (`id`),
-- At most one set per (video, target language, CEFR level).
UNIQUE KEY `uq_caption_translation_set_video_lang_cefr`
(`video_id`, `target_language_id`, `cefr_level`),
-- Neither foreign key declares an ON DELETE action.
CONSTRAINT `fk_caption_translation_set_video`
FOREIGN KEY (`video_id`) REFERENCES `video` (`id`),
CONSTRAINT `fk_caption_translation_set_target_language`
FOREIGN KEY (`target_language_id`) REFERENCES `language` (`id`)
);

-- One translated text per (set, original caption). This table has no timing columns;
-- each row points back at the original `caption` row and at a `new_text` row.
CREATE TABLE `caption_translation` (
`id` int NOT NULL AUTO_INCREMENT,
`set_id` int NOT NULL,
`caption_id` int NOT NULL,
`text_id` int NOT NULL,
PRIMARY KEY (`id`),
-- A caption can be translated at most once within a given set.
UNIQUE KEY `uq_caption_translation_set_caption` (`set_id`, `caption_id`),
-- Deleting a set removes its translation rows.
CONSTRAINT `fk_caption_translation_set`
FOREIGN KEY (`set_id`) REFERENCES `caption_translation_set` (`id`) ON DELETE CASCADE,
-- The caption and text foreign keys declare no ON DELETE action.
CONSTRAINT `fk_caption_translation_caption`
FOREIGN KEY (`caption_id`) REFERENCES `caption` (`id`),
CONSTRAINT `fk_caption_translation_text`
FOREIGN KEY (`text_id`) REFERENCES `new_text` (`id`)
);
1 change: 1 addition & 0 deletions zeeguu/api/endpoints/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from .listening_sessions import *
from . import user_video
from . import user_watching_session
from . import caption_translation
from . import audio_lessons
from . import article_simplification
from . import generated_examples
Expand Down
91 changes: 91 additions & 0 deletions zeeguu/api/endpoints/caption_translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""Endpoints for the per-video translated-captions feature (v1.5 of share-to-video).

POST kicks off (or returns the existing) per-(video, target_language, cefr) translation set
and runs the LLM job in a background thread; GET polls the set's status. Once `ready`, the
reader calls /user_video?caption_set_id=... to get the translated caption block.
"""
import flask
from flask import request
from sqlalchemy.orm.exc import NoResultFound

from zeeguu.core.model import User, Language
from zeeguu.core.model.video import Video
from zeeguu.core.model.caption_translation_set import (
CaptionTranslationSet,
CEFR_LEVELS,
STATUS_READY,
)
from zeeguu.core.llm_services.caption_translation_service import translate_set
from zeeguu.api.utils.background import run_in_background
from zeeguu.api.utils.json_result import json_result
from zeeguu.api.utils.route_wrappers import cross_domain, requires_session

from . import api, db_session


def _resolve_video_or_404(video_id: int) -> Video:
    """Return the video with the given id, or abort the request with HTTP 404.

    The lookup is delegated to `Video.find_by_id`; a `None` result is treated as
    "no such video".
    """
    found = Video.find_by_id(video_id)
    if found is not None:
        return found
    flask.abort(404, "video not found")


def _resolve_language_or_406(code: str) -> Language:
    """Return the `Language` for `code`, or abort the request with HTTP 406.

    The abort is triggered when `Language.find` raises `NoResultFound`.
    """
    try:
        language = Language.find(code)
    except NoResultFound:
        flask.abort(406, "Language not supported")
    else:
        return language


def _read_body():
    """Extract the translation parameters from the current request.

    Each field is read from the JSON body first and from form data second.

    Returns:
        A dict with two string entries: ``"target_language"`` (whitespace-trimmed) and
        ``"target_cefr"`` (whitespace-trimmed and upper-cased). A field that is absent or
        empty yields ``""`` so the caller can answer with a 4xx.
    """
    data = request.get_json(silent=True)
    # A syntactically valid JSON body may be a list, string or number. Only an object
    # has fields, so anything else is treated as "no JSON fields" instead of crashing
    # on `.get`.
    if not isinstance(data, dict):
        data = {}

    def _field(name):
        value = data.get(name) or request.form.get(name) or ""
        # JSON values may be numbers or nested structures; coerce to text so the
        # caller's validation rejects them rather than `.strip()` raising.
        return str(value).strip()

    return {
        "target_language": _field("target_language"),
        "target_cefr": _field("target_cefr").upper(),
    }


@api.route("/video/<int:video_id>/translate_captions", methods=["POST"])
@cross_domain
@requires_session
def video_translate_captions(video_id):
    """Find or create the translated-caption set for a video and dispatch its job.

    Reads `target_language` and `target_cefr` from the request body, validates them, and
    looks up (or creates) the matching `CaptionTranslationSet`. Unless that set is already
    ready, `translate_set` is handed to `run_in_background` with the set's id.

    Responds with the set's dictionary form and status 202. Aborts with 404 when the video
    is unknown, 400 when the language is missing, the CEFR level is not in `CEFR_LEVELS`, or
    the target language equals the video's language, and 406 when the language cannot be
    resolved.
    """
    # Return value intentionally unused; kept as the session/user existence check.
    User.find_by_id(flask.g.user_id)
    video = _resolve_video_or_404(video_id)

    payload = _read_body()
    language_code = payload["target_language"]
    cefr = payload["target_cefr"]

    if not language_code:
        flask.abort(400, "target_language required")
    if cefr not in CEFR_LEVELS:
        flask.abort(400, f"target_cefr must be one of {CEFR_LEVELS}")

    target_language = _resolve_language_or_406(language_code)
    if target_language.code == video.language.code:
        flask.abort(400, "target_language matches the video's caption language")

    # A repeated request for the same (video, language, cefr) reuses the existing set.
    translation_set = CaptionTranslationSet.find_or_create(
        db_session, video, target_language, cefr
    )

    # NOTE(review): every status other than ready dispatches a job, so a set whose job is
    # still running gets a second one on a repeated POST — confirm concurrent runs of
    # `translate_set` on the same set are safe.
    needs_job = translation_set.status != STATUS_READY
    if needs_job:
        run_in_background(translate_set, translation_set.id)

    return json_result(translation_set.as_dictionary()), 202


@api.route("/video/<int:video_id>/translate_captions/status", methods=["GET"])
@cross_domain
@requires_session
def video_translate_captions_status(video_id):
    """Report the state of one translated-caption set of a video.

    Expects the set id in the `set_id` query parameter. Responds with the set's dictionary
    form. Aborts with 404 when the video is unknown or the set does not exist / belongs to
    another video, and with 400 when `set_id` is missing or not an integer.
    """
    User.find_by_id(flask.g.user_id)
    video = _resolve_video_or_404(video_id)

    raw_set_id = request.args.get("set_id")
    if not raw_set_id:
        flask.abort(400, "set_id required")
    try:
        set_id = int(raw_set_id)
    except ValueError:
        # A malformed id is a client error; without this guard it surfaced as a 500.
        flask.abort(400, "set_id must be an integer")

    translation_set = CaptionTranslationSet.find_by_id(set_id)
    # The ownership check stops one video's URL from exposing another video's set.
    if translation_set is None or translation_set.video_id != video.id:
        flask.abort(404, "translation set not found for this video")

    return json_result(translation_set.as_dictionary())
25 changes: 24 additions & 1 deletion zeeguu/api/endpoints/user_video.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import flask
from flask import request
from zeeguu.core.model import User, UserVideo, Video
from zeeguu.core.model.caption_translation_set import (
CaptionTranslationSet,
STATUS_READY,
)

from zeeguu.api.utils.route_wrappers import cross_domain, requires_session
from zeeguu.api.utils.json_result import json_result
Expand All @@ -24,7 +28,26 @@ def get_user_video():
user = User.find_by_id(flask.g.user_id)
new_user_video = UserVideo.find_or_create(db_session, user, video)

return json_result(new_user_video.user_video_info(user, video, with_content=True))
# Optional translated-caption track. If the set isn't ready yet (still translating, errored,
# or doesn't belong to this video) we silently serve the original captions — the reader
# polls the dedicated status endpoint and re-fetches when ready, so the worst UX is a
# one-cycle delay rather than a 4xx during a known-async wait.
translation_set = None
caption_set_id = request.args.get("caption_set_id")
if caption_set_id:
candidate = CaptionTranslationSet.find_by_id(int(caption_set_id))
if (
candidate
and candidate.video_id == video.id
and candidate.status == STATUS_READY
):
translation_set = candidate

return json_result(
new_user_video.user_video_info(
user, video, with_content=True, translation_set=translation_set
)
)


# ---------------------------------------------------------------------------
Expand Down
183 changes: 183 additions & 0 deletions zeeguu/core/llm_services/caption_translation_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""Translate a video's captions into the learner's target language at their CEFR level.

Per-segment translation preserves the original `time_start`/`time_end` of each `Caption`, so
the player's timing logic is unchanged — only the rendered text and tokenization differ.

LLM strategy: batches of ~30 captions per Haiku call (cheap and fast), structured JSON output
keyed by numeric marker; on parse / missing-key failure we fall back to a single-caption call
for the affected items so partial LLM failures degrade gracefully instead of zeroing the set.
"""
from __future__ import annotations

import json
import re
from typing import Iterable, Optional

from zeeguu.core.model.db import db
from zeeguu.core.model.caption import Caption
from zeeguu.core.model.caption_translation import CaptionTranslation
from zeeguu.core.model.caption_translation_set import CaptionTranslationSet
from zeeguu.core.llm_services.haiku_client import haiku_completion
from zeeguu.logging import log


# Number of captions grouped into one batch LLM request.
BATCH_SIZE = 30
# `max_tokens` value passed to the batch LLM request.
BATCH_MAX_TOKENS = 2000 # generous; ~30 short captions translated easily fit
# `max_tokens` value passed to the single-caption fallback request.
SINGLE_MAX_TOKENS = 200


def _batched(items, n):
    """Yield consecutive slices of `items`, each holding at most `n` elements.

    The final slice is shorter when `len(items)` is not a multiple of `n`; an empty
    `items` yields nothing.
    """
    yield from (items[start : start + n] for start in range(0, len(items), n))


def _strip_code_fence(text: str) -> str:
    """Return `text` trimmed, without a wrapping Markdown code fence.

    Fence removal only happens when the trimmed text starts with three backticks: the
    opening fence (optionally tagged `json`) and a closing fence at the very end are
    dropped together with the whitespace next to them.
    """
    trimmed = text.strip()
    if not trimmed.startswith("```"):
        return trimmed

    without_opening = re.sub(r"^```(?:json)?\s*", "", trimmed)
    without_closing = re.sub(r"\s*```$", "", without_opening)
    return without_closing.strip()


def _build_batch_prompt(
    captions: list[Caption], source_language: str, target_language: str, cefr: str
) -> str:
    """Build the LLM prompt that asks for all `captions` to be translated in one call.

    Each caption's content is listed on its own line behind a 1-based `[n]` marker, and the
    prompt asks for a single JSON object keyed by those numbers.
    """
    marked_lines = []
    for marker, caption in enumerate(captions, start=1):
        marked_lines.append(f"[{marker}] {caption.get_content()}")
    numbered_block = "\n".join(marked_lines)

    return f"""Translate each of the following {source_language} subtitle lines into {target_language} at CEFR level {cefr}.

Rules:
- Preserve meaning faithfully; favor natural, idiomatic {target_language}.
- Adapt vocabulary and grammar to CEFR {cefr} (simpler words for A1-A2, intermediate for B1-B2, advanced for C1-C2).
- One line per input line — do NOT merge or split lines.
- Output STRICTLY a single JSON object, nothing else (no markdown fences, no commentary):
{{"1": "translation of line 1", "2": "translation of line 2", ...}}

Lines to translate:
{numbered_block}
"""


def _build_single_prompt(
    text: str, source_language: str, target_language: str, cefr: str
) -> str:
    """Build the LLM prompt that asks for one subtitle to be translated.

    The prompt states the language pair and CEFR level, asks for the bare translation, and
    ends with `text` after a blank line.
    """
    instruction = (
        f"Translate the following {source_language} subtitle into {target_language} "
        f"at CEFR level {cefr}."
    )
    output_rule = "Output ONLY the translation — no quotes, no commentary."
    return f"{instruction} {output_rule}\n\n{text}"


def _translate_batch(
    captions: list[Caption], source_language: str, target_language: str, cefr: str
) -> dict[int, str]:
    """Translate `captions` with one LLM call.

    Returns:
        A mapping from 1-based position in `captions` to the translated text. Positions the
        response did not cover (or covered with a blank / non-string value) are absent, and
        an empty mapping is returned when there is nothing to translate, the LLM returned
        nothing, or its output is not a JSON object — callers are expected to fall back
        per caption in those cases.
    """
    if not captions:
        return {}

    response = haiku_completion(
        _build_batch_prompt(captions, source_language, target_language, cefr),
        max_tokens=BATCH_MAX_TOKENS,
        temperature=0.1,
    )
    if not response:
        return {}

    try:
        # Non-strict parsing tolerates literal newlines inside JSON string values, which
        # LLM output sometimes contains.
        payload = json.loads(_strip_code_fence(response), strict=False)
    except (json.JSONDecodeError, ValueError) as e:
        log(f"[caption_translation] batch JSON parse failed: {e}")
        return {}
    if not isinstance(payload, dict):
        return {}

    translations: dict[int, str] = {}
    for marker, value in payload.items():
        # Blank or non-string values count as "not translated".
        if not (isinstance(value, str) and value.strip()):
            continue
        try:
            position = int(str(marker).strip())
        except ValueError:
            continue
        translations[position] = value.strip()
    return translations


def _translate_one(
    text: str, source_language: str, target_language: str, cefr: str
) -> Optional[str]:
    """Translate a single caption with its own LLM call.

    Returns:
        The translated text with surrounding whitespace removed, or `None` when the LLM
        returned nothing usable.
    """
    raw = haiku_completion(
        _build_single_prompt(text, source_language, target_language, cefr),
        max_tokens=SINGLE_MAX_TOKENS,
        temperature=0.1,
    )
    if not raw:
        return None

    cleaned = raw.strip()
    # Unwrap only when the whole answer sits inside exactly one pair of double quotes.
    # Stripping every leading/trailing quote would damage translations that legitimately
    # start or end with quoted speech (e.g. `He said "no"`).
    is_wrapped = (
        len(cleaned) >= 2
        and cleaned[0] == '"'
        and cleaned[-1] == '"'
        and cleaned.count('"') == 2
    )
    if is_wrapped:
        cleaned = cleaned[1:-1].strip()
    return cleaned or None


def translate_set(set_id: int) -> None:
    """Background-job entry point: translate every caption of the set's video.

    Captions are processed in start-time order, in batches of `BATCH_SIZE`. Captions that
    already have a `CaptionTranslation` row for this set are skipped, so a retried run
    resumes instead of duplicating. A caption the batch call did not cover is retried on
    its own; if that also yields nothing, the caption is logged and skipped. Each batch is
    committed separately.

    Any exception is logged and recorded on the set through `mark_error`; it is not
    re-raised.

    Args:
        set_id: Primary key of the `CaptionTranslationSet` to process. An unknown id is
            logged and ignored.
    """
    # Size of the `error_message` column (varchar(500)) in the migration.
    max_error_length = 500

    translation_set = CaptionTranslationSet.find_by_id(set_id)
    if translation_set is None:
        log(f"[caption_translation] no set with id {set_id}")
        return

    try:
        translation_set.mark_translating()
        db.session.commit()

        video = translation_set.video
        source_language = video.language.code
        target_language = translation_set.target_language.code
        cefr = translation_set.cefr_level

        captions = sorted(video.captions, key=lambda c: c.time_start)
        if not captions:
            translation_set.mark_error("Video has no captions to translate.")
            db.session.commit()
            return

        # Captions translated by an earlier (partial) run of this set.
        already_done = {
            ct.caption_id
            for ct in CaptionTranslation.query.filter_by(set_id=translation_set.id).all()
        }
        todo = [c for c in captions if c.id not in already_done]
        log(
            f"[caption_translation] set={translation_set.id} translating "
            f"{len(todo)}/{len(captions)} captions ({source_language} -> {target_language}, {cefr})"
        )

        for batch in _batched(todo, BATCH_SIZE):
            batch_translations = _translate_batch(
                batch, source_language, target_language, cefr
            )
            # Batch results are keyed by 1-based position within the batch.
            for i, caption in enumerate(batch, start=1):
                text = batch_translations.get(i)
                if not text:
                    # Per-caption fallback for items the batch call dropped or mis-keyed.
                    text = _translate_one(
                        caption.get_content(), source_language, target_language, cefr
                    )
                if not text:
                    # Last resort: skip this caption rather than fail the whole set; the
                    # reader will show the original text for un-translated lines.
                    log(
                        f"[caption_translation] dropped caption {caption.id} "
                        f"(set={translation_set.id}) — LLM returned nothing"
                    )
                    continue
                CaptionTranslation.create(
                    db.session, translation_set, caption, text
                )
            # Commit per batch so finished work survives a later failure.
            db.session.commit()

        translation_set.mark_ready()
        db.session.commit()
        log(f"[caption_translation] set={translation_set.id} ready")
    except Exception as e:  # noqa: BLE001 — background job; surface via status row
        log(f"[caption_translation] set={set_id} error: {e}")
        db.session.rollback()
        # Reload after rollback to mark the set's error state cleanly.
        translation_set = CaptionTranslationSet.find_by_id(set_id)
        if translation_set:
            # Exception text can be arbitrarily long. Cap it to the column size so
            # recording the failure cannot itself fail on an over-long value.
            translation_set.mark_error(str(e)[:max_error_length])
            db.session.commit()
2 changes: 2 additions & 0 deletions zeeguu/core/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@
from .yt_channel import YTChannel
from .video import Video
from .caption import Caption
from .caption_translation_set import CaptionTranslationSet
from .caption_translation import CaptionTranslation
from .video_tag import VideoTag
from .video_tag_map import VideoTagMap
from .video_caption_context import VideoCaptionContext
Expand Down
Loading
Loading