Commit f8c4740
fix: add repetition_penalty to Qwen 0.8B to prevent looping text generation
Parent: c8059a0

2 files changed: 19 additions & 4 deletions

New documentation file (15 additions & 0 deletions):
# Fix AI Text Repetition (Qwen 0.8B)

## Overview

Fixed repetitive/looping text generation when using `{{@AI:}}` tags with the Qwen 3.5 0.8B model. Stories and other generated content would enter infinite repetition loops due to missing anti-repetition parameters.

## Root Cause

The main `ai-worker.js` lacked `repetition_penalty` and `no_repeat_ngram_size` in its generation config, while all other workers (Florence, Docling, GLM-OCR) already used `repetition_penalty: 1.2–1.5`. Small models such as Qwen 0.8B are especially prone to degenerate repetition without these parameters.
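As background for the penalty values mentioned above, the common repetition-penalty rule (popularized by CTRL and used by Transformers-family libraries) rescales the logit of every token that has already been generated. This is an illustrative sketch of that rule, not code from `ai-worker.js`:

```javascript
// Illustrative sketch (not the worker's actual code): a
// repetition_penalty > 1 makes every already-generated token less
// likely. Positive logits are divided by the penalty, negative
// logits are multiplied, so seen tokens always lose probability.
function applyRepetitionPenalty(logits, generatedIds, penalty) {
  const out = logits.slice();
  for (const id of new Set(generatedIds)) {
    out[id] = out[id] > 0 ? out[id] / penalty : out[id] * penalty;
  }
  return out;
}

// Example: token 2 was already generated, so its logit shrinks.
const penalized = applyRepetitionPenalty([1.0, -0.5, 2.6], [2], 1.3);
// penalized[2] is 2.6 / 1.3 ≈ 2.0; penalized[0] is untouched.
```

A penalty of 1.0 is a no-op, which is effectively what the worker had before this fix; values much above ~1.5 tend to degrade fluency, which is why the change stays in the 1.2–1.3 range.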
## Changes

- **`public/ai-worker.js`** — Added anti-repetition parameters to both generation paths (multimodal + text-only):
  - Non-thinking mode: `repetition_penalty: 1.3`, `no_repeat_ngram_size: 5`
  - Thinking mode: `repetition_penalty: 1.2`, `no_repeat_ngram_size: 4` (lower, to avoid disrupting reasoning chains)

## Files Modified

- `public/ai-worker.js` — Generation config for both multimodal and text-only paths
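For reference, `no_repeat_ngram_size: n` works by banning any token that would complete an n-gram already present in the output. A minimal sketch of that banning rule (illustrative only, not the worker's or the library's actual implementation):

```javascript
// Illustrative sketch (not code from ai-worker.js): with
// no_repeat_ngram_size = n, a token is banned if appending it to the
// last n-1 generated tokens would reproduce an n-gram that already
// appears earlier in the sequence.
function bannedNextTokens(generatedIds, n) {
  if (generatedIds.length < n) return [];
  // The last n-1 tokens form the current prefix.
  const prefix = generatedIds.slice(generatedIds.length - (n - 1)).join(",");
  const banned = [];
  // Any token that previously followed this exact prefix is banned.
  for (let i = 0; i + n <= generatedIds.length; i++) {
    const gram = generatedIds.slice(i, i + n - 1).join(",");
    if (gram === prefix) banned.push(generatedIds[i + n - 1]);
  }
  return banned;
}

// Example: after [5, 6, 7, 5, 6] with n = 3, the prefix [5, 6] was
// previously followed by 7, so 7 is banned as the next token.
```

This is a hard constraint, unlike the soft `repetition_penalty`; the larger `n` in non-thinking mode (5 vs. 4) bans only longer repeats, so it interferes less with legitimate short phrase reuse.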

public/ai-worker.js (4 additions & 4 deletions)

```diff
@@ -299,8 +299,8 @@ async function generate(taskType, context, userPrompt, messageId, enableThinking

   // Generate — Qwen3 model card: use sampling, NOT greedy, for thinking mode
   const genConfig = enableThinking
-    ? { do_sample: true, temperature: 0.6, top_p: 0.95, top_k: 20, max_new_tokens: Math.max(maxTokens, 4096) }
-    : { do_sample: true, temperature: 0.7, top_p: 0.8, top_k: 20, max_new_tokens: maxTokens };
+    ? { do_sample: true, temperature: 0.6, top_p: 0.95, top_k: 20, max_new_tokens: Math.max(maxTokens, 4096), repetition_penalty: 1.2, no_repeat_ngram_size: 4 }
+    : { do_sample: true, temperature: 0.7, top_p: 0.8, top_k: 20, max_new_tokens: maxTokens, repetition_penalty: 1.3, no_repeat_ngram_size: 5 };
   await model.generate({ ...inputs, ...genConfig, streamer });

   // Final cleanup — strip any remaining think tags or special tokens
@@ -389,8 +389,8 @@ async function generate(taskType, context, userPrompt, messageId, enableThinking
   // Generate — Qwen3 model card: use sampling, NOT greedy, for thinking mode
   // Thinking: temp=0.6, top_p=0.95, top_k=20 | Non-thinking: temp=0.7, top_p=0.8, top_k=20
   const genConfig = enableThinking
-    ? { do_sample: true, temperature: 0.6, top_p: 0.95, top_k: 20, max_new_tokens: Math.max(maxTokens, 4096) }
-    : { do_sample: true, temperature: 0.7, top_p: 0.8, top_k: 20, max_new_tokens: maxTokens };
+    ? { do_sample: true, temperature: 0.6, top_p: 0.95, top_k: 20, max_new_tokens: Math.max(maxTokens, 4096), repetition_penalty: 1.2, no_repeat_ngram_size: 4 }
+    : { do_sample: true, temperature: 0.7, top_p: 0.8, top_k: 20, max_new_tokens: maxTokens, repetition_penalty: 1.3, no_repeat_ngram_size: 5 };
   await model.generate({ ...inputs, ...genConfig, streamer });

   // Final cleanup: strip any remaining think tags or reasoning artifacts
```
